CINXE.COM
Computation and Language
<!DOCTYPE html> <html lang="en"> <head> <title>Computation and Language </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/cs.CL/recent">cs.CL</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Computation and Language</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item210">Cross-lists</a></li> <li><a href="#item276">Replacements</a></li> </ul> <p>See <a id="recent-cs.CL" aria-labelledby="recent-cs.CL" href="/list/cs.CL/recent">recent</a> articles</p> <h3>Showing new listings for Tuesday, 18 February 2025</h3> <div class='paging'>Total of 493 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 209 of 209 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.10497" title="Abstract" id="2502.10497"> 
arXiv:2502.10497 </a> [<a href="/pdf/2502.10497" title="Download PDF" id="pdf-2502.10497" aria-labelledby="pdf-2502.10497">pdf</a>, <a href="/format/2502.10497" title="Other formats" id="oth-2502.10497" aria-labelledby="oth-2502.10497">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hallucinations and Truth: A Comprehensive Accuracy Evaluation of RAG, LoRA and DoRA </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Baqar,+M">Mohammad Baqar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khanda,+R">Rajat Khanda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 Pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Generative AI have significantly improved the efficiency and adaptability of natural language processing (NLP) systems, particularly through Retrieval-Augmented Generation (RAG), Low-Rank Adaptation (LoRA), and Weight-Decomposed Low-Rank Adaptation (DoRA). RAG integrates external knowledge to enhance factual consistency in generative outputs, while LoRA enables parameter-efficient fine-tuning of large language models (LLMs). DoRA further refines this process by optimizing fine-tuning through adaptive parameter ranking and domain-aware weight adjustments, improving learning efficiency while maintaining inference performance. <br>This paper presents a large-scale empirical evaluation of RAG, LoRA, and DoRA, with model fine-tuning and generation performance assessed on 20,000 FAQ-based queries, while the knowledge base spans 400,000 entries. The study analyzes key performance metrics such as accuracy, relevance, and inference latency. 
Experimental results demonstrate that DoRA achieves the highest accuracy (90.1%), relevance score (0.88), and lowest latency (110 ms per query), outperforming both LoRA and RAG in real-world, domain-specific generative AI applications. <br>Furthermore, this study examines the trade-offs between fine-tuning efficiency, computational cost, and real-time adaptability across different models. Findings highlight RAG's effectiveness in knowledge grounding, LoRA's cost-efficient domain adaptation, and DoRA's ability to balance fine-tuning efficiency with model precision. These insights provide practical guidance for deploying AI-driven generative systems in accuracy-critical domains such as healthcare, finance, and legal services, ensuring scalability, reliability, and optimal performance in dynamic environments. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2502.10577" title="Abstract" id="2502.10577"> arXiv:2502.10577 </a> [<a href="/pdf/2502.10577" title="Download PDF" id="pdf-2502.10577" aria-labelledby="pdf-2502.10577">pdf</a>, <a href="https://arxiv.org/html/2502.10577v1" title="View HTML" id="html-2502.10577" aria-labelledby="html-2502.10577" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10577" title="Other formats" id="oth-2502.10577" aria-labelledby="oth-2502.10577">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Man Made Language Models? 
Evaluating LLMs' Perpetuation of Masculine Generics Bias </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Doyen,+E">Enzo Doyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Todirascu,+A">Amalia Todirascu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have been shown to propagate and even amplify gender bias, in English and other languages, in specific or constrained contexts. However, no studies so far have focused on gender biases conveyed by LLMs' responses to generic instructions, especially with regard to masculine generics (MG). MG are a linguistic feature found in many gender-marked languages, denoting the use of the masculine gender as a "default" or supposedly neutral gender to refer to a mixed group of men and women, or of a person whose gender is irrelevant or unknown. Numerous psycholinguistic studies have shown that MG are not neutral and induce gender bias. This work aims to analyze the use of MG by both proprietary and local LLMs in responses to generic instructions and evaluate their MG bias rate. We focus on French and create a human noun database from existing lexical resources. We filter existing French instruction datasets to retrieve generic instructions and analyze the responses of 6 different LLMs. Overall, we find that $\approx$39.5\% of LLMs' responses to generic instructions are MG-biased ($\approx$73.1\% across responses with human nouns). Our findings also reveal that LLMs are reluctant to use gender-fair language spontaneously. 
</p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2502.10582" title="Abstract" id="2502.10582"> arXiv:2502.10582 </a> [<a href="/pdf/2502.10582" title="Download PDF" id="pdf-2502.10582" aria-labelledby="pdf-2502.10582">pdf</a>, <a href="https://arxiv.org/html/2502.10582v1" title="View HTML" id="html-2502.10582" aria-labelledby="html-2502.10582" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10582" title="Other formats" id="oth-2502.10582" aria-labelledby="oth-2502.10582">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Named entity recognition for Serbian legal documents: Design, methodology and dataset development </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kalu%C5%A1ev,+V">Vladimir Kalušev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brklja%C4%8D,+B">Branko Brkljač</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 6 figures, 1 table, associated NER4Legal_SRB model and dataset are available at <a href="https://huggingface.co/kalusev/NER4Legal_SRB" rel="external noopener nofollow" class="link-external link-https">this https URL</a> , paper submitted to 15th International Conference on Information Society and Technology (ICIST), Kopaonik, Serbia, 9-12 March 2025, conference track: Generative AI and Large Language Models </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in the field of natural language processing (NLP) and especially large language models (LLMs) and their numerous applications have brought research attention to design of different document processing tools and enhancements in the process of document archiving, search and retrieval. 
Domain of official, legal documents is especially interesting due to vast amount of data generated on the daily basis, as well as the significant community of interested practitioners (lawyers, law offices, administrative workers, state institutions and citizens). Providing efficient ways for automation of everyday work involving legal documents is therefore expected to have significant impact in different fields. In this work we present one LLM based solution for Named Entity Recognition (NER) in the case of legal documents written in Serbian language. It leverages on the pre-trained bidirectional encoder representations from transformers (BERT), which had been carefully adapted to the specific task of identifying and classifying specific data points from textual content. Besides novel dataset development for Serbian language (involving public court rulings), presented system design and applied methodology, the paper also discusses achieved performance metrics and their implications for objective assessment of the proposed solution. Performed cross-validation tests on the created manually labeled dataset with mean $F_1$ score of 0.96 and additional results on the examples of intentionally modified text inputs confirm applicability of the proposed system design and robustness of the developed NER solution. </p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2502.10596" title="Abstract" id="2502.10596"> arXiv:2502.10596 </a> [<a href="/pdf/2502.10596" title="Download PDF" id="pdf-2502.10596" aria-labelledby="pdf-2502.10596">pdf</a>, <a href="https://arxiv.org/html/2502.10596v1" title="View HTML" id="html-2502.10596" aria-labelledby="html-2502.10596" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10596" title="Other formats" id="oth-2502.10596" aria-labelledby="oth-2502.10596">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Post-training an LLM for RAG? 
Train on Self-Generated Demonstrations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Finlayson,+M">Matthew Finlayson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulikov,+I">Ilia Kulikov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bikel,+D+M">Daniel M. Bikel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oguz,+B">Barlas Oguz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xilun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pappu,+A">Aasish Pappu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) often struggle with knowledge intensive NLP tasks, such as answering "Who won the latest World Cup?" because the knowledge they learn during training may be insufficient or outdated. Conditioning generation on retrieved documents -- a technique known as retrieval augmented generation (RAG) -- mitigates these shortcomings by allowing the model to leverage in-context information. Practitioners can improve LLM RAG performance by fine-tuning on retrieval-augmented instructions, but must beware that this can cause undesirable model behaviors like hallucinations. We attribute this degradation to the fact that the training data is likely to be out-of-distribution for the model and may suffer from quality issues, such as misalignment between retrievals and target responses (since retrievals are frequently added post-hoc). We propose a recipe for training RAG-enabled LLMs using self-generated demonstrations, thereby avoiding training on out-of-distribution text and integrating retrievals into the LLM responses. 
We evaluate our method on knowledge intensive question answering (QA) tasks and show that our method teaches LLMs to properly handle in-context retrievals and abstain from questions it will likely get wrong. Compared to conventional RA-IT methods, our method prevents model degradation in non-RAG settings while exhibiting superior QA performance. </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2502.10615" title="Abstract" id="2502.10615"> arXiv:2502.10615 </a> [<a href="/pdf/2502.10615" title="Download PDF" id="pdf-2502.10615" aria-labelledby="pdf-2502.10615">pdf</a>, <a href="https://arxiv.org/html/2502.10615v1" title="View HTML" id="html-2502.10615" aria-labelledby="html-2502.10615" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10615" title="Other formats" id="oth-2502.10615" aria-labelledby="oth-2502.10615">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Retrieval-augmented Encoders for Extreme Multi-label Text Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yau-Shian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+W">Wei-Cheng Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jyun-Yu Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+H">Hsiang-Fu Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vishwanathan,+S+V+N">S. V. N. Vishwanathan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Extreme multi-label classification (XMC) seeks to find relevant labels from an extremely large label collection for a given text input. 
To tackle such a vast label space, current state-of-the-art methods fall into two categories. The one-versus-all (OVA) method uses learnable label embeddings for each label, excelling at memorization (i.e., capturing detailed training signals for accurate head label prediction). In contrast, the dual-encoder (DE) model maps input and label text into a shared embedding space for better generalization (i.e., the capability of predicting tail labels with limited training data), but may fall short at memorization. To achieve generalization and memorization, existing XMC methods often combine DE and OVA models, which involves complex training pipelines. Inspired by the success of retrieval-augmented language models, we propose the Retrieval-augmented Encoders for XMC (RAEXMC), a novel framework that equips a DE model with retrieval-augmented capability for efficient memorization without additional trainable parameter. During training, RAEXMC is optimized by the contrastive loss over a knowledge memory that consists of both input instances and labels. During inference, given a test input, RAEXMC retrieves the top-$K$ keys from the knowledge memory, and aggregates the corresponding values as the prediction scores. We showcase the effectiveness and efficiency of RAEXMC on four public LF-XMC benchmarks. RAEXMC not only advances the state-of-the-art (SOTA) DE method DEXML, but also achieves more than 10x speedup on the largest LF-AmazonTitles-1.3M dataset under the same 8 A100 GPUs training environments. 
</p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2502.10632" title="Abstract" id="2502.10632"> arXiv:2502.10632 </a> [<a href="/pdf/2502.10632" title="Download PDF" id="pdf-2502.10632" aria-labelledby="pdf-2502.10632">pdf</a>, <a href="https://arxiv.org/html/2502.10632v1" title="View HTML" id="html-2502.10632" aria-labelledby="html-2502.10632" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10632" title="Other formats" id="oth-2502.10632" aria-labelledby="oth-2502.10632">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Code-Mixed Telugu-English Hate Speech Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kakarla,+S">Santhosh Kakarla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Venkata,+G+S+B">Gautama Shastry Bulusu Venkata</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 1 figure, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hate speech detection in low-resource languages like Telugu is a growing challenge in NLP. This study investigates transformer-based models, including TeluguHateBERT, HateBERT, DeBERTa, Muril, IndicBERT, Roberta, and Hindi-Abusive-MuRIL, for classifying hate speech in Telugu. We fine-tune these models using Low-Rank Adaptation (LoRA) to optimize efficiency and performance. Additionally, we explore a multilingual approach by translating Telugu text into English using Google Translate to assess its impact on classification accuracy. <br>Our experiments reveal that most models show improved performance after translation, with DeBERTa and Hindi-Abusive-MuRIL achieving higher accuracy and F1 scores compared to training directly on Telugu text. 
Notably, Hindi-Abusive-MuRIL outperforms all other models in both the original Telugu dataset and the translated dataset, demonstrating its robustness across different linguistic settings. This suggests that translation enables models to leverage richer linguistic features available in English, leading to improved classification performance. The results indicate that multilingual processing can be an effective approach for hate speech detection in low-resource languages. These findings demonstrate that transformer models, when fine-tuned appropriately, can significantly improve hate speech detection in Telugu, paving the way for more robust multilingual NLP applications. </p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2502.10634" title="Abstract" id="2502.10634"> arXiv:2502.10634 </a> [<a href="/pdf/2502.10634" title="Download PDF" id="pdf-2502.10634" aria-labelledby="pdf-2502.10634">pdf</a>, <a href="https://arxiv.org/html/2502.10634v1" title="View HTML" id="html-2502.10634" aria-labelledby="html-2502.10634" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10634" title="Other formats" id="oth-2502.10634" aria-labelledby="oth-2502.10634">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Lost in the Passage: Passage-level In-context Learning Does Not Necessarily Need a "Passage" </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Hao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+C">Chenming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gengyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yunfang Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> By simply incorporating demonstrations into the 
context, in-context learning (ICL) enables large language models (LLMs) to yield awesome performance on many tasks. In this paper, we focus on passage-level long-context ICL for generation tasks and find that LLMs cannot learn the intrinsic relationships between the demonstration passage and the generation output. We conduct experiments with different LLMs on two typical generation tasks including single-document QA and distractor generation, demonstrating that even a completely meaningless demonstration passage with 1/4 length achieves much better performance than the original full passage. Analysis via attention score reveals that LLMs pay little attention to passages compared to other components in prompt and little attention flows from the passage to other parts of the demonstration, which further confirms our finding. Additionally, experiments on context compression indicate that compression approaches proven effective on other long-context tasks are not suitable for passage-level ICL, since simply using shorter meaningless demonstration passages has achieved competitive performance. 
</p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2502.10641" title="Abstract" id="2502.10641"> arXiv:2502.10641 </a> [<a href="/pdf/2502.10641" title="Download PDF" id="pdf-2502.10641" aria-labelledby="pdf-2502.10641">pdf</a>, <a href="https://arxiv.org/html/2502.10641v1" title="View HTML" id="html-2502.10641" aria-labelledby="html-2502.10641" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10641" title="Other formats" id="oth-2502.10641" aria-labelledby="oth-2502.10641">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Equitable Access: Leveraging Crowdsourced Reviews to Investigate Public Perceptions of Health Resource Accessibility </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+Z">Zhaoqian Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+G">Guanhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+K">Kai Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Q">Qingcheng Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Songhua Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+W">Wenyue Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+L">Lizhou Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lingyao Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Access to health resources is a critical determinant of public well-being and societal resilience, particularly during public health crises when demand for medical services and 
preventive care surges. However, disparities in accessibility persist across demographic and geographic groups, raising concerns about equity. Traditional survey methods often fall short due to limitations in coverage, cost, and timeliness. This study leverages crowdsourced data from Google Maps reviews, applying advanced natural language processing techniques, specifically ModernBERT, to extract insights on public perceptions of health resource accessibility in the United States during the COVID-19 pandemic. Additionally, we employ Partial Least Squares regression to examine the relationship between accessibility perceptions and key socioeconomic and demographic factors including political affiliation, racial composition, and educational attainment. Our findings reveal that public perceptions of health resource accessibility varied significantly across the U.S., with disparities peaking during the pandemic and slightly easing post-crisis. Political affiliation, racial demographics, and education levels emerged as key factors shaping these perceptions. These findings underscore the need for targeted interventions and policy measures to address inequities, fostering a more inclusive healthcare infrastructure that can better withstand future public health challenges. 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2502.10645" title="Abstract" id="2502.10645"> arXiv:2502.10645 </a> [<a href="/pdf/2502.10645" title="Download PDF" id="pdf-2502.10645" aria-labelledby="pdf-2502.10645">pdf</a>, <a href="https://arxiv.org/html/2502.10645v1" title="View HTML" id="html-2502.10645" aria-labelledby="html-2502.10645" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10645" title="Other formats" id="oth-2502.10645" aria-labelledby="oth-2502.10645">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BabyLM Turns 3: Call for papers for the 2025 BabyLM workshop </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Charpentier,+L">Lucas Charpentier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choshen,+L">Leshem Choshen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cotterell,+R">Ryan Cotterell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gul,+M+O">Mustafa Omer Gul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Michael Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jumelet,+J">Jaap Jumelet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Linzen,+T">Tal Linzen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mueller,+A">Aaron Mueller</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ross,+C">Candace Ross</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+R+S">Raj Sanjay Shah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Warstadt,+A">Alex Warstadt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wilcox,+E">Ethan Wilcox</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Williams,+A">Adina 
Williams</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2025 BabyLM Workshop. arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.06214" data-arxiv-id="2404.06214" class="link-https">arXiv:2404.06214</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> BabyLM aims to dissolve the boundaries between cognitive modeling and language modeling. We call for both workshop papers and for researchers to join the 3rd BabyLM competition. As in previous years, we call for participants in the data-efficient pretraining challenge in the general track. This year, we also offer a new track: INTERACTION. This new track encourages interactive behavior, learning from a teacher, and adapting the teaching material to the student. We also call for papers outside the competition in any relevant areas. These include training efficiency, cognitively plausible research, weak model evaluation, and more. 
</p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2502.10660" title="Abstract" id="2502.10660"> arXiv:2502.10660 </a> [<a href="/pdf/2502.10660" title="Download PDF" id="pdf-2502.10660" aria-labelledby="pdf-2502.10660">pdf</a>, <a href="https://arxiv.org/html/2502.10660v1" title="View HTML" id="html-2502.10660" aria-labelledby="html-2502.10660" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10660" title="Other formats" id="oth-2502.10660" aria-labelledby="oth-2502.10660">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> User Profile with Large Language Models: Construction, Updating, and Benchmarking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Prottasha,+N+J">Nusrat Jahan Prottasha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kowsher,+M">Md Kowsher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raman,+H">Hafijur Raman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anny,+I+J">Israt Jahan Anny</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhat,+P">Prakash Bhat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garibay,+I">Ivan Garibay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garibay,+O">Ozlem Garibay</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> User profile modeling plays a key role in personalized systems, as it requires building accurate profiles and updating them with new information. In this paper, we present two high-quality open-source user profile datasets: one for profile construction and another for profile updating. These datasets offer a strong basis for evaluating user profile modeling techniques in dynamic settings. 
We also show a methodology that uses large language models (LLMs) to tackle both profile construction and updating. Our method uses a probabilistic framework to predict user profiles from input text, allowing for precise and context-aware profile generation. Our experiments demonstrate that models like Mistral-7b and Llama2-7b perform strongly in both tasks. LLMs improve the precision and recall of the generated profiles, and high evaluation scores confirm the effectiveness of our approach. </p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2502.10699" title="Abstract" id="2502.10699"> arXiv:2502.10699 </a> [<a href="/pdf/2502.10699" title="Download PDF" id="pdf-2502.10699" aria-labelledby="pdf-2502.10699">pdf</a>, <a href="https://arxiv.org/html/2502.10699v1" title="View HTML" id="html-2502.10699" aria-labelledby="html-2502.10699" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10699" title="Other formats" id="oth-2502.10699" aria-labelledby="oth-2502.10699">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Synaptic Resonance in Large Language Models: A Novel Approach to Contextual Memory Integration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Applegarth,+G">George Applegarth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weatherstone,+C">Christian Weatherstone</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hollingsworth,+M">Maximilian Hollingsworth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Middlebrook,+H">Henry Middlebrook</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Irvin,+M">Marcus Irvin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Neural and Evolutionary Computing (cs.NE) </div> <p 
class='mathjax'> Contextual memory integration remains a significant challenge in the development of language models, particularly in tasks that require maintaining coherence over extended sequences. Traditional approaches, such as self-attention mechanisms and memory-augmented architectures, often prioritize short-term dependencies, leading to fragmentation and inconsistency in long-range contextual understanding. Inspired by principles of synaptic plasticity observed in biological neural systems, a novel mechanism, Synaptic Resonance, is introduced to dynamically reinforce relevant memory pathways during training and inference. Unlike static memory representations, this mechanism continuously adjusts synaptic weight matrices based on contextual relevance, allowing for improved information retention without excessive computational overhead. Evaluations conducted on an open-source language model demonstrate reductions in perplexity, enhancements in contextual coherence, and increased robustness against input noise, highlighting the effectiveness of reinforcement-driven memory modulation. Comparative analysis against baseline models further reveals that the proposed approach achieves higher memory retention efficiency while maintaining computational feasibility. The architectural modifications integrate seamlessly into existing transformer-based frameworks, ensuring stable convergence and efficient inference without sacrificing scalability. Applications benefiting from improved long-term contextual consistency, such as dialogue systems and document summarization, stand to gain from this approach. Empirical findings suggest that dynamically reinforced memory pathways offer a promising alternative to conventional memory mechanisms, addressing longstanding limitations in extended sequence modeling. 
</p> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2502.10708" title="Abstract" id="2502.10708"> arXiv:2502.10708 </a> [<a href="/pdf/2502.10708" title="Download PDF" id="pdf-2502.10708" aria-labelledby="pdf-2502.10708">pdf</a>, <a href="https://arxiv.org/html/2502.10708v1" title="View HTML" id="html-2502.10708" aria-labelledby="html-2502.10708" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10708" title="Other formats" id="oth-2502.10708" aria-labelledby="oth-2502.10708">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Injecting Domain-Specific Knowledge into Large Language Models: A Comprehensive Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Z">Zirui Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+B">Bin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+M">Miao Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingzhe Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+R">Rui Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiuying Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In processing </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated remarkable success in various tasks such as natural language understanding, text summarization, and machine translation. However, their general-purpose nature often limits their effectiveness in domain-specific applications that require specialized knowledge, such as healthcare, chemistry, or legal analysis. 
To address this, researchers have explored diverse methods to enhance LLMs by integrating domain-specific knowledge. In this survey, we provide a comprehensive overview of these methods, which we categorize into four key approaches: dynamic knowledge injection, static knowledge embedding, modular adapters, and prompt optimization. Each approach offers unique mechanisms to equip LLMs with domain expertise, balancing trade-offs between flexibility, scalability, and efficiency. We discuss how these methods enable LLMs to tackle specialized tasks, compare their advantages and disadvantages, evaluate domain-specific LLMs against general LLMs, and highlight the challenges and opportunities in this emerging field. For those interested in delving deeper into this area, we also summarize the commonly used datasets and benchmarks. To keep researchers updated on the latest studies, we maintain an open-source repository at: <a href="https://github.com/abilliyb/Knowledge_Injection_Survey_Papers" rel="external noopener nofollow" class="link-external link-https">this https URL</a>, dedicated to documenting research in the field of specialized LLMs. 
</p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2502.10709" title="Abstract" id="2502.10709"> arXiv:2502.10709 </a> [<a href="/pdf/2502.10709" title="Download PDF" id="pdf-2502.10709" aria-labelledby="pdf-2502.10709">pdf</a>, <a href="https://arxiv.org/html/2502.10709v1" title="View HTML" id="html-2502.10709" aria-labelledby="html-2502.10709" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10709" title="Other formats" id="oth-2502.10709" aria-labelledby="oth-2502.10709">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Empirical Analysis of Uncertainty in Large Language Model Evaluations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Q">Qiujie Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qingqiu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhuohao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuejie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Linyi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As LLM-as-a-Judge emerges as a new paradigm for assessing large language models (LLMs), concerns have been raised regarding the alignment, bias, and stability of LLM evaluators. While substantial work has focused on alignment and bias, little research has concentrated on the stability of LLM evaluators. 
In this paper, we conduct extensive experiments involving 9 widely used LLM evaluators across 2 different evaluation settings to investigate the uncertainty in model-based LLM evaluations. We pinpoint that LLM evaluators exhibit varying uncertainty based on model families and sizes. With careful comparative analyses, we find that employing special prompting strategies, whether during inference or post-training, can alleviate evaluation uncertainty to some extent. By utilizing uncertainty to enhance LLM's reliability and detection capability in Out-Of-Distribution (OOD) data, we further fine-tune an uncertainty-aware LLM evaluator named ConfiLM using a human-annotated fine-tuning set and assess ConfiLM's OOD evaluation ability on a manually designed test set sourced from the 2024 Olympics. Experimental results demonstrate that incorporating uncertainty as additional information during the fine-tuning phase can largely improve the model's evaluation performance in OOD scenarios. The code and data are released at: <a href="https://github.com/hasakiXie123/LLM-Evaluator-Uncertainty" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2502.10725" title="Abstract" id="2502.10725"> arXiv:2502.10725 </a> [<a href="/pdf/2502.10725" title="Download PDF" id="pdf-2502.10725" aria-labelledby="pdf-2502.10725">pdf</a>, <a href="https://arxiv.org/html/2502.10725v1" title="View HTML" id="html-2502.10725" aria-labelledby="html-2502.10725" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10725" title="Other formats" id="oth-2502.10725" aria-labelledby="oth-2502.10725">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PropNet: a White-Box and Human-Like Network for Sentence Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fei Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Transformer-based embedding methods have dominated the field of sentence representation in recent years. Although they have achieved remarkable performance on NLP missions, such as semantic textual similarity (STS) tasks, their black-box nature and large-data-driven training style have raised concerns, including issues related to bias, trust, and safety. Many efforts have been made to improve the interpretability of embedding models, but these problems have not been fundamentally resolved. To achieve inherent interpretability, we propose a purely white-box and human-like sentence representation network, PropNet. Inspired by findings from cognitive science, PropNet constructs a hierarchical network based on the propositions contained in a sentence. While experiments indicate that PropNet has a significant gap compared to state-of-the-art (SOTA) embedding models in STS tasks, case studies reveal substantial room for improvement. 
Additionally, PropNet enables us to analyze and understand the human cognitive processes underlying STS benchmarks. </p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2502.10735" title="Abstract" id="2502.10735"> arXiv:2502.10735 </a> [<a href="/pdf/2502.10735" title="Download PDF" id="pdf-2502.10735" aria-labelledby="pdf-2502.10735">pdf</a>, <a href="https://arxiv.org/html/2502.10735v1" title="View HTML" id="html-2502.10735" aria-labelledby="html-2502.10735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10735" title="Other formats" id="oth-2502.10735" aria-labelledby="oth-2502.10735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OPTISHEAR: Towards Efficient and Adaptive Pruning of Large Language Models via Evolutionary Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linqi Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Post-training pruning has emerged as a crucial optimization technique as large language models (LLMs) continue to grow rapidly. However, the significant variations in weight distributions across different LLMs make fixed pruning strategies inadequate for multiple models. In this paper, we introduce \textbf{\textsc{OptiShear}}, an efficient evolutionary optimization framework for adaptive LLM pruning. 
Our framework features two key innovations: an effective search space built on our Meta pruning metric to handle diverse weight distributions, and a model-wise reconstruction error for rapid evaluation during search trials. We employ Non-dominated Sorting Genetic Algorithm III (NSGA-III) to optimize both pruning metrics and layerwise sparsity ratios. Through extensive evaluation on LLaMA-1/2/3 and Mistral models (7B-70B) across multiple benchmarks, we demonstrate that our adaptive pruning metrics consistently outperform existing methods. Additionally, our discovered layerwise sparsity ratios enhance the effectiveness of other pruning metrics. The framework exhibits strong cross-task and cross-model generalizability, providing a cost-effective solution for model compression. </p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2502.10739" title="Abstract" id="2502.10739"> arXiv:2502.10739 </a> [<a href="/pdf/2502.10739" title="Download PDF" id="pdf-2502.10739" aria-labelledby="pdf-2502.10739">pdf</a>, <a href="https://arxiv.org/html/2502.10739v1" title="View HTML" id="html-2502.10739" aria-labelledby="html-2502.10739" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10739" title="Other formats" id="oth-2502.10739" aria-labelledby="oth-2502.10739">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BASE-SQL: A powerful open source Text-To-SQL baseline approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sheng,+L">Lei Sheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+S">Shuai-Shuai Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+W">Wei Xie</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress. 
16 pages, 3 figures, 8 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The conversion of natural language into SQL language for querying databases (Text-to-SQL) has broad application prospects and has attracted widespread attention. At present, the mainstream Text-to-SQL methods are mainly divided into in-context learning (ICL) based methods and supervised fine-tuning (SFT) based methods. ICL-based methods can achieve relatively good results thanks to the use of the most advanced closed-source models. However, in real-world application scenarios, factors such as data privacy, SQL generation efficiency and cost need to be considered. SFT-based methods have certain advantages. At present, methods based on fine-tuning of open source models lack easy-to-implement and effective (cost-effective) baseline methods. We propose a pipeline-based method using open source model fine-tuning, referred to as BASE-SQL, which includes four components: Schema Linking, Candidate SQL Generate, SQL Revision and SQL Merge Revision. Experimental results show that BASE-SQL uses the open source model Qwen2.5-Coder-32B-Instruct, and achieves an accuracy of 67.47% on the BIRD development set and 88.9% on the Spider test set, which is significantly better than other methods using open source models, and even exceeds several methods using the GPT-4o closed-source model. At the same time, BASE-SQL is easy to implement and highly efficient (on average, only five calls to the large language model are required to generate SQL once). The code will be open sourced at <a href="https://github.com/CycloneBoy/base_sql" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2502.10743" title="Abstract" id="2502.10743"> arXiv:2502.10743 </a> [<a href="/pdf/2502.10743" title="Download PDF" id="pdf-2502.10743" aria-labelledby="pdf-2502.10743">pdf</a>, <a href="https://arxiv.org/html/2502.10743v1" title="View HTML" id="html-2502.10743" aria-labelledby="html-2502.10743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10743" title="Other formats" id="oth-2502.10743" aria-labelledby="oth-2502.10743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 1bit-Merging: Dynamic Quantized Merging for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zehua Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiongwei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linqi Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in large language models have led to specialized models excelling in specific domains, creating a need for efficient model merging techniques. While traditional merging approaches combine parameters into a single static model, they often compromise task-specific performance. However, task-specific routing methods maintain accuracy but introduce substantial storage overhead. 
We present \texttt{1bit}-Merging, a novel framework that integrates task-specific routing with 1-bit quantized task vectors to balance performance and storage efficiency. Our approach leverages the observation that different task-specific models store knowledge in distinct layers-chat models primarily in attention layers and math/code models in MLP layers-enabling targeted compression strategies. Through extensive experiments with LLaMA2 and Mistral model families across chat, mathematical reasoning, and code generation tasks, we demonstrate that \texttt{1bit}-Merging achieves comparable or superior performance to existing methods while significantly reducing storage requirements. Our framework offers a practical solution for combining specialized models while maintaining their individual strengths and addressing the storage challenges of current approaches. </p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2502.10749" title="Abstract" id="2502.10749"> arXiv:2502.10749 </a> [<a href="/pdf/2502.10749" title="Download PDF" id="pdf-2502.10749" aria-labelledby="pdf-2502.10749">pdf</a>, <a href="https://arxiv.org/html/2502.10749v1" title="View HTML" id="html-2502.10749" aria-labelledby="html-2502.10749" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10749" title="Other formats" id="oth-2502.10749" aria-labelledby="oth-2502.10749">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LoRE-Merging: Exploring Low-Rank Estimation For Large Language Model Merging </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zehua Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Y">Yuxuan Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=She,+R">Ruifeng She</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiongwei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+T">Tao Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> While most current approaches rely on further training techniques, such as fine-tuning or reinforcement learning, to enhance model capacities, model merging stands out for its ability of improving models without requiring any additional training. In this paper, we propose a unified framework for model merging based on low-rank estimation of task vectors without the need for access to the base model, named \textsc{LoRE-Merging}. Our approach is motivated by the observation that task vectors from fine-tuned models frequently exhibit a limited number of dominant singular values, making low-rank estimations less prone to interference. We implement the method by formulating the merging problem as an optimization problem. Extensive empirical experiments demonstrate the effectiveness of our framework in mitigating interference and preserving task-specific information, thereby advancing the state-of-the-art performance in model merging techniques. 
</p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2502.10760" title="Abstract" id="2502.10760"> arXiv:2502.10760 </a> [<a href="/pdf/2502.10760" title="Download PDF" id="pdf-2502.10760" aria-labelledby="pdf-2502.10760">pdf</a>, <a href="https://arxiv.org/html/2502.10760v1" title="View HTML" id="html-2502.10760" aria-labelledby="html-2502.10760" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10760" title="Other formats" id="oth-2502.10760" aria-labelledby="oth-2502.10760">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Why is prompting hard? Understanding prompts on binary sequence predictors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wenliang,+L+K">Li Kevin Wenliang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruoss,+A">Anian Ruoss</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Grau-Moya,+J">Jordi Grau-Moya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hutter,+M">Marcus Hutter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Genewein,+T">Tim Genewein</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG); Machine Learning (stat.ML) </div> <p class='mathjax'> Large language models (LLMs) can be prompted to do many tasks, but finding good prompts is not always easy, nor is understanding some performant prompts. We explore these issues by viewing prompting as conditioning a near-optimal sequence predictor (LLM) pretrained on diverse data sources. Through numerous prompt search experiments, we show that the unintuitive patterns in optimal prompts can be better understood given the pretraining distribution, which is often unavailable in practice. 
Moreover, even using exhaustive search, reliably identifying optimal prompts from practical neural predictors can be difficult. Further, we demonstrate that common prompting methods, such as using intuitive prompts or samples from the targeted task, are in fact suboptimal. Thus, this work takes an initial step towards understanding the difficulties in finding and understanding optimal prompts from a statistical and empirical perspective. </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2502.10835" title="Abstract" id="2502.10835"> arXiv:2502.10835 </a> [<a href="/pdf/2502.10835" title="Download PDF" id="pdf-2502.10835" aria-labelledby="pdf-2502.10835">pdf</a>, <a href="https://arxiv.org/html/2502.10835v1" title="View HTML" id="html-2502.10835" aria-labelledby="html-2502.10835" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10835" title="Other formats" id="oth-2502.10835" aria-labelledby="oth-2502.10835">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Back Attention: Understanding and Enhancing Multi-Hop Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zeping Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ananiadou,+S">Sophia Ananiadou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We investigate how large language models perform latent multi-hop reasoning in prompts like "Wolfgang Amadeus Mozart's mother's spouse is". 
To analyze this process, we introduce logit flow, an interpretability method that traces how logits propagate across layers and positions toward the final prediction. Using logit flow, we identify four distinct stages in single-hop knowledge prediction: (A) entity subject enrichment, (B) entity attribute extraction, (C) relation subject enrichment, and (D) relation attribute extraction. Extending this analysis to multi-hop reasoning, we find that failures often stem from the relation attribute extraction stage, where conflicting logits reduce prediction accuracy. To address this, we propose back attention, a novel mechanism that enables lower layers to leverage higher-layer hidden states from different positions during attention computation. With back attention, a 1-layer transformer achieves the performance of a 2-layer transformer. Applied to four LLMs, back attention improves accuracy on five reasoning datasets, demonstrating its effectiveness in enhancing latent multi-hop reasoning ability. 
</p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2502.10852" title="Abstract" id="2502.10852"> arXiv:2502.10852 </a> [<a href="/pdf/2502.10852" title="Download PDF" id="pdf-2502.10852" aria-labelledby="pdf-2502.10852">pdf</a>, <a href="https://arxiv.org/html/2502.10852v1" title="View HTML" id="html-2502.10852" aria-labelledby="html-2502.10852" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10852" title="Other formats" id="oth-2502.10852" aria-labelledby="oth-2502.10852">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multilingual Encoder Knows more than You Realize: Shared Weights Pretraining for Extremely Low-Resource Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Z">Zeli Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Ziyin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+G">Guixian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jianing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">XU Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Ting Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Y">Yushuang Dong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> While multilingual language models like XLM-R have advanced multilingualism in NLP, they still perform poorly in extremely low-resource languages. This situation is exacerbated by the fact that modern LLMs such as LLaMA and Qwen support far fewer languages than XLM-R, making text generation models non-existent for many languages in the world. 
To tackle this challenge, we propose a novel framework for adapting multilingual encoders to text generation in extremely low-resource languages. By reusing the weights between the encoder and the decoder, our framework allows the model to leverage the learned semantic space of the encoder, enabling efficient learning and effective generalization in low-resource languages. Applying this framework to four Chinese minority languages, we present XLM-SWCM, and demonstrate its superior performance on various downstream tasks even when compared with much larger models. </p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2502.10855" title="Abstract" id="2502.10855"> arXiv:2502.10855 </a> [<a href="/pdf/2502.10855" title="Download PDF" id="pdf-2502.10855" aria-labelledby="pdf-2502.10855">pdf</a>, <a href="/format/2502.10855" title="Other formats" id="oth-2502.10855" aria-labelledby="oth-2502.10855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Effective Extraction and Evaluation of Factual Claims </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Metropolitansky,+D">Dasha Metropolitansky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Larson,+J">Jonathan Larson</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A common strategy for fact-checking long-form content generated by Large Language Models (LLMs) is extracting simple claims that can be verified independently. Since inaccurate or incomplete claims compromise fact-checking results, ensuring claim quality is critical. However, the lack of a standardized evaluation framework impedes assessment and comparison of claim extraction methods. 
To address this gap, we propose a framework for evaluating claim extraction in the context of fact-checking along with automated, scalable, and replicable methods for applying this framework, including novel approaches for measuring coverage and decontextualization. We also introduce Claimify, an LLM-based claim extraction method, and demonstrate that it outperforms existing methods under our evaluation framework. A key feature of Claimify is its ability to handle ambiguity and extract claims only when there is high confidence in the correct interpretation of the source text. </p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2502.10857" title="Abstract" id="2502.10857"> arXiv:2502.10857 </a> [<a href="/pdf/2502.10857" title="Download PDF" id="pdf-2502.10857" aria-labelledby="pdf-2502.10857">pdf</a>, <a href="https://arxiv.org/html/2502.10857v1" title="View HTML" id="html-2502.10857" aria-labelledby="html-2502.10857" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10857" title="Other formats" id="oth-2502.10857" aria-labelledby="oth-2502.10857">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Divergent Thoughts toward One Goal: LLM-based Multi-Agent Collaboration System for Electronic Design Automation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Haoyuan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Haisheng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zhuolun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Bei Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, with the development of tool-calling capabilities in large language models (LLMs), these models have demonstrated 
significant potential for automating electronic design automation (EDA) flows by interacting with EDA tool APIs via EDA scripts. However, considering the limited understanding of EDA tools, LLMs face challenges in practical scenarios where diverse interfaces of EDA tools exist across different platforms. Additionally, EDA flow automation often involves intricate, long-chain tool-calling processes, increasing the likelihood of errors in intermediate steps. Any errors will lead to the instability and failure of EDA flow automation. To address these challenges, we introduce EDAid, a multi-agent collaboration system where multiple agents harboring divergent thoughts converge towards a common goal, ensuring reliable and successful EDA flow automation. Specifically, each agent is controlled by ChipLlama models, which are expert LLMs fine-tuned for EDA flow automation. Our experiments demonstrate the state-of-the-art (SOTA) performance of our ChipLlama models and validate the effectiveness of our EDAid in the automation of complex EDA flows, showcasing superior performance compared to single-agent systems. 
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2502.10868" title="Abstract" id="2502.10868"> arXiv:2502.10868 </a> [<a href="/pdf/2502.10868" title="Download PDF" id="pdf-2502.10868" aria-labelledby="pdf-2502.10868">pdf</a>, <a href="https://arxiv.org/html/2502.10868v1" title="View HTML" id="html-2502.10868" aria-labelledby="html-2502.10868" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10868" title="Other formats" id="oth-2502.10868" aria-labelledby="oth-2502.10868">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NitiBench: A Comprehensive Studies of LLM Frameworks Capabilities for Thai Legal Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Akarajaradwong,+P">Pawitsapak Akarajaradwong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pothavorn,+P">Pirat Pothavorn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chaksangchaichot,+C">Chompakorn Chaksangchaichot</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tasawong,+P">Panuthep Tasawong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nopparatbundit,+T">Thitiwat Nopparatbundit</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nutanong,+S">Sarana Nutanong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The application of large language models (LLMs) in the legal domain holds significant potential for information retrieval and question answering, yet Thai legal QA systems face challenges due to a lack of standardized evaluation benchmarks and the complexity of Thai legal structures. 
This paper introduces NitiBench, a benchmark comprising two datasets: the NitiBench-CCL, covering general Thai financial law, and the NitiBench-Tax, which includes real-world tax law cases requiring advanced legal reasoning. We evaluate retrieval-augmented generation (RAG) and long-context LLM-based approaches to address three key research questions: the impact of domain-specific components like section-based chunking and cross-referencing, the comparative performance of different retrievers and LLMs, and the viability of long-context LLMs as an alternative to RAG. Our results show that section-based chunking significantly improves retrieval and end-to-end performance, current retrievers struggle with complex queries, and long-context LLMs still underperform RAG-based systems in Thai legal QA. To support fair evaluation, we propose tailored multi-label retrieval metrics and an LLM-as-judge method for coverage and contradiction detection. These findings highlight the limitations of current Thai legal NLP solutions and provide a foundation for future research in the field. We have also open-sourced our code and dataset to make them publicly available. 
</p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2502.10871" title="Abstract" id="2502.10871"> arXiv:2502.10871 </a> [<a href="/pdf/2502.10871" title="Download PDF" id="pdf-2502.10871" aria-labelledby="pdf-2502.10871">pdf</a>, <a href="https://arxiv.org/html/2502.10871v1" title="View HTML" id="html-2502.10871" aria-labelledby="html-2502.10871" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10871" title="Other formats" id="oth-2502.10871" aria-labelledby="oth-2502.10871">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Representation and Recall of Interwoven Structured Knowledge in LLMs: A Geometric and Layered Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+G">Ge Lei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cooper,+S+J">Samuel J. Cooper</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> This study investigates how large language models (LLMs) represent and recall multi-associated attributes across transformer layers. We show that intermediate layers encode factual knowledge by superimposing related attributes in overlapping spaces, along with effective recall even when attributes are not explicitly prompted. In contrast, later layers refine linguistic patterns and progressively separate attribute representations, optimizing task-specific outputs while appropriately narrowing attribute recall. We identify diverse encoding patterns including, for the first time, the observation of 3D spiral structures when exploring information related to the periodic table of elements. 
Our findings reveal a dynamic transition in attribute representations across layers, contributing to mechanistic interpretability and providing insights for understanding how LLMs handle complex, interrelated knowledge. </p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2502.10881" title="Abstract" id="2502.10881"> arXiv:2502.10881 </a> [<a href="/pdf/2502.10881" title="Download PDF" id="pdf-2502.10881" aria-labelledby="pdf-2502.10881">pdf</a>, <a href="https://arxiv.org/html/2502.10881v1" title="View HTML" id="html-2502.10881" aria-labelledby="html-2502.10881" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10881" title="Other formats" id="oth-2502.10881" aria-labelledby="oth-2502.10881">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CiteCheck: Towards Accurate Citation Faithfulness Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Ziyao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+S">Shaohang Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Z">Zhuoheng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+J">Jing Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zhe Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaoguang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+H">Haochen Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhijiang Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Houfeng Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Citation faithfulness detection is critical for enhancing retrieval-augmented generation (RAG) systems, yet 
large-scale Chinese datasets for this task are scarce. Existing methods face prohibitive costs due to the need for manually annotated negative samples. To address this, we introduce the first large-scale Chinese dataset CiteCheck for citation faithfulness detection, constructed via a cost-effective approach using two-stage manual annotation. This method balances positive and negative samples while significantly reducing annotation expenses. CiteCheck comprises training and test splits. Experiments demonstrate that: (1) the test samples are highly challenging, with even state-of-the-art LLMs failing to achieve high accuracy; and (2) training data augmented with LLM-generated negative samples enables smaller models to attain strong performance using parameter-efficient fine-tuning. CiteCheck provides a robust foundation for advancing citation faithfulness detection in Chinese RAG systems. The dataset is publicly available to facilitate research. </p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2502.10886" title="Abstract" id="2502.10886"> arXiv:2502.10886 </a> [<a href="/pdf/2502.10886" title="Download PDF" id="pdf-2502.10886" aria-labelledby="pdf-2502.10886">pdf</a>, <a href="https://arxiv.org/html/2502.10886v1" title="View HTML" id="html-2502.10886" aria-labelledby="html-2502.10886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10886" title="Other formats" id="oth-2502.10886" aria-labelledby="oth-2502.10886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MET-Bench: Multimodal Entity Tracking for Evaluating the Limitations of Vision-Language and Reasoning Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cohen,+V">Vanya Cohen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mooney,+R">Raymond Mooney</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Entity tracking is a fundamental challenge in natural language understanding, requiring models to maintain coherent representations of entities. Previous work has benchmarked entity tracking performance in purely text-based tasks. We introduce MET-Bench, a multimodal entity tracking benchmark designed to evaluate the ability of vision-language models to track entity states across modalities. Using two structured domains, Chess and the Shell Game, we assess how effectively current models integrate textual and image-based state updates. Our findings reveal a significant performance gap between text-based and image-based tracking and that this performance gap stems from deficits in visual reasoning rather than perception. We further show that explicit text-based reasoning strategies improve performance, yet substantial limitations remain, especially in long-horizon multimodal scenarios. Our results highlight the need for improved multimodal representations and reasoning techniques to bridge the gap between textual and visual entity tracking. 
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2502.10896" title="Abstract" id="2502.10896"> arXiv:2502.10896 </a> [<a href="/pdf/2502.10896" title="Download PDF" id="pdf-2502.10896" aria-labelledby="pdf-2502.10896">pdf</a>, <a href="/format/2502.10896" title="Other formats" id="oth-2502.10896" aria-labelledby="oth-2502.10896">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developing Conversational Speech Systems for Robots to Detect Speech Biomarkers of Cognition in People Living with Dementia </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Perumandla,+R">Rohith Perumandla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bae,+Y">Young-Ho Bae</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Izaguirre,+D">Diego Izaguirre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+E">Esther Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Murphy,+A">Andrew Murphy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+L">Long-Jing Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sabanovic,+S">Selma Sabanovic</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bennett,+C+C">Casey C. Bennett</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Main paper 28 pages long (pg 2-30), includes 5 figures, 5 tables, 1 Appendix at end </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study presents the development and testing of a conversational speech system designed for robots to detect speech biomarkers indicative of cognitive impairments in people living with dementia (PLwD). 
The system integrates a backend Python WebSocket server and a central core module with a large language model (LLM) fine-tuned for dementia to process user input and generate robotic conversation responses in real-time in less than 1.5 seconds. The frontend user interface, a Progressive Web App (PWA), displays information and biomarker score graphs on a smartphone in real-time to human users (PLwD, caregivers, clinicians). Six speech biomarkers based on the existing literature - Altered Grammar, Pragmatic Impairments, Anomia, Disrupted Turn-Taking, Slurred Pronunciation, and Prosody Changes - were developed for the robot conversation system using two datasets, one that included conversations of PLwD with a human clinician (DementiaBank dataset) and one that included conversations of PLwD with a robot (Indiana dataset). We also created a composite speech biomarker that combined all six individual biomarkers into a single score. The speech system's performance was first evaluated on the DementiaBank dataset showing moderate correlation with MMSE scores, with the composite biomarker score outperforming individual biomarkers. Analysis of the Indiana dataset revealed higher and more variable biomarker scores, suggesting potential differences due to study populations (e.g. severity of dementia) and the conversational scenario (human-robot conversations are different from human-human). The findings underscore the need for further research on the impact of conversational scenarios on speech biomarkers and the potential clinical applications of robotic speech systems. 
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2502.10916" title="Abstract" id="2502.10916"> arXiv:2502.10916 </a> [<a href="/pdf/2502.10916" title="Download PDF" id="pdf-2502.10916" aria-labelledby="pdf-2502.10916">pdf</a>, <a href="https://arxiv.org/html/2502.10916v1" title="View HTML" id="html-2502.10916" aria-labelledby="html-2502.10916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10916" title="Other formats" id="oth-2502.10916" aria-labelledby="oth-2502.10916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Conversational Agents from Open-Source Large Language Models with Illocutionary Force and Document-Based Knowledge Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Inyama,+G">Godfrey Inyama</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 1 figure, 7 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> In this paper, we first present a novel way of computationally analysing and extracting illocutionary forces from dialogue using Bert-based Large Language Models, and demonstrate how these features impact the response of a conversational agent guided by a document-based knowledge bank demonstrated by a bespoke web conversational chat agent system developed. Our proposed illocutionary force extraction and classification technique is the first of its kind using the Argument Interchange Format (AIF) Dataset, showing an improved performance compared to two methods for carrying out similar tasks with a macro F1 of approximately 45%. 
When we evaluated the system based on 2 knowledge files, with 2 user queries each, across 5 open-source large language models (LLMs) using 10 standard metrics we found out that larger open-source models, such as Llama2:13b and Llama3-chatqa-latest, demonstrated an improved alignment when the user illocutionary force was included with their query, achieving higher QA and linguistic similarity scores. The smaller models on the other hand like Tinyllama:latest showed an increased perplexity and mixed performance, which explicitly indicated struggles in processing queries that explicitly included illocutionary forces. The results from the analysis highlight the potential of illocutionary force to enhance conversational depth while underscoring the need for model-specific optimizations to address increased computational costs and response times. </p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2502.10921" title="Abstract" id="2502.10921"> arXiv:2502.10921 </a> [<a href="/pdf/2502.10921" title="Download PDF" id="pdf-2502.10921" aria-labelledby="pdf-2502.10921">pdf</a>, <a href="https://arxiv.org/html/2502.10921v1" title="View HTML" id="html-2502.10921" aria-labelledby="html-2502.10921" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10921" title="Other formats" id="oth-2502.10921" aria-labelledby="oth-2502.10921">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evolving Hate Speech Online: An Adaptive Framework for Detection and Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ali,+S">Shiza Ali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stringhini,+G">Gianluca Stringhini</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Social and Information Networks (cs.SI) </div> <p class='mathjax'> The 
proliferation of social media platforms has led to an increase in the spread of hate speech, particularly targeting vulnerable communities. Unfortunately, existing methods for automatically identifying and blocking toxic language rely on pre-constructed lexicons, making them reactive rather than adaptive. As such, these approaches become less effective over time, especially when new communities are targeted with slurs not included in the original datasets. To address this issue, we present an adaptive approach that uses word embeddings to update lexicons and develop a hybrid model that adjusts to emerging slurs and new linguistic patterns. This approach can effectively detect toxic language, including intentional spelling mistakes employed by aggressors to avoid detection. Our hybrid model, which combines BERT with lexicon-based techniques, achieves an accuracy of 95% for most state-of-the-art datasets. Our work has significant implications for creating safer online environments by improving the detection of toxic content and proactively updating the lexicon. Content Warning: This paper contains examples of hate speech that may be triggering. 
</p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2502.10934" title="Abstract" id="2502.10934"> arXiv:2502.10934 </a> [<a href="/pdf/2502.10934" title="Download PDF" id="pdf-2502.10934" aria-labelledby="pdf-2502.10934">pdf</a>, <a href="/format/2502.10934" title="Other formats" id="oth-2502.10934" aria-labelledby="oth-2502.10934">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fundamental Principles of Linguistic Structure are Not Represented by o3 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Murphy,+E">Elliot Murphy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leivada,+E">Evelina Leivada</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dentella,+V">Vittoria Dentella</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gunther,+F">Fritz Gunther</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Marcus,+G">Gary Marcus</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A core component of a successful artificial general intelligence would be the rapid creation and manipulation of grounded compositional abstractions and the demonstration of expertise in the family of recursive hierarchical syntactic objects necessary for the creative use of human language. 
We evaluated the recently released o3 model (OpenAI; o3-mini-high) and discovered that while it succeeds on some basic linguistic tests relying on linear, surface statistics (e.g., the Strawberry Test), it fails to generalize basic phrase structure rules; it fails with comparative sentences involving semantically illegal cardinality comparisons ('Escher sentences'); it fails to correctly rate and explain acceptability dynamics; and it fails to distinguish between instructions to generate unacceptable semantic vs. unacceptable syntactic outputs. When tasked with generating simple violations of grammatical rules, it is seemingly incapable of representing multiple parses to evaluate against various possible semantic interpretations. In stark contrast to many recent claims that artificial language models are on the verge of replacing the field of linguistics, our results suggest not only that deep learning is hitting a wall with respect to compositionality (Marcus 2022), but that it is hitting [a [stubbornly [resilient wall]]] that cannot readily be surmounted to reach human-like compositional reasoning simply through more compute. 
</p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2502.10942" title="Abstract" id="2502.10942"> arXiv:2502.10942 </a> [<a href="/pdf/2502.10942" title="Download PDF" id="pdf-2502.10942" aria-labelledby="pdf-2502.10942">pdf</a>, <a href="https://arxiv.org/html/2502.10942v1" title="View HTML" id="html-2502.10942" aria-labelledby="html-2502.10942" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10942" title="Other formats" id="oth-2502.10942" aria-labelledby="oth-2502.10942">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Contextual Flux in Large Language Models: A Novel Approach to Self-Modulating Semantic Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Evidail,+H">Henry Evidail</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mountebank,+Z">Zachary Mountebank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hathersage,+A">Alistair Hathersage</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stanhope,+P">Peter Stanhope</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ravenscroft,+B">Basil Ravenscroft</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Waddingham,+T">Tobias Waddingham</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Self-modulating mechanisms introduce dynamic adaptation capabilities within language models through contextual realignment strategies that influence token embedding trajectories across extended sequences. Contextual Flux is explored as an approach to embedding modulation, integrating an auxiliary gating mechanism within the self-attention framework to dynamically adjust token representations based on evolving contextual dependencies. 
The empirical analysis evaluates entropy variations, latent space realignments, and coherence stability to assess the extent to which self-regulation enhances text generation consistency while preserving generative flexibility. Quantitative assessments suggest that embedding shifts contribute to more structured adaptation in long-form sequences, with measured reductions in redundant phrase repetitions and improvements in thematic retention. Variability in contextual weight computation affects modulation stability, leading to differing levels of adaptation across diverse linguistic structures. The computational demands introduced through real-time embedding reconfiguration are examined in relation to model scalability, emphasizing the need for optimization strategies in high-volume generative applications. The findings suggest that while adaptive embedding updates improve certain aspects of coherence, their impact remains contingent on model capacity and input complexity. </p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2502.10966" title="Abstract" id="2502.10966"> arXiv:2502.10966 </a> [<a href="/pdf/2502.10966" title="Download PDF" id="pdf-2502.10966" aria-labelledby="pdf-2502.10966">pdf</a>, <a href="https://arxiv.org/html/2502.10966v1" title="View HTML" id="html-2502.10966" aria-labelledby="html-2502.10966" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10966" title="Other formats" id="oth-2502.10966" aria-labelledby="oth-2502.10966">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neural Networks Remember More: The Power of Parameter Isolation and Combination </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+B">Biqing Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zehan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ayesh,+A">Aladdin Ayesh</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Catastrophic forgetting is a pervasive issue for pre-trained language models (PLMs) during continual learning, where models lose previously acquired knowledge when sequentially trained on a series of tasks. The model's ability to retain old tasks is referred to as stability, while its adaptability to new tasks is called plasticity. Therefore, the key to solving this problem is to find a trade-off between the plasticity and stability of the model. To address this issue, in this paper, we propose a novel method to achieve a balance between model stability and plasticity, thereby mitigating catastrophic forgetting. More specifically, our proposed approach leverages parameter isolation and a subsequent combination strategy. Initially, in the training stage, the model adapts to each downstream task via a parameter isolation method to prevent potential interference among different tasks. We then combine all trained parameters, which contain acquired knowledge, using the task arithmetic method and finally apply them to the backbone model. Empirical evaluations on continual language learning benchmarks substantiate the effectiveness of our approach, revealing a marked enhancement over existing state-of-the-art approaches. 
</p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2502.10973" title="Abstract" id="2502.10973"> arXiv:2502.10973 </a> [<a href="/pdf/2502.10973" title="Download PDF" id="pdf-2502.10973" aria-labelledby="pdf-2502.10973">pdf</a>, <a href="https://arxiv.org/html/2502.10973v1" title="View HTML" id="html-2502.10973" aria-labelledby="html-2502.10973" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10973" title="Other formats" id="oth-2502.10973" aria-labelledby="oth-2502.10973">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Akan Cinematic Emotions (ACE): A Multimodal Multi-party Dataset for Emotion Recognition in Movie Dialogues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sasu,+D">David Sasu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zehui Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Ziwei Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Run Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+P">Pengyuan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+L">Lin Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hirschberg,+J">Julia Hirschberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schluter,+N">Natalie Schluter</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this paper, we introduce the Akan Conversation Emotion (ACE) dataset, the first multimodal emotion dialogue dataset for an African language, addressing the significant lack of resources for low-resource languages in emotion recognition research. 
ACE, developed for the Akan language, contains 385 emotion-labeled dialogues and 6,162 utterances across audio, visual, and textual modalities, along with word-level prosodic prominence annotations. The presence of prosodic labels in this dataset also makes it the first prosodically annotated African language dataset. We demonstrate the quality and utility of ACE through experiments using state-of-the-art emotion recognition methods, establishing solid baselines for future research. We hope ACE inspires further work on inclusive, linguistically and culturally diverse NLP resources. </p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2502.10990" title="Abstract" id="2502.10990"> arXiv:2502.10990 </a> [<a href="/pdf/2502.10990" title="Download PDF" id="pdf-2502.10990" aria-labelledby="pdf-2502.10990">pdf</a>, <a href="https://arxiv.org/html/2502.10990v1" title="View HTML" id="html-2502.10990" aria-labelledby="html-2502.10990" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10990" title="Other formats" id="oth-2502.10990" aria-labelledby="oth-2502.10990">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FinMTEB: Finance Massive Text Embedding Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yixuan Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://github.com/yixuantt/FinMTEB" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Embedding models play a crucial role in representing and retrieving information across various NLP 
applications. Recent advances in large language models (LLMs) have further enhanced the performance of embedding models. While these models are often benchmarked on general-purpose datasets, real-world applications demand domain-specific evaluation. In this work, we introduce the Finance Massive Text Embedding Benchmark (FinMTEB), a specialized counterpart to MTEB designed for the financial domain. FinMTEB comprises 64 financial domain-specific embedding datasets across 7 tasks that cover diverse textual types in both Chinese and English, such as financial news articles, corporate annual reports, ESG reports, regulatory filings, and earnings call transcripts. We also develop a finance-adapted model, FinPersona-E5, using a persona-based data synthetic method to cover diverse financial embedding tasks for training. Through extensive evaluation of 15 embedding models, including FinPersona-E5, we show three key findings: (1) performance on general-purpose benchmarks shows limited correlation with financial domain tasks; (2) domain-adapted models consistently outperform their general-purpose counterparts; and (3) surprisingly, a simple Bag-of-Words (BoW) approach outperforms sophisticated dense embeddings in financial Semantic Textual Similarity (STS) tasks, underscoring current limitations in dense embedding techniques. Our work establishes a robust evaluation framework for financial NLP applications and provides crucial insights for developing domain-specific embedding models. 
</p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2502.10993" title="Abstract" id="2502.10993"> arXiv:2502.10993 </a> [<a href="/pdf/2502.10993" title="Download PDF" id="pdf-2502.10993" aria-labelledby="pdf-2502.10993">pdf</a>, <a href="https://arxiv.org/html/2502.10993v1" title="View HTML" id="html-2502.10993" aria-labelledby="html-2502.10993" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10993" title="Other formats" id="oth-2502.10993" aria-labelledby="oth-2502.10993">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoseRAG: Robust Retrieval-augmented Generation with Small-scale LLMs via Margin-aware Preference Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianci Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+H">Haoxiang Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianze Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Ran Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yue Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linjun Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tuo Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyu Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have achieved impressive performance but face high computational costs and latency, limiting their deployment in resource-constrained settings. In contrast, small-scale LLMs (SLMs) are more efficient yet struggle to capture evolving real-world knowledge. 
Retrieval-augmented generation (RAG) helps by integrating external knowledge, but imperfect retrieval can introduce distracting noise that misleads SLMs. We propose RoseRAG, a robust RAG framework for SLMs via Margin-aware Preference Optimization. RoseRAG employs multi-turn prompting for detailed reasoning, rejection sampling for high-quality explanations, and contrastive preference selection to refine responses by maximizing the likelihood gap between preferred and non-preferred outputs. By integrating these components into a margin-aware optimization process, RoseRAG robustly enhances the accuracy and reliability of SLMs for RAG applications. Extensive experiments on three open-domain question answering benchmarks indicate that our innovative RoseRAG surpasses state-of-the-art baselines significantly. </p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2502.10995" title="Abstract" id="2502.10995"> arXiv:2502.10995 </a> [<a href="/pdf/2502.10995" title="Download PDF" id="pdf-2502.10995" aria-labelledby="pdf-2502.10995">pdf</a>, <a href="https://arxiv.org/html/2502.10995v1" title="View HTML" id="html-2502.10995" aria-labelledby="html-2502.10995" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10995" title="Other formats" id="oth-2502.10995" aria-labelledby="oth-2502.10995">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating Large language models on Understanding Korean indirect Speech acts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koo,+Y">Youngeun Koo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jiwoo Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+D">Dojun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+S">Seohyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sungeun Lee</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review (15 pages) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To accurately understand the intention of an utterance is crucial in conversational communication. As conversational artificial intelligence models are rapidly being developed and applied in various fields, it is important to evaluate the LLMs' capabilities of understanding the intentions of user's utterance. This study evaluates whether current LLMs can understand the intention of an utterance by considering the given conversational context, particularly in cases where the actual intention differs from the surface-leveled, literal intention of the sentence, i.e. indirect speech acts. Our findings reveal that Claude3-Opus outperformed the other competing models, with 71.94% in MCQ and 65% in OEQ, showing a clear advantage. In general, proprietary models exhibited relatively higher performance compared to open-source models. Nevertheless, no LLMs reached the level of human performance. Most LLMs, except for Claude3-Opus, demonstrated significantly lower performance in understanding indirect speech acts compared to direct speech acts, where the intention is explicitly revealed through the utterance. This study not only performs an overall pragmatic evaluation of each LLM's language use through the analysis of OEQ response patterns, but also emphasizes the necessity for further research to improve LLMs' understanding of indirect speech acts for more natural communication with humans. 
</p> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2502.10996" title="Abstract" id="2502.10996"> arXiv:2502.10996 </a> [<a href="/pdf/2502.10996" title="Download PDF" id="pdf-2502.10996" aria-labelledby="pdf-2502.10996">pdf</a>, <a href="https://arxiv.org/html/2502.10996v1" title="View HTML" id="html-2502.10996" aria-labelledby="html-2502.10996" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10996" title="Other formats" id="oth-2502.10996" aria-labelledby="oth-2502.10996">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RAS: Retrieval-And-Structuring for Knowledge-Intensive LLM Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+P">Pengcheng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+L">Lang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+R">Ruike Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Minhao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yunyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jimeng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiawei Han</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented language models often struggle with knowledge-intensive tasks due to inefficient retrieval, unstructured knowledge integration, and single-pass architectures. We present Retrieval-And-Structuring (RAS), a novel framework that dynamically constructs and reasons over query-specific knowledge graphs through iterative retrieval and structuring. 
RAS introduces four key technical innovations: (1) a theme-scoped retrieval mechanism that efficiently narrows the search space while maintaining retrieval quality, (2) an action planning module that determines knowledge needs and generates focused sub-queries, (3) a dynamic knowledge structuring approach that converts retrieved text into an evolving knowledge graph, and (4) a graph-augmented answering component that leverages the accumulated structured information. Our framework achieves state-of-the-art performance, surpassing leading baselines by 6.4% with open-source language models and 7.0% with proprietary models on seven knowledge-intensive generation datasets across all evaluation metrics. Detailed ablation studies verify the contribution of each technical component to the overall system performance. </p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2502.11008" title="Abstract" id="2502.11008"> arXiv:2502.11008 </a> [<a href="/pdf/2502.11008" title="Download PDF" id="pdf-2502.11008" aria-labelledby="pdf-2502.11008">pdf</a>, <a href="https://arxiv.org/html/2502.11008v1" title="View HTML" id="html-2502.11008" aria-labelledby="html-2502.11008" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11008" title="Other formats" id="oth-2502.11008" aria-labelledby="oth-2502.11008">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CounterBench: A Benchmark for Counterfactuals Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuefei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=K.Singh,+V">Vivek K.Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruxiang Tang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Counterfactual reasoning is widely recognized as one of the most challenging and intricate aspects of causality in artificial intelligence. In this paper, we evaluate the performance of large language models (LLMs) in counterfactual reasoning. In contrast to previous studies that primarily focus on commonsense causal reasoning, where LLMs often rely on prior knowledge for inference, we specifically assess their ability to perform counterfactual inference using a set of formal rules. To support this evaluation, we introduce a new benchmark dataset, CounterBench, comprising 1K counterfactual reasoning questions. The dataset is designed with varying levels of difficulty, diverse causal graph structures, distinct types of counterfactual questions, and multiple nonsensical name variants. Our experiments demonstrate that counterfactual reasoning poses a significant challenge for LLMs, with most models performing at levels comparable to random guessing. To enhance LLM's counterfactual reasoning ability, we propose a novel reasoning paradigm, CoIn, which guides LLMs through iterative reasoning and backtracking to systematically explore counterfactual solutions. Experimental results show that our method significantly improves LLM performance on counterfactual reasoning tasks and consistently enhances performance across different LLMs. Our dataset is available at <a href="https://huggingface.co/datasets/CounterBench/CounterBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2502.11018" title="Abstract" id="2502.11018"> arXiv:2502.11018 </a> [<a href="/pdf/2502.11018" title="Download PDF" id="pdf-2502.11018" aria-labelledby="pdf-2502.11018">pdf</a>, <a href="https://arxiv.org/html/2502.11018v1" title="View HTML" id="html-2502.11018" aria-labelledby="html-2502.11018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11018" title="Other formats" id="oth-2502.11018" aria-labelledby="oth-2502.11018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GRIFFIN: Effective Token Alignment for Faster Speculative Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Shijing Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+X">Xingyu Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Z">Zhihui Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toh,+K">Kim-Chuan Toh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Pan Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Speculative decoding accelerates inference in large language models (LLMs) by generating multiple draft tokens simultaneously. However, existing methods often struggle with token misalignment between the training and decoding phases, limiting their performance. To address this, we propose GRIFFIN, a novel framework that incorporates a token-alignable training strategy and a token-alignable draft model to mitigate misalignment. 
The training strategy employs a loss masking mechanism to exclude highly misaligned tokens during training, preventing them from negatively impacting the draft model's optimization. The token-alignable draft model introduces input tokens to correct inconsistencies in generated features. Experiments on LLaMA-series and Vicuna models demonstrate that GRIFFIN achieves an average acceptance length improvement of over 7% and a speedup ratio exceeding 8%, outperforming current SoTAs as shown in Fig. 1 (a) and (b). </p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2502.11020" title="Abstract" id="2502.11020"> arXiv:2502.11020 </a> [<a href="/pdf/2502.11020" title="Download PDF" id="pdf-2502.11020" aria-labelledby="pdf-2502.11020">pdf</a>, <a href="https://arxiv.org/html/2502.11020v1" title="View HTML" id="html-2502.11020" aria-labelledby="html-2502.11020" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11020" title="Other formats" id="oth-2502.11020" aria-labelledby="oth-2502.11020">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Isbarov,+J">Jafar Isbarov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Akhundjanova,+A">Arofat Akhundjanova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hajili,+M">Mammad Hajili</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huseynova,+K">Kavsar Huseynova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gaynullin,+D">Dmitry Gaynullin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rzayev,+A">Anar Rzayev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tursun,+O">Osman Tursun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saetov,+I">Ilshat 
Saetov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kharisov,+R">Rinat Kharisov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belginova,+S">Saule Belginova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kenbayeva,+A">Ariana Kenbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alisheva,+A">Amina Alisheva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Turdubaeva,+A">Aizirek Turdubaeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=K%C3%B6ksal,+A">Abdullatif Köksal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rustamov,+S">Samir Rustamov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ataman,+D">Duygu Ataman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Being able to thoroughly assess massive multi-task language understanding (MMLU) capabilities is essential for advancing the applicability of multilingual language models. However, preparing such benchmarks in high quality native language is often costly and therefore limits the representativeness of evaluation datasets. While recent efforts focused on building more inclusive MMLU benchmarks, these are conventionally built using machine translation from high-resource languages, which may introduce errors and fail to account for the linguistic and cultural intricacies of the target languages. In this paper, we address the lack of native language MMLU benchmark especially in the under-represented Turkic language family with distinct morphosyntactic and cultural characteristics. We propose two benchmarks for Turkic language MMLU: TUMLU is a comprehensive, multilingual, and natively developed language understanding benchmark specifically designed for Turkic languages. 
It consists of middle- and high-school level questions spanning 11 academic subjects in Azerbaijani, Crimean Tatar, Karakalpak, Kazakh, Tatar, Turkish, Uyghur, and Uzbek. We also present TUMLU-mini, a more concise, balanced, and manually verified subset of the dataset. Using this dataset, we systematically evaluate a diverse range of open and proprietary multilingual large language models (LLMs), including Claude, Gemini, GPT, and LLaMA, offering an in-depth analysis of their performance across different languages, subjects, and alphabets. To promote further research and development in multilingual language understanding, we release TUMLU-mini and all corresponding evaluation scripts. </p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2502.11022" title="Abstract" id="2502.11022"> arXiv:2502.11022 </a> [<a href="/pdf/2502.11022" title="Download PDF" id="pdf-2502.11022" aria-labelledby="pdf-2502.11022">pdf</a>, <a href="https://arxiv.org/html/2502.11022v1" title="View HTML" id="html-2502.11022" aria-labelledby="html-2502.11022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11022" title="Other formats" id="oth-2502.11022" aria-labelledby="oth-2502.11022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MultiTEND: A Multilingual Benchmark for Natural Language to NoSQL Query Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Z">Zhiqian Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuanfeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jinwei Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuanwei Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shuaimin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C+J">Chen Jason Zhang</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Natural language interfaces for NoSQL databases are increasingly vital in the big data era, enabling users to interact with complex, unstructured data without deep technical expertise. However, most recent advancements focus on English, leaving a gap for multilingual support. This paper introduces MultiTEND, the first and largest multilingual benchmark for natural language to NoSQL query generation, covering six languages: English, German, French, Russian, Japanese and Mandarin Chinese. Using MultiTEND, we analyze challenges in translating natural language to NoSQL queries across diverse linguistic structures, including lexical and syntactic differences. Experiments show that performance accuracy in both English and non-English settings remains relatively low, with a 4%-6% gap across scenarios like fine-tuned SLM, zero-shot LLM, and RAG for LLM. To address the aforementioned challenges, we introduce MultiLink, a novel framework that bridges the multilingual input to NoSQL query generation gap through a Parallel Linking Process. It breaks down the task into multiple steps, integrating parallel multilingual processing, Chain-of-Thought (CoT) reasoning, and Retrieval-Augmented Generation (RAG) to tackle lexical and structural challenges inherent in multilingual NoSQL generation. MultiLink shows enhancements in all metrics for every language against the top baseline, boosting execution accuracy by about 15% for English and averaging a 10% improvement for non-English languages. 
</p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2502.11028" title="Abstract" id="2502.11028"> arXiv:2502.11028 </a> [<a href="/pdf/2502.11028" title="Download PDF" id="pdf-2502.11028" aria-labelledby="pdf-2502.11028">pdf</a>, <a href="https://arxiv.org/html/2502.11028v1" title="View HTML" id="html-2502.11028" aria-labelledby="html-2502.11028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11028" title="Other formats" id="oth-2502.11028" aria-labelledby="oth-2502.11028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mind the Confidence Gap: Overconfidence, Calibration, and Distractor Effects in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chhikara,+P">Prateek Chhikara</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate impressive performance across diverse tasks, yet confidence calibration remains a challenge. Miscalibration - where models are overconfident or underconfident - poses risks, particularly in high-stakes applications. This paper presents an empirical study on LLM calibration, examining how model size, distractors, and question types affect confidence alignment. We introduce an evaluation framework to measure overconfidence and investigate whether multiple-choice formats mitigate or worsen miscalibration. Our findings show that while larger models (e.g., GPT-4o) are better calibrated overall, they are more prone to distraction, whereas smaller models benefit more from answer choices but struggle with uncertainty estimation. Unlike prior work, which primarily reports miscalibration trends, we provide actionable insights into failure modes and conditions that worsen overconfidence. 
These findings highlight the need for calibration-aware interventions and improved uncertainty estimation methods. </p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2502.11051" title="Abstract" id="2502.11051"> arXiv:2502.11051 </a> [<a href="/pdf/2502.11051" title="Download PDF" id="pdf-2502.11051" aria-labelledby="pdf-2502.11051">pdf</a>, <a href="/format/2502.11051" title="Other formats" id="oth-2502.11051" aria-labelledby="oth-2502.11051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMUNLEARNER: Reformulating Multimodal Machine Unlearning in the Era of Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jiahao Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yibo Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xu Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yuanhuiyi Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xin Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhihua Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent progress in Machine Unlearning (MU) has introduced solutions for the selective removal of private or sensitive information encoded within deep neural networks. Nonetheless, MU for Multimodal Large Language Models (MLLMs) remains in its nascent phase. 
Therefore, we propose to reformulate the task of multimodal MU in the era of MLLMs, which aims to erase only the visual patterns associated with a given entity while preserving the corresponding textual knowledge encoded within the original parameters of the language model backbone. Furthermore, we develop a novel geometry-constrained gradient descent method MMUnlearner. It updates the weights of MLLMs with a weight saliency map jointly restricted by the remaining concepts and textual knowledge during unlearning, thereby preserving parameters essential for non-target knowledge. Extensive experiments demonstrate that MMUnlearner surpasses baselines that finetuning MLLMs with VQA data directly through Gradient Ascent (GA) or Negative Preference Optimization (NPO), across all evaluation dimensions. Our code will be released upon acceptance. </p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2502.11054" title="Abstract" id="2502.11054"> arXiv:2502.11054 </a> [<a href="/pdf/2502.11054" title="Download PDF" id="pdf-2502.11054" aria-labelledby="pdf-2502.11054">pdf</a>, <a href="https://arxiv.org/html/2502.11054v1" title="View HTML" id="html-2502.11054" aria-labelledby="html-2502.11054" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11054" title="Other formats" id="oth-2502.11054" aria-labelledby="oth-2502.11054">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning-Augmented Conversation for Multi-Turn Jailbreak Attacks on Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+Z">Zonghao Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Deyue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+Z">Zonglei Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yisong Xiao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Q">Quanchen Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aishan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+S">Siyuan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangzheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xianglong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+D">Dacheng Tao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> Multi-turn jailbreak attacks simulate real-world human interactions by engaging large language models (LLMs) in iterative dialogues, exposing critical safety vulnerabilities. However, existing methods often struggle to balance semantic coherence with attack effectiveness, resulting in either benign semantic drift or ineffective detection evasion. To address this challenge, we propose Reasoning-Augmented Conversation, a novel multi-turn jailbreak framework that reformulates harmful queries into benign reasoning tasks and leverages LLMs' strong reasoning capabilities to compromise safety alignment. Specifically, we introduce an attack state machine framework to systematically model problem translation and iterative reasoning, ensuring coherent query generation across multiple turns. Building on this framework, we design gain-guided exploration, self-play, and rejection feedback modules to preserve attack semantics, enhance effectiveness, and sustain reasoning-driven attack progression. Extensive experiments on multiple LLMs demonstrate that RACE achieves state-of-the-art attack effectiveness in complex conversational scenarios, with attack success rates (ASRs) increasing by up to 96%. 
Notably, our approach achieves ASRs of 82% and 92% against leading commercial models, OpenAI o1 and DeepSeek R1, underscoring its potency. We release our code at <a href="https://github.com/NY1024/RACE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> to facilitate further research in this critical domain. </p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2502.11061" title="Abstract" id="2502.11061"> arXiv:2502.11061 </a> [<a href="/pdf/2502.11061" title="Download PDF" id="pdf-2502.11061" aria-labelledby="pdf-2502.11061">pdf</a>, <a href="https://arxiv.org/html/2502.11061v1" title="View HTML" id="html-2502.11061" aria-labelledby="html-2502.11061" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11061" title="Other formats" id="oth-2502.11061" aria-labelledby="oth-2502.11061">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Déjà Vu? Decoding Repeated Reading from Eye Movements </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Meiri,+Y">Yoav Meiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hadar,+C+A">Cfir Avraham Hadar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nitzav,+A+K">Ariel Kreisberg Nitzav</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Be it your favorite novel, a newswire article, a cooking recipe or an academic paper -- in many daily situations we read the same text more than once. 
In this work, we ask whether it is possible to automatically determine whether the reader has previously encountered a text based on their eye movement patterns. We introduce two variants of this task and address them with considerable success using both feature-based and neural models. We further introduce a general strategy for enhancing these models with machine generated simulations of eye movements from a cognitive model. Finally, we present an analysis of model performance which on the one hand yields insights on the information used by the models, and on the other hand leverages predictive modeling as an analytic tool for better characterization of the role of memory in repeated reading. Our work advances the understanding of the extent and manner in which eye movements in reading capture memory effects from prior text exposure, and paves the way for future applications that involve predictive modeling of repeated reading. </p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2502.11062" title="Abstract" id="2502.11062"> arXiv:2502.11062 </a> [<a href="/pdf/2502.11062" title="Download PDF" id="pdf-2502.11062" aria-labelledby="pdf-2502.11062">pdf</a>, <a href="https://arxiv.org/html/2502.11062v1" title="View HTML" id="html-2502.11062" aria-labelledby="html-2502.11062" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11062" title="Other formats" id="oth-2502.11062" aria-labelledby="oth-2502.11062">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Similarity: A Gradient-based Graph Method for Instruction Tuning Data Selection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+L">Li Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+X">Xiao Ding</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ouyang,+Y">Yangou Ouyang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hepeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+K">Kai Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jinglong Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zhouhao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+D">Dongliang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qing,+Y">Yang Qing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dongchen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown great potential across various industries due to their remarkable ability to generalize through instruction tuning. However, the limited availability of domain-specific data significantly hampers their performance on specialized tasks. While existing methods primarily focus on selecting training data from general datasets that are similar to the target domain, they often fail to consider the joint distribution of instructions, resulting in inefficient learning and suboptimal knowledge transfer. To address these challenges, we introduce G2IS (Gradient-based Graph Instruction Selection), a novel method that constructs a mixed gradient-based instruction graph to capture the joint distribution and interdependencies between instructions. By accounting for the relationships between instructions, G2IS improves domain adaptation efficiency. 
Additionally, we propose a gradient walk algorithm to refine the data selection process, enhancing both training effectiveness and efficiency. Our experiments demonstrate that G2IS outperforms traditional methods across various domain adaptation tasks, yielding significant performance gains, particularly in complex, data-scarce scenarios. These results underscore the potential of G2IS in advancing the development of large, domain-specific models. </p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2502.11066" title="Abstract" id="2502.11066"> arXiv:2502.11066 </a> [<a href="/pdf/2502.11066" title="Download PDF" id="pdf-2502.11066" aria-labelledby="pdf-2502.11066">pdf</a>, <a href="https://arxiv.org/html/2502.11066v1" title="View HTML" id="html-2502.11066" aria-labelledby="html-2502.11066" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11066" title="Other formats" id="oth-2502.11066" aria-labelledby="oth-2502.11066">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CARMA: Enhanced Compositionality in LLMs via Advanced Regularisation and Mutual Information Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Aljaafari,+N">Nura Aljaafari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carvalho,+D+S">Danilo S. Carvalho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">André Freitas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) struggle with compositional generalisation, limiting their ability to systematically combine learned components to interpret novel inputs. 
While architectural modifications, fine-tuning, and data augmentation improve compositionality, they often have limited adaptability, face scalability constraints, or yield diminishing returns on real data. To address this, we propose CARMA, an intervention that enhances the stability and robustness of compositional reasoning in LLMs while preserving fine-tuned performance. CARMA employs mutual information regularisation and layer-wise stability constraints to mitigate feature fragmentation, ensuring structured representations persist across and within layers. We evaluate CARMA on inverse dictionary modelling and sentiment classification, measuring its impact on semantic consistency, performance stability, and robustness to lexical perturbations. Results show that CARMA reduces the variability introduced by fine-tuning, stabilises token representations, and improves compositional reasoning. While its effectiveness varies across architectures, CARMA's key strength lies in reinforcing learned structures rather than introducing new capabilities, making it a scalable auxiliary method. These findings suggest that integrating CARMA with fine-tuning can improve compositional generalisation while maintaining task-specific performance in LLMs. 
</p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2502.11073" title="Abstract" id="2502.11073"> arXiv:2502.11073 </a> [<a href="/pdf/2502.11073" title="Download PDF" id="pdf-2502.11073" aria-labelledby="pdf-2502.11073">pdf</a>, <a href="https://arxiv.org/html/2502.11073v1" title="View HTML" id="html-2502.11073" aria-labelledby="html-2502.11073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11073" title="Other formats" id="oth-2502.11073" aria-labelledby="oth-2502.11073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Demystifying Hateful Content: Leveraging Large Multimodal Models for Hateful Meme Detection with Explainable Decisions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hee,+M+S">Ming Shan Hee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+R+K">Roy Ka-Wei Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. Accepted at ICWSM'25 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hateful meme detection presents a significant challenge as a multimodal task due to the complexity of interpreting implicit hate messages and contextual cues within memes. Previous approaches have fine-tuned pre-trained vision-language models (PT-VLMs), leveraging the knowledge they gained during pre-training and their attention mechanisms to understand meme content. However, the reliance of these models on implicit knowledge and complex attention mechanisms renders their decisions difficult to explain, which is crucial for building trust in meme classification. In this paper, we introduce IntMeme, a novel framework that leverages Large Multimodal Models (LMMs) for hateful meme classification with explainable decisions. 
IntMeme addresses the dual challenges of improving both accuracy and explainability in meme moderation. The framework uses LMMs to generate human-like, interpretive analyses of memes, providing deeper insights into multimodal content and context. Additionally, it uses independent encoding modules for both memes and their interpretations, which are then combined to enhance classification performance. Our approach addresses the opacity and misclassification issues associated with PT-VLMs, optimizing the use of LMMs for hateful meme detection. We demonstrate the effectiveness of IntMeme through comprehensive experiments across three datasets, showcasing its superiority over state-of-the-art models. </p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2502.11075" title="Abstract" id="2502.11075"> arXiv:2502.11075 </a> [<a href="/pdf/2502.11075" title="Download PDF" id="pdf-2502.11075" aria-labelledby="pdf-2502.11075">pdf</a>, <a href="https://arxiv.org/html/2502.11075v1" title="View HTML" id="html-2502.11075" aria-labelledby="html-2502.11075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11075" title="Other formats" id="oth-2502.11075" aria-labelledby="oth-2502.11075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exposing Numeracy Gaps: A Benchmark to Evaluate Fundamental Numerical Abilities in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xuejia Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=XU,+Z">Zhanchao XU</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Darian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+N">Nicole Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fei Teng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Luyu Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C+J">Chen Jason Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Lei Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive capabilities in natural language processing tasks, such as text generation and semantic understanding. However, their performance on numerical reasoning tasks, such as basic arithmetic, numerical retrieval, and magnitude comparison, remains surprisingly poor. This gap arises from their reliance on surface-level statistical patterns rather than understanding numbers as continuous magnitudes. Existing benchmarks primarily focus on either linguistic competence or structured mathematical problem-solving, neglecting fundamental numerical reasoning required in real-world scenarios. To bridge this gap, we propose NumericBench, a comprehensive benchmark to evaluate six fundamental numerical capabilities: number recognition, arithmetic operations, contextual retrieval, comparison, summary, and logical reasoning. NumericBench includes datasets ranging from synthetic number lists to the crawled real-world data, addressing challenges like long contexts, noise, and multi-step reasoning. Extensive experiments on state-of-the-art LLMs, including GPT-4 and DeepSeek, reveal persistent weaknesses in numerical reasoning, highlighting the urgent need to improve numerically-aware language modeling. 
The benchmark is released in: <a href="https://github.com/TreeAI-Lab/NumericBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2502.11078" title="Abstract" id="2502.11078"> arXiv:2502.11078 </a> [<a href="/pdf/2502.11078" title="Download PDF" id="pdf-2502.11078" aria-labelledby="pdf-2502.11078">pdf</a>, <a href="https://arxiv.org/html/2502.11078v1" title="View HTML" id="html-2502.11078" aria-labelledby="html-2502.11078" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11078" title="Other formats" id="oth-2502.11078" aria-labelledby="oth-2502.11078">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DEEPER Insight into Your User: Directed Persona Refinement for Dynamic Persona Modeling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+A">Aili Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+C">Chengyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jiangjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinghan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yikai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Siyu Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zulong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liangyue Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yanghua Xiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To advance personalized applications such as recommendation systems and user behavior prediction, recent research increasingly adopts 
large language models (LLMs) for human-readable persona modeling. In dynamic real-world scenarios, effective persona modeling necessitates leveraging streaming behavior data to continually optimize user personas. However, existing methods — whether regenerating personas or incrementally extending them with new behaviors — often fail to achieve sustained improvements in persona quality or future behavior prediction accuracy. To address this, we propose DEEPER, a novel approach for dynamic persona modeling that enables continual persona optimization. Specifically, we enhance the model's direction-search capability through an iterative reinforcement learning framework, allowing it to automatically identify effective update directions and optimize personas using discrepancies between user behaviors and model predictions. Extensive experiments on dynamic persona modeling involving 4800 users across 10 domains highlight the superior persona optimization capabilities of DEEPER, delivering an impressive 32.2% average reduction in user behavior prediction error over four update rounds — outperforming the best baseline by a remarkable 22.92%. 
</p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2502.11083" title="Abstract" id="2502.11083"> arXiv:2502.11083 </a> [<a href="/pdf/2502.11083" title="Download PDF" id="pdf-2502.11083" aria-labelledby="pdf-2502.11083">pdf</a>, <a href="https://arxiv.org/html/2502.11083v1" title="View HTML" id="html-2502.11083" aria-labelledby="html-2502.11083" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11083" title="Other formats" id="oth-2502.11083" aria-labelledby="oth-2502.11083">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Streamlining the Collaborative Chain of Models into A Single Forward Pass in Generation-Based Tasks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yuanjie Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuhao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+T">Tong Xu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In Retrieval-Augmented Generation (RAG) and agent-based frameworks, the "Chain of Models" approach is widely used, where multiple specialized models work sequentially on distinct sub-tasks. This approach is effective but increases resource demands as each model must be deployed separately. Recent advancements attempt to address this by applying prompt tuning, which allows a shared base model to adapt to multiple tasks with minimal parameter changes. 
However, a key challenge remains: intermediate outputs, passed between models as plain text, require recomputation of hidden states (i.e., Key and Value (KV) states in Transformers) during inference. In this paper, we introduce FTHSS, a novel prompt-tuning method that enables models to share KV hidden states, eliminating redundant forward passes and reducing KV cache storage. By modifying input and attention masks during training, FTHSS allows models to effectively utilize KV hidden states from prior models in both single- and multi-round scenarios. Empirical results on four tasks show that FTHSS matches the performance of traditional model chains while improving inference efficiency. </p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2502.11084" title="Abstract" id="2502.11084"> arXiv:2502.11084 </a> [<a href="/pdf/2502.11084" title="Download PDF" id="pdf-2502.11084" aria-labelledby="pdf-2502.11084">pdf</a>, <a href="https://arxiv.org/html/2502.11084v1" title="View HTML" id="html-2502.11084" aria-labelledby="html-2502.11084" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11084" title="Other formats" id="oth-2502.11084" aria-labelledby="oth-2502.11084">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rewrite to Jailbreak: Discover Learnable and Transferable Implicit Harmfulness Instruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yuting Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+Y">Yifeng Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuang,+K">Kun Kuang</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 21 pages, 10 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As Large Language Models (LLMs) are widely applied in various domains, the safety of LLMs is increasingly attracting attention to avoid their powerful capabilities being misused. Existing jailbreak methods create a forced instruction-following scenario, or search adversarial prompts with prefix or suffix tokens to achieve a specific representation manually or automatically. However, they suffer from low efficiency and explicit jailbreak patterns, far from the real deployment of mass attacks to LLMs. In this paper, we point out that simply rewriting the original instruction can achieve a jailbreak, and we find that this rewriting approach is learnable and transferable. We propose the Rewrite to Jailbreak (R2J) approach, a transferable black-box jailbreak method to attack LLMs by iteratively exploring the weakness of the LLMs and automatically improving the attacking strategy. The jailbreak is more efficient and hard to identify since no additional features are introduced. Extensive experiments and analysis demonstrate the effectiveness of R2J, and we find that the jailbreak is also transferable to multiple datasets and various types of models with only a few queries. We hope our work motivates further investigation of LLM safety. 
</p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2502.11089" title="Abstract" id="2502.11089"> arXiv:2502.11089 </a> [<a href="/pdf/2502.11089" title="Download PDF" id="pdf-2502.11089" aria-labelledby="pdf-2502.11089">pdf</a>, <a href="https://arxiv.org/html/2502.11089v1" title="View HTML" id="html-2502.11089" aria-labelledby="html-2502.11089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11089" title="Other formats" id="oth-2502.11089" aria-labelledby="oth-2502.11089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jingyang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Huazuo Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+D">Damai Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Junyu Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhengyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhenda Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y+X">Y. X. 
Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lean Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Zhiping Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuqing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+C">Chong Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Ming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+W">Wenfeng Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+W">Wangding Zeng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection to preserve both global context awareness and local precision. Our approach advances sparse attention design with two key innovations: (1) We achieve substantial speedups through arithmetic intensity-balanced algorithm design, with implementation optimizations for modern hardware. (2) We enable end-to-end training, reducing pretraining computation without sacrificing model performance. 
As shown in Figure 1, experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning. Meanwhile, NSA achieves substantial speedups over Full Attention on 64k-length sequences across decoding, forward propagation, and backward propagation, validating its efficiency throughout the model lifecycle. </p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2502.11090" title="Abstract" id="2502.11090"> arXiv:2502.11090 </a> [<a href="/pdf/2502.11090" title="Download PDF" id="pdf-2502.11090" aria-labelledby="pdf-2502.11090">pdf</a>, <a href="https://arxiv.org/html/2502.11090v1" title="View HTML" id="html-2502.11090" aria-labelledby="html-2502.11090" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11090" title="Other formats" id="oth-2502.11090" aria-labelledby="oth-2502.11090">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+H">Hongye Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+S">Sijia Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Z">Ziyue Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Z">Zhixin Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Z">Zhe Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+M">Meng Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Fan Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Boyan Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+T">Tianpei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jing Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fanyu Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+C">Chao Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Junlan Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM's capability of identifying and handling unsafe information in detail. To address these issues, we propose a fine-grained benchmark SafeDialBench for evaluating the safety of LLMs across various jailbreak attacks in multi-turn dialogues. Specifically, we design a two-tier hierarchical safety taxonomy that considers 6 safety dimensions and generates more than 4000 multi-turn dialogues in both Chinese and English under 22 dialogue scenarios. We employ 7 jailbreak attack strategies, such as reference attack and purpose reverse, to enhance the dataset quality for dialogue generation. Notably, we construct an innovative assessment framework of LLMs, measuring capabilities in detecting, and handling unsafe information and maintaining consistency when facing jailbreak attacks. 
Experimental results across 17 LLMs reveal that Yi-34B-Chat and GLM4-9B-Chat demonstrate superior safety performance, while Llama3.1-8B-Instruct and o3-mini exhibit safety vulnerabilities. </p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2502.11095" title="Abstract" id="2502.11095"> arXiv:2502.11095 </a> [<a href="/pdf/2502.11095" title="Download PDF" id="pdf-2502.11095" aria-labelledby="pdf-2502.11095">pdf</a>, <a href="https://arxiv.org/html/2502.11095v1" title="View HTML" id="html-2502.11095" aria-labelledby="html-2502.11095" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11095" title="Other formats" id="oth-2502.11095" aria-labelledby="oth-2502.11095">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of Large Language Models in Psychotherapy: Current Landscape and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Na,+H">Hongbin Na</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yining Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zimu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+T">Tao Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Beibei Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lilin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Torous,+J">John Torous</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Ling Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Mental health remains a 
critical global challenge, with increasing demand for accessible, effective interventions. Large language models (LLMs) offer promising solutions in psychotherapy by enhancing the assessment, diagnosis, and treatment of mental health conditions through dynamic, context-aware interactions. This survey provides a comprehensive overview of the current landscape of LLM applications in psychotherapy, highlighting the roles of LLMs in symptom detection, severity estimation, cognitive assessment, and therapeutic interventions. We present a novel conceptual taxonomy to organize the psychotherapy process into three core components: assessment, diagnosis, and treatment, and examine the challenges and advancements in each area. The survey also addresses key research gaps, including linguistic biases, limited disorder coverage, and underrepresented therapeutic models. Finally, we discuss future directions to integrate LLMs into a holistic, end-to-end psychotherapy framework, addressing the evolving nature of mental health conditions and fostering more inclusive, personalized care. 
</p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2502.11100" title="Abstract" id="2502.11100"> arXiv:2502.11100 </a> [<a href="/pdf/2502.11100" title="Download PDF" id="pdf-2502.11100" aria-labelledby="pdf-2502.11100">pdf</a>, <a href="https://arxiv.org/html/2502.11100v1" title="View HTML" id="html-2502.11100" aria-labelledby="html-2502.11100" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11100" title="Other formats" id="oth-2502.11100" aria-labelledby="oth-2502.11100">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Achieving Concept Completeness for Unsupervised Textual Concept Bottleneck Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhan,+M">Milan Bhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choho,+Y">Yann Choho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Moreau,+P">Pierre Moreau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vittaut,+J">Jean-Noel Vittaut</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chesneau,+N">Nicolas Chesneau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lesot,+M">Marie-Jeanne Lesot</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Textual Concept Bottleneck Models (TBMs) are interpretable-by-design models for text classification that predict a set of salient concepts before making the final prediction. This paper proposes Complete Textual Concept Bottleneck Model (CT-CBM),a novel TCBM generator building concept labels in a fully unsupervised manner using a small language model, eliminating both the need for predefined human labeled concepts and LLM annotations. 
CT-CBM iteratively targets and adds important concepts in the bottleneck layer to create a complete concept basis and addresses downstream classification leakage through a parallel residual connection. CT-CBM achieves good results against competitors, offering a promising solution to enhance interpretability of NLP classifiers without sacrificing performance. </p> </div> </dd> <dt> <a name='item58'>[58]</a> <a href ="/abs/2502.11101" title="Abstract" id="2502.11101"> arXiv:2502.11101 </a> [<a href="/pdf/2502.11101" title="Download PDF" id="pdf-2502.11101" aria-labelledby="pdf-2502.11101">pdf</a>, <a href="https://arxiv.org/html/2502.11101v1" title="View HTML" id="html-2502.11101" aria-labelledby="html-2502.11101" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11101" title="Other formats" id="oth-2502.11101" aria-labelledby="oth-2502.11101">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CacheFocus: Dynamic Cache Re-Positioning for Efficient Retrieval-Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+K">Kun-Hui Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+E">Eunhwan Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+D">Donghoon Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Na,+S">Seung-Hoon Na</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages (Work in progress) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) excel across a variety of language tasks yet are constrained by limited input lengths and high computational costs. 
Existing approaches—such as relative positional encodings (e.g., RoPE, ALiBi) and sliding window mechanisms—partially alleviate these issues but often require additional training or suffer from performance degradation with longer inputs. In this paper, we introduce \textbf{\textit{CacheFocus}}, a method that enhances length normalization and reduces inference latency without any further training. Our approach leverages query-independent, offline caching to efficiently reuse a Context KV Cache Store. We address the amplification of abnormal token distributions problem by re-positioning cached keys and introducing Layer-Adaptive Cache Pruning to discard low-relevance caches during pre-filling. Additionally, our Adaptive Positional Allocation Strategy dynamically reassigns cache positions to maximize the use of the available positional encoding range. Experiments on the Natural Questions and TriviaQA datasets demonstrate that CacheFocus outperforms alternative methods even when inputs exceed the $4$K limit of the \texttt{LLaMA-2} model, emphasizing its practical effectiveness for long-context LLMs. Moreover, even with large maximum input length of \texttt{Qwen2}, the performance of CacheFocus shows that it maintains consistent performance even as the number of documents increases, effectively managing long-text generation without degradation. 
</p> </div> </dd> <dt> <a name='item59'>[59]</a> <a href ="/abs/2502.11104" title="Abstract" id="2502.11104"> arXiv:2502.11104 </a> [<a href="/pdf/2502.11104" title="Download PDF" id="pdf-2502.11104" aria-labelledby="pdf-2502.11104">pdf</a>, <a href="https://arxiv.org/html/2502.11104v1" title="View HTML" id="html-2502.11104" aria-labelledby="html-2502.11104" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11104" title="Other formats" id="oth-2502.11104" aria-labelledby="oth-2502.11104">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Cross-Tokenizer Knowledge Distillation with Contextual Dynamical Mapping </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yijie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yijin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fandong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yufeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The code is available at <a href="https://github.com/pppa2019/ContexualDynamicMapping" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Knowledge Distillation (KD) has emerged as a prominent technique for model compression. However, conventional KD approaches primarily focus on homogeneous architectures with identical tokenizers, constraining their applicability in cross-architecture scenarios. 
As for the cross-tokenizer KD, the differences in the tokenizers give rise to two fundamental challenges: (1) sequence misalignment caused by divergent tokenization strategies, and (2) mismatched vocabulary size and composition. While existing probability-matching methods attempt to address these issues, their efficacy remains limited due to suboptimal alignment in both the sequence and vocabulary aspects. To overcome these limitations, we propose Contextual Dynamic Mapping (CDM), a novel cross-tokenizer distillation framework that employs contextual information to enhance sequence alignment precision and dynamically improves vocabulary mapping. We evaluated the effectiveness of our approach across five advanced and widely-used model families (i.e, LLama3, Phi3, Gemma2, OPT and Qwen2), which were configured into three distinct teacher-student pairs. Our method shows significant advantages over existing cross-tokenizer distillation baselines across diverse benchmarks, including instruction-following, code generation and math. Notably, our analysis reveals that combining conventional same-tokenizer distillation and cross-tokenizer distillation through CDM yields further performance improvements. 
The code is available at <a href="https://github.com/pppa2019/ContexualDynamicMapping" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item60'>[60]</a> <a href ="/abs/2502.11108" title="Abstract" id="2502.11108"> arXiv:2502.11108 </a> [<a href="/pdf/2502.11108" title="Download PDF" id="pdf-2502.11108" aria-labelledby="pdf-2502.11108">pdf</a>, <a href="https://arxiv.org/html/2502.11108v1" title="View HTML" id="html-2502.11108" aria-labelledby="html-2502.11108" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11108" title="Other formats" id="oth-2502.11108" aria-labelledby="oth-2502.11108">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Knowledge Graph-Driven Retrieval-Augmented Generation: Integrating Deepseek-R1 with Weaviate for Advanced Chatbot Applications </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lecu,+A">Alexandru Lecu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Groza,+A">Adrian Groza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hawizy,+L">Lezan Hawizy</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have significantly advanced the field of natural language generation. However, they frequently generate unverified outputs, which compromises their reliability in critical applications. In this study, we propose an innovative framework that combines structured biomedical knowledge with LLMs through a retrieval-augmented generation technique. Our system develops a thorough knowledge graph by identifying and refining causal relationships and named entities from medical abstracts related to age-related macular degeneration (AMD). 
Using a vector-based retrieval process and a locally deployed language model, our framework produces responses that are both contextually relevant and verifiable, with direct references to clinical evidence. Experimental results show that this method notably decreases hallucinations, enhances factual precision, and improves the clarity of generated responses, providing a robust solution for advanced biomedical chatbot applications. </p> </div> </dd> <dt> <a name='item61'>[61]</a> <a href ="/abs/2502.11113" title="Abstract" id="2502.11113"> arXiv:2502.11113 </a> [<a href="/pdf/2502.11113" title="Download PDF" id="pdf-2502.11113" aria-labelledby="pdf-2502.11113">pdf</a>, <a href="https://arxiv.org/html/2502.11113v1" title="View HTML" id="html-2502.11113" aria-labelledby="html-2502.11113" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11113" title="Other formats" id="oth-2502.11113" aria-labelledby="oth-2502.11113">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Valuable Hallucinations: Realizable Non-realistic Propositions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiucheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces the first formal definition of valuable hallucinations in large language models (LLMs), addressing a gap in the existing literature. We provide a systematic definition and analysis of hallucination value, proposing methods for enhancing the value of hallucinations. 
In contrast to previous works, which often treat hallucinations as a broad flaw, we focus on the potential value that certain types of hallucinations can offer in specific contexts. Hallucinations in LLMs generally refer to the generation of unfaithful, fabricated, inconsistent, or nonsensical content. Rather than viewing all hallucinations negatively, this paper gives formal representations and manual judgments of "valuable hallucinations" and explores how realizable non-realistic propositions—ideas that are not currently true but could be achievable under certain conditions—can have constructive value. We present experiments using the Qwen2.5 model and HalluQA dataset, employing ReAct prompting (which involves reasoning, confidence assessment, and answer verification) to control and optimize hallucinations. Our findings show that ReAct prompting results in a reduction in overall hallucinations and an increase in the proportion of valuable hallucinations. These results demonstrate that systematically controlling hallucinations can improve their usefulness without compromising factual reliability. 
</p> </div> </dd> <dt> <a name='item62'>[62]</a> <a href ="/abs/2502.11114" title="Abstract" id="2502.11114"> arXiv:2502.11114 </a> [<a href="/pdf/2502.11114" title="Download PDF" id="pdf-2502.11114" aria-labelledby="pdf-2502.11114">pdf</a>, <a href="https://arxiv.org/html/2502.11114v1" title="View HTML" id="html-2502.11114" aria-labelledby="html-2502.11114" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11114" title="Other formats" id="oth-2502.11114" aria-labelledby="oth-2502.11114">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Pairwise: Global Zero-shot Temporal Graph Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Eirew,+A">Alon Eirew</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bar,+K">Kfir Bar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dagan,+I">Ido Dagan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Temporal relation extraction (TRE) is a fundamental task in natural language processing (NLP) that involves identifying the temporal relationships between events in a document. Despite the advances in large language models (LLMs), their application to TRE remains limited. Most existing approaches rely on pairwise classification, in which event pairs are considered individually, leading to computational inefficiency and a lack of global consistency in the resulting temporal graph. In this work, we propose a novel zero-shot method for TRE that generates a document's complete temporal graph at once, then applies transitive constraints optimization to refine predictions and enforce temporal consistency across relations. 
Additionally, we introduce OmniTemp, a new dataset with complete annotations for all pairs of targeted events within a document. Through experiments and analyses, we demonstrate that our method significantly outperforms existing zero-shot approaches while achieving competitive performance with supervised models. </p> </div> </dd> <dt> <a name='item63'>[63]</a> <a href ="/abs/2502.11115" title="Abstract" id="2502.11115"> arXiv:2502.11115 </a> [<a href="/pdf/2502.11115" title="Download PDF" id="pdf-2502.11115" aria-labelledby="pdf-2502.11115">pdf</a>, <a href="https://arxiv.org/html/2502.11115v1" title="View HTML" id="html-2502.11115" aria-labelledby="html-2502.11115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11115" title="Other formats" id="oth-2502.11115" aria-labelledby="oth-2502.11115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Generative Models Underconfident? An Embarrassingly Simple Quality Estimation Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dinh,+T+A">Tu Anh Dinh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niehues,+J">Jan Niehues</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Quality Estimation (QE) is estimating the quality of model output when the ground truth reference is not available. Looking at model uncertainty from its own output probabilities is the most trivial and low-effort way to estimate the output quality. However, for generative model, output probabilities might not be the best quality estimator. At an output step, there can be multiple correct options, making the probability distribution spread out more. Thus, lower token probability does not necessarily mean lower output quality. 
In other words, the model can be considered underconfident. In this paper, we propose a QE approach called Dominant Mass Probability (DMP), that boosts the model confidence in cases where there are multiple viable output options. We show that, with no increase in complexity, DMP is notably better than sequence probability when estimating the quality of different models (Whisper, Llama, etc.) on different tasks (translation, summarization, etc.). Compared to sequence probability, DMP achieves on average +0.208 improvement in Pearson correlation to ground-truth quality. </p> </div> </dd> <dt> <a name='item64'>[64]</a> <a href ="/abs/2502.11116" title="Abstract" id="2502.11116"> arXiv:2502.11116 </a> [<a href="/pdf/2502.11116" title="Download PDF" id="pdf-2502.11116" aria-labelledby="pdf-2502.11116">pdf</a>, <a href="https://arxiv.org/html/2502.11116v1" title="View HTML" id="html-2502.11116" aria-labelledby="html-2502.11116" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11116" title="Other formats" id="oth-2502.11116" aria-labelledby="oth-2502.11116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Gumbel Reranking: Differentiable End-to-End Reranker Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Siyuan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zhiyuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+J">Jintao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+C">Changhua Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weiqiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leng,+J">Jingwen Leng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+M">Minyi Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zhouhan Lin</a></div> 
<div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> RAG systems rely on rerankers to identify relevant documents. However, fine-tuning these models remains challenging due to the scarcity of annotated query-document pairs. Existing distillation-based approaches suffer from training-inference misalignment and fail to capture interdependencies among candidate documents. To overcome these limitations, we reframe the reranking process as an attention-mask problem and propose Gumbel Reranking, an end-to-end training framework for rerankers aimed at minimizing the training-inference gap. In our approach, reranker optimization is reformulated as learning a stochastic, document-wise Top-$k$ attention mask using the Gumbel Trick and Relaxed Top-$k$ Sampling. This formulation enables end-to-end optimization by minimizing the overall language loss. Experiments across various settings consistently demonstrate performance gains, including a 10.4\% improvement in recall on HotpotQA for distinguishing indirectly relevant documents. 
</p> </div> </dd> <dt> <a name='item65'>[65]</a> <a href ="/abs/2502.11123" title="Abstract" id="2502.11123"> arXiv:2502.11123 </a> [<a href="/pdf/2502.11123" title="Download PDF" id="pdf-2502.11123" aria-labelledby="pdf-2502.11123">pdf</a>, <a href="https://arxiv.org/html/2502.11123v1" title="View HTML" id="html-2502.11123" aria-labelledby="html-2502.11123" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11123" title="Other formats" id="oth-2502.11123" aria-labelledby="oth-2502.11123">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DuplexMamba: Enhancing Real-time Speech Conversations with Duplex and Streaming Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xiangyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hongyun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haiyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Conghui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Real-time speech conversation is essential for natural and efficient human-machine interactions, requiring duplex and streaming capabilities. Traditional Transformer-based conversational chatbots operate in a turn-based manner and exhibit quadratic computational complexity that grows as the input size increases. 
In this paper, we propose DuplexMamba, a Mamba-based end-to-end multimodal duplex model for speech-to-text conversation. DuplexMamba enables simultaneous input processing and output generation, dynamically adjusting to support real-time streaming. Specifically, we develop a Mamba-based speech encoder and adapt it with a Mamba-based language model. Furthermore, we introduce a novel duplex decoding strategy that enables DuplexMamba to process input and generate output simultaneously. Experimental results demonstrate that DuplexMamba successfully implements duplex and streaming capabilities while achieving performance comparable to several recently developed Transformer-based models in automatic speech recognition (ASR) tasks and voice assistant benchmark evaluations. </p> </div> </dd> <dt> <a name='item66'>[66]</a> <a href ="/abs/2502.11128" title="Abstract" id="2502.11128"> arXiv:2502.11128 </a> [<a href="/pdf/2502.11128" title="Download PDF" id="pdf-2502.11128" aria-labelledby="pdf-2502.11128">pdf</a>, <a href="https://arxiv.org/html/2502.11128v1" title="View HTML" id="html-2502.11128" aria-labelledby="html-2502.11128" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11128" title="Other formats" id="oth-2502.11128" aria-labelledby="oth-2502.11128">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shujie Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+L">Lingwei Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jinyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yifan Yang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shiwan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haiyang Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yanqing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haoqin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yong Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> To advance continuous-valued token modeling and temporal-coherence enforcement, we propose FELLE, an autoregressive model that integrates language modeling with token-wise flow matching. By leveraging the autoregressive nature of language models and the generative efficacy of flow matching, FELLE effectively predicts continuous-valued tokens (mel-spectrograms). For each continuous-valued token, FELLE modifies the general prior distribution in flow matching by incorporating information from the previous step, improving coherence and stability. Furthermore, to enhance synthesis quality, FELLE introduces a coarse-to-fine flow-matching mechanism, generating continuous-valued tokens hierarchically, conditioned on the language model's output. Experimental results demonstrate the potential of incorporating flow-matching techniques in autoregressive mel-spectrogram modeling, leading to significant improvements in TTS generation quality, as shown in <a href="https://aka.ms/felle" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item67'>[67]</a> <a href ="/abs/2502.11131" title="Abstract" id="2502.11131"> arXiv:2502.11131 </a> [<a href="/pdf/2502.11131" title="Download PDF" id="pdf-2502.11131" aria-labelledby="pdf-2502.11131">pdf</a>, <a href="https://arxiv.org/html/2502.11131v1" title="View HTML" id="html-2502.11131" aria-labelledby="html-2502.11131" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11131" title="Other formats" id="oth-2502.11131" aria-labelledby="oth-2502.11131">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Similar Case Retrieval Ranking Performance By Revisiting RankSVM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Y">Yan Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Given the rapid development of Legal AI, a lot of attention has been paid to one of the most important legal AI tasks--similar case retrieval, especially with language models to use. In our paper, however, we try to improve the ranking performance of current models from the perspective of learning to rank instead of language models. Specifically, we conduct experiments using a pairwise method--RankSVM as the classifier to substitute a fully connected layer, combined with commonly used language models on similar case retrieval datasets LeCaRDv1 and LeCaRDv2. We finally come to the conclusion that RankSVM could generally help improve the retrieval performance on the LeCaRDv1 and LeCaRDv2 datasets compared with original classifiers by optimizing the precise ranking. It could also help mitigate overfitting owing to class imbalance. 
Our code is available in <a href="https://github.com/liuyuqi123study/RankSVM_for_SLR" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item68'>[68]</a> <a href ="/abs/2502.11137" title="Abstract" id="2502.11137"> arXiv:2502.11137 </a> [<a href="/pdf/2502.11137" title="Download PDF" id="pdf-2502.11137" aria-labelledby="pdf-2502.11137">pdf</a>, <a href="https://arxiv.org/html/2502.11137v1" title="View HTML" id="html-2502.11137" aria-labelledby="html-2502.11137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11137" title="Other formats" id="oth-2502.11137" aria-labelledby="oth-2502.11137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safety Evaluation of DeepSeek Models in Chinese Contexts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenjing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+X">Xuejiao Lei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhaoxiang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Ning Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Long,+Z">Zhenhong Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+P">Peijun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jiaojiao Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+M">Minjie Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chaoyang Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Kai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+S">Shiguo Lian</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; 
Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recently, the DeepSeek series of models, leveraging their exceptional reasoning capabilities and open-source strategy, is reshaping the global AI landscape. Despite these advantages, they exhibit significant safety deficiencies. Research conducted by Robust Intelligence, a subsidiary of Cisco, in collaboration with the University of Pennsylvania, revealed that DeepSeek-R1 has a 100\% attack success rate when processing harmful prompts. Additionally, multiple safety companies and research institutions have confirmed critical safety vulnerabilities in this model. As models demonstrating robust performance in Chinese and English, DeepSeek models require equally crucial safety assessments in both language contexts. However, current research has predominantly focused on safety evaluations in English environments, leaving a gap in comprehensive assessments of their safety performance in Chinese contexts. In response to this gap, this study introduces CHiSafetyBench, a Chinese-specific safety evaluation benchmark. This benchmark systematically evaluates the safety of DeepSeek-R1 and DeepSeek-V3 in Chinese contexts, revealing their performance across safety categories. The experimental results quantify the deficiencies of these two models in Chinese contexts, providing key insights for subsequent improvements. 
</p> </div> </dd> <dt> <a name='item69'>[69]</a> <a href ="/abs/2502.11150" title="Abstract" id="2502.11150"> arXiv:2502.11150 </a> [<a href="/pdf/2502.11150" title="Download PDF" id="pdf-2502.11150" aria-labelledby="pdf-2502.11150">pdf</a>, <a href="https://arxiv.org/html/2502.11150v1" title="View HTML" id="html-2502.11150" aria-labelledby="html-2502.11150" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11150" title="Other formats" id="oth-2502.11150" aria-labelledby="oth-2502.11150">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Surprisal Takes It All: Eye Tracking Based Cognitive Evaluation of Text Readability Measures </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Klein,+K+G">Keren Gruteke Klein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Frenkel,+S">Shachar Frenkel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text readability measures are widely used in many real-world scenarios and in NLP. These measures have primarily been developed by predicting reading comprehension outcomes, while largely neglecting what is perhaps the core aspect of a readable text: reading ease. In this work, we propose a new eye tracking based methodology for evaluating readability measures, which focuses on their ability to account for reading facilitation effects in text simplification, as well as for text reading ease more broadly. Using this approach, we find that existing readability formulas are moderate to poor predictors of reading ease. 
We further find that average per-word length, frequency, and especially surprisal tend to outperform existing readability formulas as measures of reading ease. We thus propose surprisal as a simple unsupervised alternative to existing measures. </p> </div> </dd> <dt> <a name='item70'>[70]</a> <a href ="/abs/2502.11169" title="Abstract" id="2502.11169"> arXiv:2502.11169 </a> [<a href="/pdf/2502.11169" title="Download PDF" id="pdf-2502.11169" aria-labelledby="pdf-2502.11169">pdf</a>, <a href="https://arxiv.org/html/2502.11169v1" title="View HTML" id="html-2502.11169" aria-labelledby="html-2502.11169" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11169" title="Other formats" id="oth-2502.11169" aria-labelledby="oth-2502.11169">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Constrained Monte Carlo Tree Search to Generate Reliable Long Chain-of-Thought for Mathematical Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qingwen Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Boyan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zijian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhifeng Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Keli Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+R">Ruichu Cai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, Long Chain-of-Thoughts (CoTs) have gained widespread attention for improving the reasoning capabilities of Large Language Models (LLMs). This necessitates that existing LLMs, which lack the ability to generate Long CoTs, acquire such capability through post-training methods. 
Without additional training, LLMs typically enhance their mathematical reasoning abilities through inference scaling methods such as MCTS. However, they are hindered by the large action space and inefficient search strategies, making it challenging to generate Long CoTs effectively. To tackle this issue, we propose constraining the action space and guiding the emergence of Long CoTs through a refined search strategy. In our proposed Constrained Monte Carlo Tree Search (C-MCTS) framework, we limit the actions selected from a constrained action space, which is divided into five disjoint subsets: \emph{understanding}, \emph{planning}, \emph{reflection}, \emph{coding}, and \emph{summary}. Each subset is further constrained to a small number of predefined prompts, rather than allowing LLMs to generate actions arbitrarily. Additionally, we refine the search strategy by incorporating prior knowledge about the action sets, such as a human-like partial order of the action subsets and the pretrained process reward models. These strategies work together to significantly reduce the vast search space of Long CoTs. Extensive evaluations on mathematical reasoning benchmarks show that, under zero-shot settings, our method enables the 7B model to achieve reasoning capabilities that surpass those of the 72B model. 
</p> </div> </dd> <dt> <a name='item71'>[71]</a> <a href ="/abs/2502.11175" title="Abstract" id="2502.11175"> arXiv:2502.11175 </a> [<a href="/pdf/2502.11175" title="Download PDF" id="pdf-2502.11175" aria-labelledby="pdf-2502.11175">pdf</a>, <a href="https://arxiv.org/html/2502.11175v1" title="View HTML" id="html-2502.11175" aria-labelledby="html-2502.11175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11175" title="Other formats" id="oth-2502.11175" aria-labelledby="oth-2502.11175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Investigating Language Preference of Multilingual RAG Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+J">Jeonghyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 16 tables, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual Retrieval-Augmented Generation (mRAG) systems enhance language models by integrating external multilingual information to produce context-aware responses. However, mRAG systems struggle with retrieving relevant information due to linguistic variations between queries and documents, generating inconsistent responses when multilingual sources conflict. In this work, we systematically investigate language preferences in both retrieval and generation of mRAG through a series of experiments. Our analysis indicates that retrievers tend to prefer high-resource and query languages, yet this preference does not consistently improve generation performance. Moreover, we observe that generators prefer the query language or Latin scripts, leading to inconsistent outputs. 
To overcome these issues, we propose Dual Knowledge Multilingual RAG (DKM-RAG), a simple yet effective framework that fuses translated multilingual passages with complementary model knowledge. Empirical results demonstrate that DKM-RAG mitigates language preference in generation and enhances performance across diverse linguistic settings. </p> </div> </dd> <dt> <a name='item72'>[72]</a> <a href ="/abs/2502.11176" title="Abstract" id="2502.11176"> arXiv:2502.11176 </a> [<a href="/pdf/2502.11176" title="Download PDF" id="pdf-2502.11176" aria-labelledby="pdf-2502.11176">pdf</a>, <a href="/format/2502.11176" title="Other formats" id="oth-2502.11176" aria-labelledby="oth-2502.11176">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LogiDynamics: Unraveling the Dynamics of Logical Inference in Large Language Model Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+T">Tianshi Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+J">Jiayang Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chunyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+H">Haochen Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zihao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+J">Jiaxin Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yangqiu Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+G+Y">Ginny Y. 
Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=See,+S">Simon See</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Modern large language models (LLMs) employ various forms of logical inference, both implicitly and explicitly, when addressing reasoning tasks. Understanding how to optimally leverage these inference paradigms is critical for advancing LLMs' reasoning capabilities. This paper adopts an exploratory approach by introducing a controlled evaluation environment for analogical reasoning -- a fundamental cognitive task -- that is systematically parameterized across three dimensions: modality (textual, visual, symbolic), difficulty (easy, medium, hard), and task format (multiple-choice or free-text generation). We analyze the comparative dynamics of inductive, abductive, and deductive inference pipelines across these dimensions, and demonstrate that our findings generalize to broader in-context learning tasks. Additionally, we investigate advanced paradigms such as hypothesis selection, verification, and refinement, revealing their potential to scale up logical inference in LLM reasoning. This exploratory study provides a foundation for future research in enhancing LLM reasoning through systematic logical inference strategies. 
</p> </div> </dd> <dt> <a name='item73'>[73]</a> <a href ="/abs/2502.11177" title="Abstract" id="2502.11177"> arXiv:2502.11177 </a> [<a href="/pdf/2502.11177" title="Download PDF" id="pdf-2502.11177" aria-labelledby="pdf-2502.11177">pdf</a>, <a href="/format/2502.11177" title="Other formats" id="oth-2502.11177" aria-labelledby="oth-2502.11177">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Mirage of Model Editing: Revisiting Evaluation in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+W">Wanli Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+F">Fei Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Jiajun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+X">Xinyu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Q">Qi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Dawei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite near-perfect results in artificial evaluations, the effectiveness of model editing in real-world applications remains unexplored. To bridge this gap, we propose to study model editing in question answering (QA) by establishing a rigorous evaluation practice to assess the effectiveness of editing methods in correcting LLMs' errors. It consists of QAEdit, a new benchmark derived from popular QA datasets, and a standardized evaluation framework. Our single editing experiments indicate that current editing methods perform substantially worse than previously reported (38.5% vs. ~96%). 
Through module analysis and controlled experiments, we demonstrate that this performance decline stems from issues in evaluation practices of prior editing research. One key issue is that the inappropriate use of teacher forcing in testing prevents error propagation by feeding ground truth tokens (inaccessible in real-world scenarios) as input. Furthermore, we simulate real-world deployment by sequential editing, revealing that current approaches fail drastically with only 1000 edits. Our analysis provides a fundamental reexamination of both the real-world applicability of existing model editing methods and their evaluation practices, and establishes a rigorous evaluation framework with key insights to advance reliable and practical model editing research. </p> </div> </dd> <dt> <a name='item74'>[74]</a> <a href ="/abs/2502.11183" title="Abstract" id="2502.11183"> arXiv:2502.11183 </a> [<a href="/pdf/2502.11183" title="Download PDF" id="pdf-2502.11183" aria-labelledby="pdf-2502.11183">pdf</a>, <a href="https://arxiv.org/html/2502.11183v1" title="View HTML" id="html-2502.11183" aria-labelledby="html-2502.11183" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11183" title="Other formats" id="oth-2502.11183" aria-labelledby="oth-2502.11183">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Don't Get Lost in the Trees: Streamlining LLM Reasoning by Overcoming Tree Search Exploration Pitfalls </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+A">Ante Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linfeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dian Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Haitao Mi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+X">Xiangyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhaopeng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinsong Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in tree search algorithms guided by verifiers have significantly enhanced the reasoning capabilities of large language models (LLMs), but at the cost of increased computational resources. In this work, we identify two key challenges contributing to this inefficiency: $\textit{over-exploration}$ due to redundant states with semantically equivalent content, and $\textit{under-exploration}$ caused by high variance in verifier scoring leading to frequent trajectory switching. To address these issues, we propose FETCH, an e$\textbf{f}$fici$\textbf{e}$nt $\textbf{t}$ree sear$\textbf{ch}$ framework, which is a flexible, plug-and-play system compatible with various tree search algorithms. Our framework mitigates over-exploration by merging semantically similar states using agglomerative clustering of text embeddings obtained from a fine-tuned SimCSE model. To tackle under-exploration, we enhance verifiers by incorporating temporal difference learning with adjusted $\lambda$-returns during training to reduce variance, and employing a verifier ensemble to aggregate scores during inference. Experiments on GSM8K, GSM-Plus, and MATH datasets demonstrate that our methods significantly improve reasoning accuracy and computational efficiency across four different tree search algorithms, paving the way for more practical applications of LLM-based reasoning. The code will be released upon acceptance. 
</p> </div> </dd> <dt> <a name='item75'>[75]</a> <a href ="/abs/2502.11184" title="Abstract" id="2502.11184"> arXiv:2502.11184 </a> [<a href="/pdf/2502.11184" title="Download PDF" id="pdf-2502.11184" aria-labelledby="pdf-2502.11184">pdf</a>, <a href="https://arxiv.org/html/2502.11184v1" title="View HTML" id="html-2502.11184" aria-labelledby="html-2502.11184" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11184" title="Other formats" id="oth-2502.11184" aria-labelledby="oth-2502.11184">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can't See the Forest for the Trees: Benchmarking Multimodal Safety Awareness for Multimodal LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+K">Kuiyi Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Youliang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+P">Pinjia He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhaopeng Tu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) have expanded the capabilities of traditional language models by enabling interaction through both text and images. 
However, ensuring the safety of these models remains a significant challenge, particularly in accurately identifying whether multimodal content is safe or unsafe — a capability we term safety awareness. In this paper, we introduce MMSafeAware, the first comprehensive multimodal safety awareness benchmark designed to evaluate MLLMs across 29 safety scenarios with 1500 carefully curated image-prompt pairs. MMSafeAware includes both unsafe and over-safety subsets to assess models' abilities to correctly identify unsafe content and avoid over-sensitivity that can hinder helpfulness. Evaluating nine widely used MLLMs using MMSafeAware reveals that current models are not sufficiently safe and often overly sensitive; for example, GPT-4V misclassifies 36.1% of unsafe inputs as safe and 59.9% of benign inputs as unsafe. We further explore three methods to improve safety awareness — prompting-based approaches, visual contrastive decoding, and vision-centric reasoning fine-tuning — but find that none achieve satisfactory performance. Our findings highlight the profound challenges in developing MLLMs with robust safety awareness, underscoring the need for further research in this area. All the code and data will be publicly available to facilitate future research. 
</p> </div> </dd> <dt> <a name='item76'>[76]</a> <a href ="/abs/2502.11187" title="Abstract" id="2502.11187"> arXiv:2502.11187 </a> [<a href="/pdf/2502.11187" title="Download PDF" id="pdf-2502.11187" aria-labelledby="pdf-2502.11187">pdf</a>, <a href="https://arxiv.org/html/2502.11187v1" title="View HTML" id="html-2502.11187" aria-labelledby="html-2502.11187" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11187" title="Other formats" id="oth-2502.11187" aria-labelledby="oth-2502.11187">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nahin,+S+K">Shahriar Kabir Nahin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nandi,+R+N">Rabindra Nath Nandi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarker,+S">Sagor Sarker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muhtaseem,+Q+S">Quazi Sarwar Muhtaseem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kowsher,+M">Md Kowsher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shill,+A+C">Apu Chandraw Shill</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ibrahim,+M">Md Ibrahim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Menon,+M+H">Mehadi Hasan Menon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muntasir,+T+A">Tareq Al Muntasir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alam,+F">Firoj Alam</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> LLMs, Benchmarking, Large Language Models, Bangla </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p 
class='mathjax'> In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1B and 3B parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a pretraining dataset of approximately 37 billion tokens. We extended the Llama-3.2 tokenizer to incorporate language- and culture-specific knowledge, which also enables faster training and inference. There was a lack of benchmarking datasets to evaluate LLMs for Bangla. To address this gap, we developed five benchmarking datasets. We benchmarked various LLMs, including TituLLMs, and demonstrated that TituLLMs outperforms its initial multilingual versions. However, this is not always the case, highlighting the complexities of language adaptation. Our work lays the groundwork for adapting existing multilingual open models to other low-resource languages. To facilitate broader adoption and further research, we have made the TituLLMs models and benchmarking datasets publicly available (<a href="https://huggingface.co/collections/hishab/titulm-llama-family-6718d31fc1b83529276f490a" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item77'>[77]</a> <a href ="/abs/2502.11190" title="Abstract" id="2502.11190"> arXiv:2502.11190 </a> [<a href="/pdf/2502.11190" title="Download PDF" id="pdf-2502.11190" aria-labelledby="pdf-2502.11190">pdf</a>, <a href="/format/2502.11190" title="Other formats" id="oth-2502.11190" aria-labelledby="oth-2502.11190">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ReLearn: Unlearning via Learning for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haoming Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+N">Ningyuan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sendong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Shumin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Mengru Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hooi,+B">Bryan Hooi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oo,+N">Nay Oo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Huajun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+N">Ningyu Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG) </div> <p class='mathjax'> Current unlearning methods for large language models usually rely on reverse optimization to reduce target token probabilities. 
However, this paradigm disrupts the subsequent tokens prediction, degrading model performance and linguistic coherence. Moreover, existing evaluation metrics overemphasize contextual forgetting while inadequately assessing response fluency and relevance. To address these challenges, we propose ReLearn, a data augmentation and fine-tuning pipeline for effective unlearning, along with a comprehensive evaluation framework. This framework introduces Knowledge Forgetting Rate (KFR) and Knowledge Retention Rate (KRR) to measure knowledge-level preservation, and Linguistic Score (LS) to evaluate generation quality. Our experiments show that ReLearn successfully achieves targeted forgetting while preserving high-quality output. Through mechanistic analysis, we further demonstrate how reverse optimization disrupts coherent text generation, while ReLearn preserves this essential capability. Code is available at <a href="https://github.com/zjunlp/unlearn" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item78'>[78]</a> <a href ="/abs/2502.11193" title="Abstract" id="2502.11193"> arXiv:2502.11193 </a> [<a href="/pdf/2502.11193" title="Download PDF" id="pdf-2502.11193" aria-labelledby="pdf-2502.11193">pdf</a>, <a href="https://arxiv.org/html/2502.11193v1" title="View HTML" id="html-2502.11193" aria-labelledby="html-2502.11193" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11193" title="Other formats" id="oth-2502.11193" aria-labelledby="oth-2502.11193">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models Penetration in Scholarly Writing and Peer Review </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruijie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+X">Xunlian Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hershcovich,+D">Daniel Hershcovich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Transparency in NLP, LLM-generated text evaluation and detection, LLM Penetration, Scholarly Credibility and Accountability </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While the widespread use of Large Language Models (LLMs) brings convenience, it also raises concerns about the credibility of academic research and scholarly processes. To better understand these dynamics, we evaluate the penetration of LLMs across academic workflows from multiple perspectives and dimensions, providing compelling evidence of their growing influence. 
We propose a framework with two components: \texttt{ScholarLens}, a curated dataset of human- and LLM-generated content across scholarly writing and peer review for multi-perspective evaluation, and \texttt{LLMetrica}, a tool for assessing LLM penetration using rule-based metrics and model-based detectors for multi-dimensional evaluation. Our experiments demonstrate the effectiveness of \texttt{LLMetrica}, revealing the increasing role of LLMs in scholarly processes. These findings emphasize the need for transparency, accountability, and ethical practices in LLM usage to maintain academic credibility. </p> </div> </dd> <dt> <a name='item79'>[79]</a> <a href ="/abs/2502.11198" title="Abstract" id="2502.11198"> arXiv:2502.11198 </a> [<a href="/pdf/2502.11198" title="Download PDF" id="pdf-2502.11198" aria-labelledby="pdf-2502.11198">pdf</a>, <a href="https://arxiv.org/html/2502.11198v1" title="View HTML" id="html-2502.11198" aria-labelledby="html-2502.11198" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11198" title="Other formats" id="oth-2502.11198" aria-labelledby="oth-2502.11198">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ANCHOLIK-NER: A Benchmark Dataset for Bangla Regional Named Entity Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Paul,+B">Bidyarthi Paul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Preotee,+F+F">Faika Fairuj Preotee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarker,+S">Shuvashis Sarker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Refat,+S+R">Shamim Rahim Refat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Islam,+S">Shifat Islam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muhammad,+T">Tashreef Muhammad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hoque,+M+A">Mohammad 
Ashraful Hoque</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Manzoor,+S">Shahriar Manzoor</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> ANCHOLIK-NER is a linguistically diverse dataset for Named Entity Recognition (NER) in Bangla regional dialects, capturing variations across Sylhet, Chittagong, and Barishal. The dataset has around 10,443 sentences, with approximately 3,481 sentences per region. The data was collected from two publicly available datasets and through web scraping from various online newspapers and articles. To ensure high-quality annotations, the BIO tagging scheme was employed, and professional annotators with expertise in regional dialects carried out the labeling process. The dataset is structured into separate subsets for each region and is available in CSV format. Each entry contains textual data along with identified named entities and their corresponding annotations. Named entities are categorized into ten distinct classes: Person, Location, Organization, Food, Animal, Colour, Role, Relation, Object, and Miscellaneous. This dataset serves as a valuable resource for developing and evaluating NER models for Bangla dialectal variations, contributing to regional language processing and low-resource NLP applications. It can be utilized to enhance NER systems in Bangla dialects, improve regional language understanding, and support applications in machine translation, information retrieval, and conversational AI. 
</p> </div> </dd> <dt> <a name='item80'>[80]</a> <a href ="/abs/2502.11211" title="Abstract" id="2502.11211"> arXiv:2502.11211 </a> [<a href="/pdf/2502.11211" title="Download PDF" id="pdf-2502.11211" aria-labelledby="pdf-2502.11211">pdf</a>, <a href="https://arxiv.org/html/2502.11211v1" title="View HTML" id="html-2502.11211" aria-labelledby="html-2502.11211" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11211" title="Other formats" id="oth-2502.11211" aria-labelledby="oth-2502.11211">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of LLM-based Agents in Medicine: How far are we from Baymax? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zizhan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenghan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wenting Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yixuan Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Language Models (LLMs) are transforming healthcare through the development of LLM-based agents that can understand, reason about, and assist with medical tasks. This survey provides a comprehensive review of LLM-based agents in medicine, examining their architectures, applications, and challenges. 
We analyze the key components of medical agent systems, including system profiles, clinical planning mechanisms, medical reasoning frameworks, and external capacity enhancement. The survey covers major application scenarios such as clinical decision support, medical documentation, training simulations, and healthcare service optimization. We discuss evaluation frameworks and metrics used to assess these agents' performance in healthcare settings. While LLM-based agents show promise in enhancing healthcare delivery, several challenges remain, including hallucination management, multimodal integration, implementation barriers, and ethical considerations. The survey concludes by highlighting future research directions, including advances in medical reasoning inspired by recent developments in LLM architectures, integration with physical systems, and improvements in training simulations. This work provides researchers and practitioners with a structured overview of the current state and future prospects of LLM-based agents in medicine. 
</p> </div> </dd> <dt> <a name='item81'>[81]</a> <a href ="/abs/2502.11223" title="Abstract" id="2502.11223"> arXiv:2502.11223 </a> [<a href="/pdf/2502.11223" title="Download PDF" id="pdf-2502.11223" aria-labelledby="pdf-2502.11223">pdf</a>, <a href="/format/2502.11223" title="Other formats" id="oth-2502.11223" aria-labelledby="oth-2502.11223">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Asymmetric Conflict and Synergy in Post-training for LLM-based Multilingual Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+T">Tong Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Y">Yan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+H">Huiwen Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The emergence of Large Language Models (LLMs) has advanced the multilingual machine translation (MMT), yet the Curse of Multilinguality (CoM) remains a major challenge. Existing work in LLM-based MMT typically mitigates this issue via scaling up training and computation budget, which raises a critical question: Is scaling up the training and computation budget truly necessary for high-quality MMT, or can a deeper understanding of CoM provide a more efficient solution? To explore this problem, we analyze the linguistic conflicts and synergy, the underlying mechanism of CoM during post-training phase. 
We identify an asymmetric phenomenon in linguistic conflicts and synergy: the dominance of conflicts and synergy varies in different translation directions, leading to sub-optimal adaptation in existing post-training methods. We further find that a significant bottleneck in MMT appears to lie in post-training rather than multilingual pre-training, suggesting the need for more effective adaptation strategies. Building on these new insights, we propose a direction-aware training approach, combined with group-wise model merging, to address asymmetry in linguistic conflicts and synergy explicitly. Leveraging this strategy, our method fine-tunes X-ALMA-13B-Pretrain-trained only with multilingual pre-training-achieving comparable performance to XALMA-13B (only SFT) while using only 20B pretraining tokens and 17B parameters-5.5x fewer pretraining-tokens and 1.7x fewer model size-with just 0.85 COMET drop on Flores-200 testsets of 50 languages. </p> </div> </dd> <dt> <a name='item82'>[82]</a> <a href ="/abs/2502.11228" title="Abstract" id="2502.11228"> arXiv:2502.11228 </a> [<a href="/pdf/2502.11228" title="Download PDF" id="pdf-2502.11228" aria-labelledby="pdf-2502.11228">pdf</a>, <a href="https://arxiv.org/html/2502.11228v1" title="View HTML" id="html-2502.11228" aria-labelledby="html-2502.11228" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11228" title="Other formats" id="oth-2502.11228" aria-labelledby="oth-2502.11228">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Vendi-RAG: Adaptively Trading-Off Diversity And Quality Significantly Improves Retrieval Augmented Generation With LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rezaei,+M+R">Mohammad Reza Rezaei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dieng,+A+B">Adji Bousso Dieng</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> A RAG pipeline that accounts for both diversity and answer quality and that can be used with any LLM backbone to solve complex multi-hop question-answering tasks </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) enhances large language models (LLMs) for domain-specific question-answering (QA) tasks by leveraging external knowledge sources. However, traditional RAG systems primarily focus on relevance-based retrieval and often struggle with redundancy, especially when reasoning requires connecting information from multiple sources. This paper introduces Vendi-RAG, a framework based on an iterative process that jointly optimizes retrieval diversity and answer quality. This joint optimization leads to significantly higher accuracy for multi-hop QA tasks. Vendi-RAG leverages the Vendi Score (VS), a flexible similarity-based diversity metric, to promote semantic diversity in document retrieval. It then uses an LLM judge that evaluates candidate answers, generated after a reasoning step, and outputs a score that the retriever uses to balance relevance and diversity among the retrieved documents during each iteration. Experiments on three challenging datasets -- HotpotQA, MuSiQue, and 2WikiMultiHopQA -- demonstrate Vendi-RAG's effectiveness in multi-hop reasoning tasks. The framework achieves significant accuracy improvements over traditional single-step and multi-step RAG approaches, with accuracy increases reaching up to +4.2% on HotpotQA, +4.1% on 2WikiMultiHopQA, and +1.3% on MuSiQue compared to Adaptive-RAG, the current best baseline. The benefits of Vendi-RAG are even more pronounced as the number of retrieved documents increases. 
Finally, we evaluated Vendi-RAG across different LLM backbones, including GPT-3.5, GPT-4, and GPT-4o-mini, and observed consistent improvements, demonstrating that the framework's advantages are model-agnostic. </p> </div> </dd> <dt> <a name='item83'>[83]</a> <a href ="/abs/2502.11244" title="Abstract" id="2502.11244"> arXiv:2502.11244 </a> [<a href="/pdf/2502.11244" title="Download PDF" id="pdf-2502.11244" aria-labelledby="pdf-2502.11244">pdf</a>, <a href="https://arxiv.org/html/2502.11244v1" title="View HTML" id="html-2502.11244" aria-labelledby="html-2502.11244" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11244" title="Other formats" id="oth-2502.11244" aria-labelledby="oth-2502.11244">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Soteria: Language-Specific Functional Parameter Steering for Multilingual Safety Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chatterjee,+P">Pratyush Chatterjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Ensuring consistent safety across multiple languages remains a significant challenge for large language models (LLMs). We introduce Soteria, a lightweight yet powerful strategy that locates and minimally adjusts the "functional heads" most responsible for harmful content generation in each language. 
By altering only a fraction of parameters, Soteria drastically reduces policy violations without sacrificing overall model performance, even in low-resource settings. To rigorously evaluate our approach, we also present XThreatBench, a specialized multilingual dataset capturing fine-grained harmful behaviors drawn from real policy guidelines. Experiments with leading open-source LLMs (e.g., Llama, Qwen, Mistral) show that Soteria consistently improves safety metrics across high-, mid-, and low-resource languages. These findings highlight a promising path toward scalable, linguistically attuned, and ethically aligned LLMs worldwide. </p> </div> </dd> <dt> <a name='item84'>[84]</a> <a href ="/abs/2502.11250" title="Abstract" id="2502.11250"> arXiv:2502.11250 </a> [<a href="/pdf/2502.11250" title="Download PDF" id="pdf-2502.11250" aria-labelledby="pdf-2502.11250">pdf</a>, <a href="https://arxiv.org/html/2502.11250v1" title="View HTML" id="html-2502.11250" aria-labelledby="html-2502.11250" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11250" title="Other formats" id="oth-2502.11250" aria-labelledby="oth-2502.11250">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Aware Step-wise Verification with Generative Reward Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Z">Zihuiwen Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Melo,+L+C">Luckeciano Carvalho Melo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaddar,+Y">Younesse Kaddar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Blunsom,+P">Phil Blunsom</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Staton,+S">Sam Staton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gal,+Y">Yarin Gal</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Complex multi-step reasoning tasks, such as solving mathematical problems, remain challenging for large language models (LLMs). While outcome supervision is commonly used, process supervision via process reward models (PRMs) provides intermediate rewards to verify step-wise correctness in solution traces. However, as proxies for human judgement, PRMs suffer from reliability issues, including susceptibility to reward hacking. In this work, we propose leveraging uncertainty quantification (UQ) to enhance the reliability of step-wise verification with generative reward models for mathematical reasoning tasks. We introduce CoT Entropy, a novel UQ method that outperforms existing approaches in quantifying a PRM's uncertainty in step-wise verification. Our results demonstrate that incorporating uncertainty estimates improves the robustness of judge-LM PRMs, leading to more reliable verification. </p> </div> </dd> <dt> <a name='item85'>[85]</a> <a href ="/abs/2502.11258" title="Abstract" id="2502.11258"> arXiv:2502.11258 </a> [<a href="/pdf/2502.11258" title="Download PDF" id="pdf-2502.11258" aria-labelledby="pdf-2502.11258">pdf</a>, <a href="https://arxiv.org/html/2502.11258v1" title="View HTML" id="html-2502.11258" aria-labelledby="html-2502.11258" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11258" title="Other formats" id="oth-2502.11258" aria-labelledby="oth-2502.11258">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Conditional Mutual Information to Improve Large Language Model Fine-Tuning For Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sivakaran,+T">Thanushon Sivakaran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+E">En-Hui Yang</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> 6 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Although large language models (LLMs) have demonstrated remarkable capabilities in recent years, the potential of information theory (IT) to enhance LLM development remains underexplored. This paper introduces the information theoretic principle of Conditional Mutual Information (CMI) to LLM fine-tuning for classification tasks, exploring its promise in two main ways: minimizing CMI to improve a model's standalone performance and maximizing CMI to enhance knowledge distillation (KD) for more capable student models. To apply CMI in LLM fine-tuning, we adapt the recently proposed CMI-constrained deep learning framework, which was initially developed for image classification, with some modification. By minimizing CMI during LLM fine-tuning, we achieve superior performance gains on 6 of 8 GLUE classification tasks compared to BERT. Additionally, maximizing CMI during the KD process results in significant performance improvements in 6 of 8 GLUE classification tasks compared to DistilBERT. These findings demonstrate CMI's adaptability for optimizing both standalone LLMs and student models, showcasing its potential as a robust framework for advancing LLM fine-tuning. Our work bridges the gap between information theory and LLM development, offering new insights for building high-performing language models. 
</p> </div> </dd> <dt> <a name='item86'>[86]</a> <a href ="/abs/2502.11266" title="Abstract" id="2502.11266"> arXiv:2502.11266 </a> [<a href="/pdf/2502.11266" title="Download PDF" id="pdf-2502.11266" aria-labelledby="pdf-2502.11266">pdf</a>, <a href="https://arxiv.org/html/2502.11266v1" title="View HTML" id="html-2502.11266" aria-labelledby="html-2502.11266" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11266" title="Other formats" id="oth-2502.11266" aria-labelledby="oth-2502.11266">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Shrinking Landscape of Linguistic Diversity in the Age of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sourati,+Z">Zhivar Sourati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karimi-Malekabadi,+F">Farzan Karimi-Malekabadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ozcan,+M">Meltem Ozcan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McDaniel,+C">Colin McDaniel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ziabari,+A">Alireza Ziabari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Trager,+J">Jackson Trager</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tak,+A">Ala Tak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Meng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Morstatter,+F">Fred Morstatter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehghani,+M">Morteza Dehghani</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.00267" data-arxiv-id="2404.00267" class="link-https">arXiv:2404.00267</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Language is far more than a communication tool. A wealth of information - including but not limited to the identities, psychological states, and social contexts of its users - can be gleaned through linguistic markers, and such insights are routinely leveraged across diverse fields ranging from product development and marketing to healthcare. In four studies utilizing experimental and observational methods, we demonstrate that the widespread adoption of large language models (LLMs) as writing assistants is linked to notable declines in linguistic diversity and may interfere with the societal and psychological insights language provides. We show that while the core content of texts is retained when LLMs polish and rewrite texts, not only do they homogenize writing styles, but they also alter stylistic elements in a way that selectively amplifies certain dominant characteristics or biases while suppressing others - emphasizing conformity over individuality. By varying LLMs, prompts, classifiers, and contexts, we show that these trends are robust and consistent. Our findings highlight a wide array of risks associated with linguistic homogenization, including compromised diagnostic processes and personalization efforts, the exacerbation of existing divides and barriers to equity in settings like personnel selection where language plays a critical role in assessing candidates' qualifications, communication skills, and cultural fit, and the undermining of efforts for cultural preservation. 
</p> </div> </dd> <dt> <a name='item87'>[87]</a> <a href ="/abs/2502.11268" title="Abstract" id="2502.11268"> arXiv:2502.11268 </a> [<a href="/pdf/2502.11268" title="Download PDF" id="pdf-2502.11268" aria-labelledby="pdf-2502.11268">pdf</a>, <a href="https://arxiv.org/html/2502.11268v1" title="View HTML" id="html-2502.11268" aria-labelledby="html-2502.11268" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11268" title="Other formats" id="oth-2502.11268" aria-labelledby="oth-2502.11268">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improved Unbiased Watermark for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruibo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As artificial intelligence surpasses human capabilities in text generation, the necessity to authenticate the origins of AI-generated content has become paramount. Unbiased watermarks offer a powerful solution by embedding statistical signals into language model-generated text without distorting the quality. In this paper, we introduce MCmark, a family of unbiased, Multi-Channel-based watermarks. MCmark works by partitioning the model's vocabulary into segments and promoting token probabilities within a selected segment based on a watermark key. We demonstrate that MCmark not only preserves the original distribution of the language model but also offers significant improvements in detectability and robustness over existing unbiased watermarks. 
Our experiments with widely-used language models demonstrate an improvement in detectability of over 10% using MCmark, compared to existing state-of-the-art unbiased watermarks. This advancement underscores MCmark's potential in enhancing the practical application of watermarking in AI-generated texts. </p> </div> </dd> <dt> <a name='item88'>[88]</a> <a href ="/abs/2502.11275" title="Abstract" id="2502.11275"> arXiv:2502.11275 </a> [<a href="/pdf/2502.11275" title="Download PDF" id="pdf-2502.11275" aria-labelledby="pdf-2502.11275">pdf</a>, <a href="https://arxiv.org/html/2502.11275v1" title="View HTML" id="html-2502.11275" aria-labelledby="html-2502.11275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11275" title="Other formats" id="oth-2502.11275" aria-labelledby="oth-2502.11275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+L">Letian Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zilong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+F">Feng Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+J">Jingbo Shang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Massive high-quality data, both pre-training raw texts and post-training annotations, have been carefully prepared to incubate advanced large language models (LLMs). In contrast, for information extraction (IE), pre-training data, such as BIO-tagged sequences, are hard to scale up. 
We show that IE models can act as free riders on LLM resources by reframing next-token \emph{prediction} into \emph{extraction} for tokens already present in the context. Specifically, our proposed next tokens extraction (NTE) paradigm learns a versatile IE model, \emph{Cuckoo}, with 102.6M extractive data converted from LLM's pre-training and post-training data. Under the few-shot setting, Cuckoo adapts effectively to traditional and complex instruction-following IE with better performance than existing pre-trained IE models. As a free rider, Cuckoo can naturally evolve with the ongoing advancements in LLM data preparation, benefiting from improvements in LLM training pipelines without additional manual effort. </p> </div> </dd> <dt> <a name='item89'>[89]</a> <a href ="/abs/2502.11276" title="Abstract" id="2502.11276"> arXiv:2502.11276 </a> [<a href="/pdf/2502.11276" title="Download PDF" id="pdf-2502.11276" aria-labelledby="pdf-2502.11276">pdf</a>, <a href="https://arxiv.org/html/2502.11276v1" title="View HTML" id="html-2502.11276" aria-labelledby="html-2502.11276" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11276" title="Other formats" id="oth-2502.11276" aria-labelledby="oth-2502.11276">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Rotary Position Embedding May Cause Dimension Inefficiency in Attention Heads for Long-Distance Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chiang,+T">Ting-Rui Chiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yogatama,+D">Dani Yogatama</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The Rotary Position Embedding (RoPE) is widely used in the attention heads of many large language models (LLM). 
It rotates dimensions in the query and the key vectors by different angles according to their positions in the input sequence. For long context modeling, the range of positions may vary a lot, and thus RoPE rotates some dimensions by a great range of angles. We hypothesize that the wide range of rotation angles may prevent LLMs from utilizing those dimensions. To validate this hypothesis, we present a controlled experiment showing that applying RoPE causes low utility of certain dimensions. Our analyses on three LLMs also indicate that these dimensions do not help LLMs do long-context question answering. </p> </div> </dd> <dt> <a name='item90'>[90]</a> <a href ="/abs/2502.11300" title="Abstract" id="2502.11300"> arXiv:2502.11300 </a> [<a href="/pdf/2502.11300" title="Download PDF" id="pdf-2502.11300" aria-labelledby="pdf-2502.11300">pdf</a>, <a href="https://arxiv.org/html/2502.11300v1" title="View HTML" id="html-2502.11300" aria-labelledby="html-2502.11300" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11300" title="Other formats" id="oth-2502.11300" aria-labelledby="oth-2502.11300">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CORDIAL: Can Multimodal Large Language Models Effectively Understand Coherence Relationships? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+A+A">Aashish Anantha Ramakrishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+A+A">Aadarsh Anantha Ramakrishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongwon Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) are renowned for their superior instruction-following and reasoning capabilities across diverse problem domains. However, existing benchmarks primarily focus on assessing factual and logical correctness in downstream tasks, with limited emphasis on evaluating MLLMs' ability to interpret pragmatic cues and intermodal relationships. To address this gap, we assess the competency of MLLMs in performing Multimodal Discourse Analysis (MDA) using Coherence Relations. Our benchmark, CORDIAL, encompasses a broad spectrum of Coherence Relations across 3 different discourse domains at varying levels of granularity. Through our experiments on 10+ MLLMs employing different prompting strategies, we show that even top models like Gemini 1.5 Pro and GPT-4o fail to match the performance of simple classifier-based baselines. This study emphasizes the need to move beyond similarity-based metrics and adopt a discourse-driven framework for evaluating MLLMs, providing a more nuanced assessment of their capabilities. The benchmark and code are available at: <a href="https://github.com/aashish2000/CORDIAL" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item91'>[91]</a> <a href ="/abs/2502.11306" title="Abstract" id="2502.11306"> arXiv:2502.11306 </a> [<a href="/pdf/2502.11306" title="Download PDF" id="pdf-2502.11306" aria-labelledby="pdf-2502.11306">pdf</a>, <a href="https://arxiv.org/html/2502.11306v1" title="View HTML" id="html-2502.11306" aria-labelledby="html-2502.11306" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11306" title="Other formats" id="oth-2502.11306" aria-labelledby="oth-2502.11306">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Smoothing Out Hallucinations: Mitigating LLM Hallucination with Smoothed Knowledge Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+H">Hieu Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zihao He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gandre,+S+A">Shoumik Atul Gandre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pasupulety,+U">Ujjwal Pasupulety</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shivakumar,+S+K">Sharanya Kumari Shivakumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lerman,+K">Kristina Lerman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) often suffer from hallucination, generating factually incorrect or ungrounded content, which limits their reliability in high-stakes applications. A key factor contributing to hallucination is the use of hard labels during training, which enforce deterministic supervision, encourage overconfidence, and disregard the uncertainty inherent in natural language. 
To address this, we propose mitigating hallucination through knowledge distillation (KD), where a teacher model provides smoothed soft labels to a student model, reducing overconfidence and improving factual grounding. We apply KD during supervised finetuning on instructional data, evaluating its effectiveness across LLMs from different families. Experimental results on summarization benchmarks demonstrate that KD reduces hallucination compared to standard finetuning while preserving performance on general NLP tasks. These findings highlight KD as a promising approach for mitigating hallucination in LLMs and improving model reliability. </p> </div> </dd> <dt> <a name='item92'>[92]</a> <a href ="/abs/2502.11330" title="Abstract" id="2502.11330"> arXiv:2502.11330 </a> [<a href="/pdf/2502.11330" title="Download PDF" id="pdf-2502.11330" aria-labelledby="pdf-2502.11330">pdf</a>, <a href="https://arxiv.org/html/2502.11330v1" title="View HTML" id="html-2502.11330" aria-labelledby="html-2502.11330" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11330" title="Other formats" id="oth-2502.11330" aria-labelledby="oth-2502.11330">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> System Message Generation for User Preferences using Open-Source Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jeong,+M">Minbyul Jeong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+J">Jungho Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khang,+M">Minsoo Khang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jung,+D">Dawoon Jung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+T">Teakgyu Hong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) 
</div> <p class='mathjax'> System messages play a crucial role in interactions with large language models (LLMs), often serving as prompts to initiate conversations. Through system messages, users can assign specific roles, perform intended tasks, incorporate background information, specify various output formats and communication styles. Despite such versatility, publicly available data are often lack system messages and subject to strict license constraints in the industry field. Manual labeling of publicly available data with system messages that align with user instructions demands significant resources. In view of such challenges, our work introduces SysGen, a pipeline for generating system messages with better aligned assistant responses from the supervised fine-tuning dataset without system messages. Training on SysGen data has demonstrated substantial improvements in the alignment of model responses with system messages and user instructions, as demonstrated across various open-source models on the Multifacet benchmark, while maintaining minimal impact on other unseen benchmarks such as Open LLM Leaderboard 2. Our qualitative analysis highlights the importance of diverse system messages to ensure better adaptability across different contexts. 
</p> </div> </dd> <dt> <a name='item93'>[93]</a> <a href ="/abs/2502.11336" title="Abstract" id="2502.11336"> arXiv:2502.11336 </a> [<a href="/pdf/2502.11336" title="Download PDF" id="pdf-2502.11336" aria-labelledby="pdf-2502.11336">pdf</a>, <a href="https://arxiv.org/html/2502.11336v1" title="View HTML" id="html-2502.11336" aria-labelledby="html-2502.11336" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11336" title="Other formats" id="oth-2502.11336" aria-labelledby="oth-2502.11336">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ExaGPT: Example-Based Machine-Generated Text Detection for Human Interpretability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koike,+R">Ryuto Koike</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niwa,+A">Ayana Niwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakov,+P">Preslav Nakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Okazaki,+N">Naoaki Okazaki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Detecting texts generated by Large Language Models (LLMs) could cause grave mistakes due to incorrect decisions, such as undermining student's academic dignity. LLM text detection thus needs to ensure the interpretability of the decision, which can help users judge how reliably correct its prediction is. When humans verify whether a text is human-written or LLM-generated, they intuitively investigate with which of them it shares more similar spans. 
However, existing interpretable detectors are not aligned with the human decision-making process and fail to offer evidence that users easily understand. To bridge this gap, we introduce ExaGPT, an interpretable detection approach grounded in the human decision-making process for verifying the origin of a text. ExaGPT identifies a text by checking whether it shares more similar spans with human-written vs. with LLM-generated texts from a datastore. This approach can provide similar span examples that contribute to the decision for each span in the text as evidence. Our human evaluation demonstrates that providing similar span examples contributes more effectively to judging the correctness of the decision than existing interpretable methods. Moreover, extensive experiments in four domains and three generators show that ExaGPT massively outperforms prior powerful detectors by up to +40.9 points of accuracy at a false positive rate of 1%. </p> </div> </dd> <dt> <a name='item94'>[94]</a> <a href ="/abs/2502.11345" title="Abstract" id="2502.11345"> arXiv:2502.11345 </a> [<a href="/pdf/2502.11345" title="Download PDF" id="pdf-2502.11345" aria-labelledby="pdf-2502.11345">pdf</a>, <a href="https://arxiv.org/html/2502.11345v1" title="View HTML" id="html-2502.11345" aria-labelledby="html-2502.11345" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11345" title="Other formats" id="oth-2502.11345" aria-labelledby="oth-2502.11345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hierarchical Graph Topic Modeling with Topic Tree-based Transformer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D+C">Delvin Ce Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Menglin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xiaobao Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiasheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lauw,+H+W">Hady W. Lauw</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Textual documents are commonly connected in a hierarchical graph structure where a central document links to others with an exponentially growing connectivity. Though Hyperbolic Graph Neural Networks (HGNNs) excel at capturing such graph hierarchy, they cannot model the rich textual semantics within documents. Moreover, text contents in documents usually discuss topics of different specificity. Hierarchical Topic Models (HTMs) discover such latent topic hierarchy within text corpora. However, most of them focus on the textual content within documents, and ignore the graph adjacency across interlinked documents. We thus propose a Hierarchical Graph Topic Modeling Transformer to integrate both topic hierarchy within documents and graph hierarchy across documents into a unified Transformer. Specifically, to incorporate topic hierarchy within documents, we design a topic tree and infer a hierarchical tree embedding for hierarchical topic modeling. To preserve both topic and graph hierarchies, we design our model in hyperbolic space and propose Hyperbolic Doubly Recurrent Neural Network, which models ancestral and fraternal tree structure. Both hierarchies are inserted into each Transformer layer to learn unified representations. Both supervised and unsupervised experiments verify the effectiveness of our model. 
</p> </div> </dd> <dt> <a name='item95'>[95]</a> <a href ="/abs/2502.11355" title="Abstract" id="2502.11355"> arXiv:2502.11355 </a> [<a href="/pdf/2502.11355" title="Download PDF" id="pdf-2502.11355" aria-labelledby="pdf-2502.11355">pdf</a>, <a href="/format/2502.11355" title="Other formats" id="oth-2502.11355" aria-labelledby="oth-2502.11355">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> "Nuclear Deployed!": Analyzing Catastrophic Risks in Decision-making of Autonomous LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Rongwu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaojian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wei Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Our code will be available at <a href="https://github.com/pillowsofwind/LLM-CBRN-Risks" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR); Computers and Society (cs.CY) </div> <p class='mathjax'> Large language models (LLMs) are evolving into autonomous decision-makers, raising concerns about catastrophic risks in high-stakes scenarios, particularly in Chemical, Biological, Radiological and Nuclear (CBRN) domains. Based on the insight that such risks can originate from trade-offs between the agent's Helpful, Harmlessness and Honest (HHH) goals, we build a novel three-stage evaluation framework, which is carefully constructed to effectively and naturally expose such risks. 
We conduct 14,400 agentic simulations across 12 advanced LLMs, with extensive experiments and analysis. Results reveal that LLM agents can autonomously engage in catastrophic behaviors and deception, without being deliberately induced. Furthermore, stronger reasoning abilities often increase, rather than mitigate, these risks. We also show that these agents can violate instructions and superior commands. On the whole, we empirically prove the existence of catastrophic risks in autonomous LLM agents. We will release our code upon request. </p> </div> </dd> <dt> <a name='item96'>[96]</a> <a href ="/abs/2502.11361" title="Abstract" id="2502.11361"> arXiv:2502.11361 </a> [<a href="/pdf/2502.11361" title="Download PDF" id="pdf-2502.11361" aria-labelledby="pdf-2502.11361">pdf</a>, <a href="https://arxiv.org/html/2502.11361v1" title="View HTML" id="html-2502.11361" aria-labelledby="html-2502.11361" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11361" title="Other formats" id="oth-2502.11361" aria-labelledby="oth-2502.11361">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLDBench: Vision Language Models Disinformation Detection Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Raza,+S">Shaina Raza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vayani,+A">Ashmal Vayani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+A">Aditya Jain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Narayanan,+A">Aravind Narayanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khazaie,+V+R">Vahid Reza Khazaie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bashir,+S+R">Syed Raza Bashir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dolatabadi,+E">Elham Dolatabadi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Uddin,+G">Gias Uddin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Emmanouilidis,+C">Christos Emmanouilidis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qureshi,+R">Rizwan Qureshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+M">Mubarak Shah</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid rise of AI-generated content has made detecting disinformation increasingly challenging. In particular, multimodal disinformation, i.e., online posts-articles that contain images and texts with fabricated information are specially designed to deceive. While existing AI safety benchmarks primarily address bias and toxicity, multimodal disinformation detection remains largely underexplored. To address this challenge, we present the Vision-Language Disinformation Detection Benchmark VLDBench, the first comprehensive benchmark for detecting disinformation across both unimodal (text-only) and multimodal (text and image) content, comprising 31,000 news article-image pairs, spanning 13 distinct categories, for robust evaluation. VLDBench features a rigorous semi-automated data curation pipeline, with 22 domain experts dedicating 300 plus hours to annotation, achieving a strong inter-annotator agreement (Cohen kappa = 0.78). We extensively evaluate state-of-the-art Large Language Models (LLMs) and Vision-Language Models (VLMs), demonstrating that integrating textual and visual cues in multimodal news posts improves disinformation detection accuracy by 5 - 35 % compared to unimodal models. 
Developed in alignment with AI governance frameworks such as the EU AI Act, NIST guidelines, and the MIT AI Risk Repository 2024, VLDBench is expected to become a benchmark for detecting disinformation in online multi-modal contents. Our code and data will be publicly available. </p> </div> </dd> <dt> <a name='item97'>[97]</a> <a href ="/abs/2502.11364" title="Abstract" id="2502.11364"> arXiv:2502.11364 </a> [<a href="/pdf/2502.11364" title="Download PDF" id="pdf-2502.11364" aria-labelledby="pdf-2502.11364">pdf</a>, <a href="https://arxiv.org/html/2502.11364v1" title="View HTML" id="html-2502.11364" aria-labelledby="html-2502.11364" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11364" title="Other formats" id="oth-2502.11364" aria-labelledby="oth-2502.11364">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Blessing of Multilinguality: A Systematic Analysis of Multilingual In-Context Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Y">Yilei Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+A">Andrew Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+F">Freda Shi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While multilingual large language models generally perform adequately, and sometimes even rival English performance on high-resource languages (HRLs), they often significantly underperform on low-resource languages (LRLs). Among several prompting strategies aiming at bridging the gap, multilingual in-context learning (ICL) has been particularly effective when demonstration in target languages is unavailable. However, there lacks a systematic understanding when and why it works well. 
<br>In this work, we systematically analyze multilingual ICL, using demonstrations in HRLs to enhance cross-lingual transfer. We show that demonstrations in mixed HRLs consistently outperform English-only ones across the board, particularly for tasks written in LRLs. Surprisingly, our ablation study shows that the presence of irrelevant non-English sentences in the prompt yields measurable gains, suggesting the effectiveness of multilingual exposure itself. Our results highlight the potential of strategically leveraging multilingual resources to bridge the performance gap for underrepresented languages. </p> </div> </dd> <dt> <a name='item98'>[98]</a> <a href ="/abs/2502.11368" title="Abstract" id="2502.11368"> arXiv:2502.11368 </a> [<a href="/pdf/2502.11368" title="Download PDF" id="pdf-2502.11368" aria-labelledby="pdf-2502.11368">pdf</a>, <a href="https://arxiv.org/html/2502.11368v1" title="View HTML" id="html-2502.11368" aria-labelledby="html-2502.11368" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11368" title="Other formats" id="oth-2502.11368" aria-labelledby="oth-2502.11368">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs can Perform Multi-Dimensional Analytic Writing Assessments: A Case Study of L2 Graduate-Level Academic English Writing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhengxiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Makarova,+V">Veronika Makarova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kodner,+J">Jordan Kodner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rambow,+O">Owen Rambow</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, 6 figures, 15 tables </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The paper explores the performance of LLMs in the context of multi-dimensional analytic writing assessments, i.e. their ability to provide both scores and comments based on multiple assessment criteria. Using a corpus of literature reviews written by L2 graduate students and assessed by human experts against 9 analytic criteria, we prompt several popular LLMs to perform the same task under various conditions. To evaluate the quality of feedback comments, we apply a novel feedback comment quality evaluation framework. This framework is interpretable, cost-efficient, scalable, and reproducible, compared to existing methods that rely on manual judgments. We find that LLMs can generate reasonably good and generally reliable multi-dimensional analytic assessments. We release our corpus for reproducibility. </p> </div> </dd> <dt> <a name='item99'>[99]</a> <a href ="/abs/2502.11380" title="Abstract" id="2502.11380"> arXiv:2502.11380 </a> [<a href="/pdf/2502.11380" title="Download PDF" id="pdf-2502.11380" aria-labelledby="pdf-2502.11380">pdf</a>, <a href="https://arxiv.org/html/2502.11380v1" title="View HTML" id="html-2502.11380" aria-labelledby="html-2502.11380" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11380" title="Other formats" id="oth-2502.11380" aria-labelledby="oth-2502.11380">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Small World of Word Embeddings: A Comparative Study on Conceptual Spaces from LLMs of Different Scales </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Ying Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">KangYang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+C">Cunliang Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Paper under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A conceptual space represents concepts as nodes and semantic relatedness as edges. Word embeddings, combined with a similarity metric, provide an effective approach to constructing such a space. Typically, embeddings are derived from traditional distributed models or encoder-only pretrained models, whose objectives directly capture the meaning of the current token. In contrast, decoder-only models, including large language models (LLMs), predict the next token, making their embeddings less directly tied to the current token's semantics. Moreover, comparative studies on LLMs of different scales remain underexplored. In this paper, we construct a conceptual space using word embeddings from LLMs of varying scales and comparatively analyze their properties. We establish a network based on a linguistic typology-inspired connectivity hypothesis, examine global statistical properties, and compare LLMs of varying scales. Locally, we analyze conceptual pairs, WordNet relations, and a cross-lingual semantic network for qualitative words. Our results indicate that the constructed space exhibits small-world properties, characterized by a high clustering coefficient and short path lengths. Larger LLMs generate more intricate spaces, with longer paths reflecting richer relational structures and connections. Furthermore, the network serves as an efficient bridge for cross-lingual semantic mapping. 
</p> </div> </dd> <dt> <a name='item100'>[100]</a> <a href ="/abs/2502.11387" title="Abstract" id="2502.11387"> arXiv:2502.11387 </a> [<a href="/pdf/2502.11387" title="Download PDF" id="pdf-2502.11387" aria-labelledby="pdf-2502.11387">pdf</a>, <a href="https://arxiv.org/html/2502.11387v1" title="View HTML" id="html-2502.11387" aria-labelledby="html-2502.11387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11387" title="Other formats" id="oth-2502.11387" aria-labelledby="oth-2502.11387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoleMRC: A Fine-Grained Composite Benchmark for Role-Playing and Instruction-Following </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junru Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiazheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+G">Guodong Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gui,+L">Lin Gui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+S">Siyu An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yulan He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Di Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+X">Xing Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Role-playing is important for Large Language Models (LLMs) to follow diverse instructions while maintaining role identity and the role's pre-defined ability limits. Existing role-playing datasets mostly contribute to controlling role style and knowledge boundaries, but overlook role-playing in instruction-following scenarios. 
We introduce a fine-grained role-playing and instruction-following composite benchmark, named RoleMRC, including: (1) Multi-turn dialogues between ideal roles and humans, including free chats or discussions upon given passages; (2) Role-playing machine reading comprehension, involving response, refusal, and attempts according to passage answerability and role ability; (3) More complex scenarios with nested, multi-turn and prioritized instructions. The final RoleMRC features a 10.2k role profile meta-pool, 37.9k well-synthesized role-playing instructions, and 1.4k testing samples. We develop a pipeline to quantitatively evaluate the fine-grained role-playing and instruction-following capabilities of several mainstream LLMs, as well as models that are fine-tuned on our data. Moreover, cross-evaluation on external role-playing datasets confirms that models fine-tuned on RoleMRC enhances instruction-following without compromising general role-playing and reasoning capabilities. We also probe the neural-level activation maps of different capabilities over post-tuned LLMs. Access to our RoleMRC, RoleMRC-mix and Codes: <a href="https://github.com/LuJunru/RoleMRC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item101'>[101]</a> <a href ="/abs/2502.11393" title="Abstract" id="2502.11393"> arXiv:2502.11393 </a> [<a href="/pdf/2502.11393" title="Download PDF" id="pdf-2502.11393" aria-labelledby="pdf-2502.11393">pdf</a>, <a href="https://arxiv.org/html/2502.11393v1" title="View HTML" id="html-2502.11393" aria-labelledby="html-2502.11393" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11393" title="Other formats" id="oth-2502.11393" aria-labelledby="oth-2502.11393">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HellaSwag-Pro: A Large-Scale Bilingual Benchmark for Evaluating the Robustness of LLMs in Commonsense Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaoyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Moxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Men,+R">Rui Men</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yichang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+K">Keqin Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Fuli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+D">Dayiheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junyang Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable capabilities in commonsense reasoning; however, some variations in questions can trigger incorrect responses. 
Do these models truly understand commonsense knowledge, or just memorize expression patterns? To investigate this question, we present the first extensive robustness evaluation of LLMs in commonsense reasoning. We introduce HellaSwag-Pro, a large-scale bilingual benchmark consisting of 11,200 cases, by designing and compiling seven types of question variants. To construct this benchmark, we propose a two-stage method to develop Chinese HellaSwag, a finely annotated dataset comprising 12,000 instances across 56 categories. We conduct extensive experiments on 41 representative LLMs, revealing that these LLMs are far from robust in commonsense reasoning. Furthermore, this robustness varies depending on the language in which the LLM is tested. This work establishes a high-quality evaluation benchmark, with extensive experiments offering valuable insights to the community in commonsense reasoning for LLMs. </p> </div> </dd> <dt> <a name='item102'>[102]</a> <a href ="/abs/2502.11400" title="Abstract" id="2502.11400"> arXiv:2502.11400 </a> [<a href="/pdf/2502.11400" title="Download PDF" id="pdf-2502.11400" aria-labelledby="pdf-2502.11400">pdf</a>, <a href="https://arxiv.org/html/2502.11400v1" title="View HTML" id="html-2502.11400" aria-labelledby="html-2502.11400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11400" title="Other formats" id="oth-2502.11400" aria-labelledby="oth-2502.11400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting Robust RAG: Do We Still Need Complex Robust Training in the Era of Powerful LLMs? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+H">Hanxing Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+S">Shuchang Tao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liwei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented generation (RAG) systems often suffer from performance degradation when encountering noisy or irrelevant documents, driving researchers to develop sophisticated training strategies to enhance their robustness against such retrieval noise. However, as large language models (LLMs) continue to advance, the necessity of these complex training methods is increasingly questioned. In this paper, we systematically investigate whether complex robust training strategies remain necessary as model capacity grows. Through comprehensive experiments spanning multiple model architectures and parameter scales, we evaluate various document selection methods and adversarial training techniques across diverse datasets. Our extensive experiments consistently demonstrate that as models become more powerful, the performance gains brought by complex robust training methods drop off dramatically. 
We delve into the rationale and find that more powerful models inherently exhibit superior confidence calibration, better generalization across datasets (even when trained with randomly selected documents), and optimal attention mechanisms learned with simpler strategies. Our findings suggest that RAG systems can benefit from simpler architectures and training strategies as models become more powerful, enabling more scalable applications with minimal complexity. </p> </div> </dd> <dt> <a name='item103'>[103]</a> <a href ="/abs/2502.11401" title="Abstract" id="2502.11401"> arXiv:2502.11401 </a> [<a href="/pdf/2502.11401" title="Download PDF" id="pdf-2502.11401" aria-labelledby="pdf-2502.11401">pdf</a>, <a href="https://arxiv.org/html/2502.11401v1" title="View HTML" id="html-2502.11401" aria-labelledby="html-2502.11401" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11401" title="Other formats" id="oth-2502.11401" aria-labelledby="oth-2502.11401">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Following the Autoregressive Nature of LLM Embeddings via Compression and Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+J">Jingcheng Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zhongtao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liwei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A new trend uses LLMs as dense text encoders via contrastive learning. However, since LLM embeddings predict the probability distribution of the next token, they are inherently generative and distributive, conflicting with contrastive learning, which requires embeddings to capture full-text semantics and align via cosine similarity. This discrepancy hinders the full utilization of LLMs' pre-training capabilities, resulting in inefficient learning. In response to this issue, we propose AutoRegEmbed, a new contrastive learning method built on embedding conditional probability distributions, which integrates two core tasks: information compression and conditional distribution alignment. The information compression task encodes text into the embedding space, ensuring that the embedding vectors capture global semantics. The conditional distribution alignment task focuses on aligning text embeddings with positive samples embeddings by leveraging the conditional distribution of embeddings while simultaneously reducing the likelihood of generating negative samples from text embeddings, thereby achieving embedding alignment and uniformity. Experimental results demonstrate that our method significantly outperforms traditional contrastive learning approaches and achieves performance comparable to state-of-the-art models when using the same amount of data. 
</p> </div> </dd> <dt> <a name='item104'>[104]</a> <a href ="/abs/2502.11404" title="Abstract" id="2502.11404"> arXiv:2502.11404 </a> [<a href="/pdf/2502.11404" title="Download PDF" id="pdf-2502.11404" aria-labelledby="pdf-2502.11404">pdf</a>, <a href="https://arxiv.org/html/2502.11404v1" title="View HTML" id="html-2502.11404" aria-labelledby="html-2502.11404" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11404" title="Other formats" id="oth-2502.11404" aria-labelledby="oth-2502.11404">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ToolCoder: A Systematic Code-Empowered Tool Learning Framework for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+H">Hanxing Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+S">Shuchang Tao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jinyang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+B">Bolin Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xueqi Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Tool learning has emerged as a crucial capability for large language models (LLMs) to solve complex real-world tasks through interaction with external tools. Existing approaches face significant challenges, including reliance on hand-crafted prompts, difficulty in multi-step planning, and lack of precise error diagnosis and reflection mechanisms. 
We propose ToolCoder, a novel framework that reformulates tool learning as a code generation task. Inspired by software engineering principles, ToolCoder transforms natural language queries into structured Python function scaffold and systematically breaks down tasks with descriptive comments, enabling LLMs to leverage coding paradigms for complex reasoning and planning. It then generates and executes function implementations to obtain final responses. Additionally, ToolCoder stores successfully executed functions in a repository to promote code reuse, while leveraging error traceback mechanisms for systematic debugging, optimizing both execution efficiency and robustness. Experiments demonstrate that ToolCoder achieves superior performance in task completion accuracy and execution reliability compared to existing approaches, establishing the effectiveness of code-centric approaches in tool learning. </p> </div> </dd> <dt> <a name='item105'>[105]</a> <a href ="/abs/2502.11405" title="Abstract" id="2502.11405"> arXiv:2502.11405 </a> [<a href="/pdf/2502.11405" title="Download PDF" id="pdf-2502.11405" aria-labelledby="pdf-2502.11405">pdf</a>, <a href="https://arxiv.org/html/2502.11405v1" title="View HTML" id="html-2502.11405" aria-labelledby="html-2502.11405" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11405" title="Other formats" id="oth-2502.11405" aria-labelledby="oth-2502.11405">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LayAlign: Enhancing Multilingual Reasoning in Large Language Models via Layer-Wise Adaptive Fusion and Alignment Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+Z">Zhiwen Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yixia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+H">He Zhu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Longyue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Weihua Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kaifu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guanhua Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In Findings of NAACL 2025 (The 2025 Annual Conference of the Nations of the Americas Chapter of the ACL) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite being pretrained on multilingual corpora, large language models (LLMs) exhibit suboptimal performance on low-resource languages. Recent approaches have leveraged multilingual encoders alongside LLMs by introducing trainable parameters connecting the two models. However, these methods typically focus on the encoder's output, overlooking valuable information from other layers. We propose Layer-Wise Adaptive Fusion and Alignment (LayAlign), a framework that integrates representations from all encoder layers, coupled with an adaptive attention mechanism to enable layer-wise interaction between the LLM and the multilingual encoder. Extensive experiments on multilingual reasoning tasks, along with analyses of learned representations, show that our approach consistently outperforms existing baselines. 
</p> </div> </dd> <dt> <a name='item106'>[106]</a> <a href ="/abs/2502.11419" title="Abstract" id="2502.11419"> arXiv:2502.11419 </a> [<a href="/pdf/2502.11419" title="Download PDF" id="pdf-2502.11419" aria-labelledby="pdf-2502.11419">pdf</a>, <a href="https://arxiv.org/html/2502.11419v1" title="View HTML" id="html-2502.11419" aria-labelledby="html-2502.11419" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11419" title="Other formats" id="oth-2502.11419" aria-labelledby="oth-2502.11419">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InsBank: Evolving Instruction Subset for Ongoing Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Jiayi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiwei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shaoxiong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+P">Peiwen Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinglin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yueqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+C">Chuyi Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+B">Boyuan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+H">Huan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) typically undergo instruction tuning to enhance alignment. 
Recent studies emphasize that quality and diversity of instruction data are more crucial than quantity, highlighting the need to select diverse, high-quality subsets to reduce training costs. However, how to evolve these selected subsets alongside the development of new instruction data remains insufficiently explored. To achieve LLMs' ongoing alignment, we introduce Instruction Bank (InsBank), a continuously updated repository that integrates the latest valuable instruction data. We further propose Progressive Instruction Bank Evolution (PIBE), a novel framework designed to evolve InsBank effectively and efficiently over time. PIBE employs a gradual data selection strategy to maintain long-term efficiency, leveraging a representation-based diversity score to capture relationships between data points and retain historical information for comprehensive diversity evaluation. This also allows for flexible combination of diversity and quality scores during data selection and ranking. Extensive experiments demonstrate that PIBE significantly outperforms baselines in InsBank evolution and is able to extract budget-specific subsets, demonstrating its effectiveness and adaptability. 
</p> </div> </dd> <dt> <a name='item107'>[107]</a> <a href ="/abs/2502.11423" title="Abstract" id="2502.11423"> arXiv:2502.11423 </a> [<a href="/pdf/2502.11423" title="Download PDF" id="pdf-2502.11423" aria-labelledby="pdf-2502.11423">pdf</a>, <a href="https://arxiv.org/html/2502.11423v1" title="View HTML" id="html-2502.11423" aria-labelledby="html-2502.11423" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11423" title="Other formats" id="oth-2502.11423" aria-labelledby="oth-2502.11423">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Persona Sentiment Sensitivity in Personalized Dialogue Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jun,+Y">YongHyun Jun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Personalized dialogue systems have advanced considerably with the integration of user-specific personas into large language models (LLMs). However, while LLMs can effectively generate personalized responses, the influence of persona sentiment on dialogue quality remains underexplored. In this work, we conduct a large-scale analysis of dialogues generated using a range of polarized user profiles. Our experiments reveal that dialogues involving negatively polarized users tend to overemphasize persona attributes, leading to increased entailment and contradiction instances and lower overall coherence. In contrast, positively polarized profiles yield dialogues that selectively incorporate persona information, resulting in smoother and more coherent interactions. 
Furthermore, we find that personas with weak or neutral sentiment generally produce lower-quality dialogues. Motivated by these findings, we propose a dialogue generation approach that explicitly accounts for persona polarity by combining a turn-based generation strategy with a profile ordering mechanism. Our study provides new insights into the sensitivity of LLMs to persona sentiment and offers guidance for developing more robust and nuanced personalized dialogue systems. </p> </div> </dd> <dt> <a name='item108'>[108]</a> <a href ="/abs/2502.11425" title="Abstract" id="2502.11425"> arXiv:2502.11425 </a> [<a href="/pdf/2502.11425" title="Download PDF" id="pdf-2502.11425" aria-labelledby="pdf-2502.11425">pdf</a>, <a href="/format/2502.11425" title="Other formats" id="oth-2502.11425" aria-labelledby="oth-2502.11425">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Counterfactual-Consistency Prompting for Relative Temporal Understanding in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jongho Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S">Seung-won Hwang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Despite the advanced capabilities of large language models (LLMs), their temporal reasoning ability remains underdeveloped. Prior works have highlighted this limitation, particularly in maintaining temporal consistency when understanding events. For example, models often confuse mutually exclusive temporal relations like ``before'' and ``after'' between events and make inconsistent predictions. 
In this work, we tackle the issue of temporal inconsistency in LLMs by proposing a novel counterfactual prompting approach. Our method generates counterfactual questions and enforces collective constraints, enhancing the model's consistency. We evaluate our method on multiple datasets, demonstrating significant improvements in event ordering for explicit and implicit events and temporal commonsense understanding by effectively addressing temporal inconsistencies. </p> </div> </dd> <dt> <a name='item109'>[109]</a> <a href ="/abs/2502.11427" title="Abstract" id="2502.11427"> arXiv:2502.11427 </a> [<a href="/pdf/2502.11427" title="Download PDF" id="pdf-2502.11427" aria-labelledby="pdf-2502.11427">pdf</a>, <a href="https://arxiv.org/html/2502.11427v1" title="View HTML" id="html-2502.11427" aria-labelledby="html-2502.11427" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11427" title="Other formats" id="oth-2502.11427" aria-labelledby="oth-2502.11427">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do we Really Need Visual Instructions? 
Towards Visual Instruction-Free Fine-tuning for Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zikang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+K">Kun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W+X">Wayne Xin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+D">Dawei Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yaliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Ji-Rong Wen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Visual instruction tuning has become the predominant technology in eliciting the multimodal task-solving capabilities of large vision-language models (LVLMs). Despite the success, as visual instructions require images as the input, it would leave the gap in inheriting the task-solving capabilities from the backbone LLMs, and make it costly to collect a large-scale dataset. To address it, we propose ViFT, a visual instruction-free fine-tuning framework for LVLMs. In ViFT, we only require the text-only instructions and image caption data during training, to separately learn the task-solving and visual perception abilities. During inference, we extract and combine the representations of the text and image inputs, for fusing the two abilities to fulfill multimodal tasks. Experimental results demonstrate that ViFT can achieve state-of-the-art performance on several visual reasoning and visual instruction following benchmarks, with rather less training data. Our code and data will be publicly released. 
</p> </div> </dd> <dt> <a name='item110'>[110]</a> <a href ="/abs/2502.11431" title="Abstract" id="2502.11431"> arXiv:2502.11431 </a> [<a href="/pdf/2502.11431" title="Download PDF" id="pdf-2502.11431" aria-labelledby="pdf-2502.11431">pdf</a>, <a href="https://arxiv.org/html/2502.11431v1" title="View HTML" id="html-2502.11431" aria-labelledby="html-2502.11431" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11431" title="Other formats" id="oth-2502.11431" aria-labelledby="oth-2502.11431">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Any Information Is Just Worth One Single Screenshot: Unifying Search With Visualized Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Ze Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zhengyang Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Junjie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+D">Defu Lian</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> With the popularity of multimodal techniques, it receives growing interests to acquire useful information in visual forms. In this work, we formally define an emerging IR paradigm called \textit{Visualized Information Retrieval}, or \textbf{Vis-IR}, where multimodal information, such as texts, images, tables and charts, is jointly represented by a unified visual format called \textbf{Screenshots}, for various retrieval applications. We further make three key contributions for Vis-IR. 
First, we create \textbf{VIRA} (Vis-IR Aggregation), a large-scale dataset comprising a vast collection of screenshots from diverse sources, carefully curated into captioned and question-answer formats. Second, we develop \textbf{UniSE} (Universal Screenshot Embeddings), a family of retrieval models that enable screenshots to query or be queried across arbitrary data modalities. Finally, we construct \textbf{MVRB} (Massive Visualized IR Benchmark), a comprehensive benchmark covering a variety of task forms and application scenarios. Through extensive evaluations on MVRB, we highlight the deficiency from existing multimodal retrievers and the substantial improvements made by UniSE. Our work will be shared with the community, laying a solid foundation for this emerging field. </p> </div> </dd> <dt> <a name='item111'>[111]</a> <a href ="/abs/2502.11438" title="Abstract" id="2502.11438"> arXiv:2502.11438 </a> [<a href="/pdf/2502.11438" title="Download PDF" id="pdf-2502.11438" aria-labelledby="pdf-2502.11438">pdf</a>, <a href="https://arxiv.org/html/2502.11438v1" title="View HTML" id="html-2502.11438" aria-labelledby="html-2502.11438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11438" title="Other formats" id="oth-2502.11438" aria-labelledby="oth-2502.11438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SAFE-SQL: Self-Augmented In-Context Learning with Fine-grained Example Selection for Text-to-SQL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jimin Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+I">Ingeol Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Byeongjeong Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 5 figures, 10 
tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text-to-SQL aims to convert natural language questions into executable SQL queries. While previous approaches, such as skeleton-masked selection, have demonstrated strong performance by retrieving similar training examples to guide large language models (LLMs), they struggle in real-world scenarios where such examples are unavailable. To overcome this limitation, we propose Self-Augmentation in-context learning with Fine-grained Example selection for Text-to-SQL (SAFE-SQL), a novel framework that improves SQL generation by generating and filtering self-augmented examples. SAFE-SQL first prompts an LLM to generate multiple Text-to-SQL examples relevant to the test input. Then SAFE-SQL filters these examples through three relevance assessments, constructing high-quality in-context learning examples. Using self-generated examples, SAFE-SQL surpasses the previous zero-shot, and few-shot Text-to-SQL frameworks, achieving higher execution accuracy. Notably, our approach provides additional performance gains in extra hard and unseen scenarios, where conventional methods often fail. 
</p> </div> </dd> <dt> <a name='item112'>[112]</a> <a href ="/abs/2502.11439" title="Abstract" id="2502.11439"> arXiv:2502.11439 </a> [<a href="/pdf/2502.11439" title="Download PDF" id="pdf-2502.11439" aria-labelledby="pdf-2502.11439">pdf</a>, <a href="https://arxiv.org/html/2502.11439v1" title="View HTML" id="html-2502.11439" aria-labelledby="html-2502.11439" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11439" title="Other formats" id="oth-2502.11439" aria-labelledby="oth-2502.11439">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Efficient Row-Based Sparse Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Cen-Jhih Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhaskara,+A">Aditya Bhaskara</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Fine-tuning is an important step in adapting foundation models such as large language models to downstream tasks. To make this step more accessible to users with limited computational budgets, it is crucial to develop fine-tuning methods that are memory and computationally efficient. Sparse Fine-tuning (SFT) and Low-rank adaptation (LoRA) are two frameworks that have emerged for addressing this problem and have been adopted widely in practice. In this work, we develop a new SFT framework, based on ideas from neural network pruning. At a high level, we first identify "important" neurons/nodes using feature importance metrics from network pruning (specifically, we use the structural pruning method), and then perform fine-tuning by restricting to weights involving these neurons. 
Using experiments on common language tasks, we demonstrate that our method significantly improves the memory efficiency of SFT without increasing training time complexity and implementation complexity, while achieving accuracy comparable to state-of-the-art methods such as LoRA and its variants. </p> </div> </dd> <dt> <a name='item113'>[113]</a> <a href ="/abs/2502.11441" title="Abstract" id="2502.11441"> arXiv:2502.11441 </a> [<a href="/pdf/2502.11441" title="Download PDF" id="pdf-2502.11441" aria-labelledby="pdf-2502.11441">pdf</a>, <a href="https://arxiv.org/html/2502.11441v1" title="View HTML" id="html-2502.11441" aria-labelledby="html-2502.11441" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11441" title="Other formats" id="oth-2502.11441" aria-labelledby="oth-2502.11441">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Which Retain Set Matters for LLM Unlearning? A Case Study on Entity Unlearning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+H">Hwan Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in Progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) risk retaining unauthorized or sensitive information from their training data, which raises privacy concerns. LLM unlearning seeks to mitigate these risks by selectively removing specified data while maintaining overall model performance. However, most existing work focuses on methods to achieve effective forgetting and does not provide a detailed analysis of the retain set, the portion of training data that is not targeted for removal. 
In this paper, we investigate the effects of unlearning on various subsets of the retain set through a case study on entity unlearning. We introduce the Syntactically Similar Neighbor Set, a group of queries that share similar syntactic structures with the data targeted for removal, and show that this subset suffers the greatest performance drop during unlearning. Moreover, when used for regularization, this set not only preserves performance on syntactically similar queries but also delivers comparable or improved results across other data subsets. Our results highlight that syntactic similarity is a critical factor, potentially more so than domain or entity relationships, in achieving effective and practical LLM unlearning. </p> </div> </dd> <dt> <a name='item114'>[114]</a> <a href ="/abs/2502.11444" title="Abstract" id="2502.11444"> arXiv:2502.11444 </a> [<a href="/pdf/2502.11444" title="Download PDF" id="pdf-2502.11444" aria-labelledby="pdf-2502.11444">pdf</a>, <a href="https://arxiv.org/html/2502.11444v1" title="View HTML" id="html-2502.11444" aria-labelledby="html-2502.11444" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11444" title="Other formats" id="oth-2502.11444" aria-labelledby="oth-2502.11444">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Does RAG Really Perform Bad For Long-Context Processing? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kun Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+P">Peitian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+H">Hongjin Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The efficient processing of long context poses a serious challenge for large language models (LLMs). Recently, retrieval-augmented generation (RAG) has emerged as a promising strategy for this problem, as it enables LLMs to make selective use of the long context for efficient computation. However, existing RAG approaches lag behind other long-context processing methods due to inherent limitations on inaccurate retrieval and fragmented contexts. To address these challenges, we introduce RetroLM, a novel RAG framework for long-context processing. Unlike traditional methods, RetroLM employs KV-level retrieval augmentation, where it partitions the LLM's KV cache into contiguous pages and retrieves the most crucial ones for efficient computation. This approach enhances robustness to retrieval inaccuracy, facilitates effective utilization of fragmented contexts, and saves the cost from repeated computation. Building on this framework, we further develop a specialized retriever for precise retrieval of critical pages and conduct unsupervised post-training to optimize the model's ability to leverage retrieved information. 
We conduct comprehensive evaluations with a variety of benchmarks, including LongBench, InfiniteBench, and RULER, where RetroLM significantly outperforms existing long-context LLMs and efficient long-context processing methods, particularly in tasks requiring intensive reasoning or extremely long-context comprehension. </p> </div> </dd> <dt> <a name='item115'>[115]</a> <a href ="/abs/2502.11451" title="Abstract" id="2502.11451"> arXiv:2502.11451 </a> [<a href="/pdf/2502.11451" title="Download PDF" id="pdf-2502.11451" aria-labelledby="pdf-2502.11451">pdf</a>, <a href="https://arxiv.org/html/2502.11451v1" title="View HTML" id="html-2502.11451" aria-labelledby="html-2502.11451" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11451" title="Other formats" id="oth-2502.11451" aria-labelledby="oth-2502.11451">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Personas to Talks: Revisiting the Impact of Personas on LLM-Synthesized Emotional Support Conversations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shenghan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+Y">Yang Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yimo Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+W">Wynne Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+M+L">Mong Li Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid advancement of Large Language Models (LLMs) has revolutionized the generation of emotional support conversations (ESC), offering scalable solutions with reduced costs and enhanced data privacy. This paper explores the role of personas in the creation of ESC by LLMs. 
Our research utilizes established psychological frameworks to measure and infuse persona traits into LLMs, which then generate dialogues in the emotional support scenario. We conduct extensive evaluations to understand the stability of persona traits in dialogues, examining shifts in traits post-generation and their impact on dialogue quality and strategy distribution. Experimental results reveal several notable findings: 1) LLMs can infer core persona traits, 2) subtle shifts in emotionality and extraversion occur, influencing the dialogue dynamics, and 3) the application of persona traits modifies the distribution of emotional support strategies, enhancing the relevance and empathetic quality of the responses. These findings highlight the potential of persona-driven LLMs in crafting more personalized, empathetic, and effective emotional support dialogues, which has significant implications for the future design of AI-driven emotional support systems. </p> </div> </dd> <dt> <a name='item116'>[116]</a> <a href ="/abs/2502.11454" title="Abstract" id="2502.11454"> arXiv:2502.11454 </a> [<a href="/pdf/2502.11454" title="Download PDF" id="pdf-2502.11454" aria-labelledby="pdf-2502.11454">pdf</a>, <a href="https://arxiv.org/html/2502.11454v1" title="View HTML" id="html-2502.11454" aria-labelledby="html-2502.11454" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11454" title="Other formats" id="oth-2502.11454" aria-labelledby="oth-2502.11454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniCBE: An Uniformity-driven Comparing Based Evaluation Framework with Unified Multi-Objective Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+P">Peiwen Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shaoxiong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiwei Li</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinglin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yueqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Jiayi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+C">Chuyi Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+B">Boyuan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kan Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 spotlight </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Human preference plays a significant role in measuring large language models and guiding them to align with human values. Unfortunately, current comparing-based evaluation (CBE) methods typically focus on a single optimization objective, failing to effectively utilize scarce yet valuable preference signals. To address this, we delve into key factors that can enhance the accuracy, convergence, and scalability of CBE: suppressing sampling bias, balancing descending process of uncertainty, and mitigating updating uncertainty. Following the derived guidelines, we propose UniCBE, a unified uniformity-driven CBE framework which simultaneously optimize these core objectives by constructing and integrating three decoupled sampling probability matrices, each designed to ensure uniformity in specific aspects. We further ablate the optimal tuple sampling and preference aggregation strategies to achieve efficient CBE. On the AlpacaEval benchmark, UniCBE saves over 17% of evaluation budgets while achieving a Pearson correlation with ground truth exceeding 0.995, demonstrating excellent accuracy and convergence. 
In scenarios where new models are continuously introduced, UniCBE can even save over 50% of evaluation costs, highlighting its improved scalability. </p> </div> </dd> <dt> <a name='item117'>[117]</a> <a href ="/abs/2502.11457" title="Abstract" id="2502.11457"> arXiv:2502.11457 </a> [<a href="/pdf/2502.11457" title="Download PDF" id="pdf-2502.11457" aria-labelledby="pdf-2502.11457">pdf</a>, <a href="https://arxiv.org/html/2502.11457v1" title="View HTML" id="html-2502.11457" aria-labelledby="html-2502.11457" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11457" title="Other formats" id="oth-2502.11457" aria-labelledby="oth-2502.11457">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aligning Sentence Simplification with ESL Learner's Proficiency for Language Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guanlin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arase,+Y">Yuki Arase</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Crespi,+N">Noel Crespi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025 main </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Text simplification is crucial for improving accessibility and comprehension for English as a Second Language (ESL) learners. This study goes a step further and aims to facilitate ESL learners' language acquisition by simplification. Specifically, we propose simplifying complex sentences to appropriate levels for learners while also increasing vocabulary coverage of the target level in the simplifications. We achieve this without a parallel corpus by conducting reinforcement learning on a large language model. 
Our method employs token-level and sentence-level rewards, and iteratively trains the model on its self-generated outputs to guide the model to search for simplification hypotheses that satisfy the target attributes. Experiment results on CEFR-SP and TurkCorpus datasets show that the proposed method can effectively increase the frequency and diversity of vocabulary of the target level by more than $20\%$ compared to baseline models, while maintaining high simplification quality. </p> </div> </dd> <dt> <a name='item118'>[118]</a> <a href ="/abs/2502.11460" title="Abstract" id="2502.11460"> arXiv:2502.11460 </a> [<a href="/pdf/2502.11460" title="Download PDF" id="pdf-2502.11460" aria-labelledby="pdf-2502.11460">pdf</a>, <a href="/format/2502.11460" title="Other formats" id="oth-2502.11460" aria-labelledby="oth-2502.11460">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UnitCoder: Scalable Iterative Code Synthesis with Unit Test Guidance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yichuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yunfan Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peiji Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Demin Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Q">Qipeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kai Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Software Engineering (cs.SE) 
</div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated remarkable capabilities in various tasks, yet code generation remains a major challenge. Current approaches for obtaining high-quality code data primarily focus on (i) collecting large-scale pre-training data and (ii) synthesizing instruction data through prompt engineering with powerful models. While pre-training data faces quality consistency issues, instruction-based synthesis suffers from limited instruction diversity and inherent biases of LLMs. To address this gap, we introduce UnitCoder, a systematic pipeline leveraging model-generated unit tests to both guide and validate the code generation process. Combined with large-scale package-based retrieval from pre-training corpus, we generate a dataset of 500K+ verifiable programs containing diverse API calls. Evaluations on multiple Python benchmarks (BigCodeBench, HumanEval, MBPP) demonstrate that models fine-tuned on our synthetic data exhibit consistent performance improvements. Notably, Llama3.1-8B and InternLM2.5-7B improve from 31\% and 28\% to 40\% and 39\% success rates on BigCodeBench, respectively. Our work presents a scalable approach that leverages model-generated unit tests to guide the synthesis of high-quality code data from pre-training corpora, demonstrating the potential for producing diverse and high-quality post-training data at scale. All code and data will be released (<a href="https://github.com" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item119'>[119]</a> <a href ="/abs/2502.11469" title="Abstract" id="2502.11469"> arXiv:2502.11469 </a> [<a href="/pdf/2502.11469" title="Download PDF" id="pdf-2502.11469" aria-labelledby="pdf-2502.11469">pdf</a>, <a href="https://arxiv.org/html/2502.11469v1" title="View HTML" id="html-2502.11469" aria-labelledby="html-2502.11469" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11469" title="Other formats" id="oth-2502.11469" aria-labelledby="oth-2502.11469">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> If Attention Serves as a Cognitive Model of Human Memory Retrieval, What is the Plausible Memory Representation? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshida,+R">Ryo Yoshida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Isono,+S">Shinnosuke Isono</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kajikawa,+K">Kohei Kajikawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Someya,+T">Taiga Someya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sugimito,+Y">Yushi Sugimito</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseki,+Y">Yohei Oseki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent work in computational psycholinguistics has revealed intriguing parallels between attention mechanisms and human memory retrieval, focusing primarily on Transformer architectures that operate on token-level representations. 
However, computational psycholinguistic research has also established that syntactic structures provide compelling explanations for human sentence processing that word-level factors alone cannot fully account for. In this study, we investigate whether the attention mechanism of Transformer Grammar (TG), which uniquely operates on syntactic structures as representational units, can serve as a cognitive model of human memory retrieval, using Normalized Attention Entropy (NAE) as a linking hypothesis between model behavior and human processing difficulty. Our experiments demonstrate that TG's attention achieves superior predictive power for self-paced reading times compared to vanilla Transformer's, with further analyses revealing independent contributions from both models. These findings suggest that human sentence processing involves dual memory representations -- one based on syntactic structures and another on token sequences -- with attention serving as the general retrieval algorithm, while highlighting the importance of incorporating syntactic structures as representational units. 
</p> </div> </dd> <dt> <a name='item120'>[120]</a> <a href ="/abs/2502.11471" title="Abstract" id="2502.11471"> arXiv:2502.11471 </a> [<a href="/pdf/2502.11471" title="Download PDF" id="pdf-2502.11471" aria-labelledby="pdf-2502.11471">pdf</a>, <a href="https://arxiv.org/html/2502.11471v1" title="View HTML" id="html-2502.11471" aria-labelledby="html-2502.11471" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11471" title="Other formats" id="oth-2502.11471" aria-labelledby="oth-2502.11471">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GLTW: Joint Improved Graph Transformer and LLM via Three-Word Language for Knowledge Graph Completion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yuzhuo Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Cheng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Si,+S">Shuzheng Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yingli Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhitong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+C">Cunliang Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenhao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yufei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+X">Xuantang Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+L">Lei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Knowledge Graph Completion (KGC), which aims to infer missing or incomplete facts, is a crucial task for KGs. However, integrating the vital structural information of KGs into Large Language Models (LLMs) and outputting predictions deterministically remains challenging. To address this, we propose a new method called GLTW, which encodes the structural information of KGs and merges it with LLMs to enhance KGC performance. Specifically, we introduce an improved Graph Transformer (iGT) that effectively encodes subgraphs with both local and global structural information and inherits the characteristics of the language model, bypassing training from scratch. Also, we develop a subgraph-based multi-classification training objective, using all entities within KG as classification objects, to boost learning efficiency. Importantly, we combine iGT with an LLM that takes KG language prompts as input. Our extensive experiments on various KG datasets show that GLTW achieves significant performance gains compared to SOTA baselines. 
</p> </div> </dd> <dt> <a name='item121'>[121]</a> <a href ="/abs/2502.11476" title="Abstract" id="2502.11476"> arXiv:2502.11476 </a> [<a href="/pdf/2502.11476" title="Download PDF" id="pdf-2502.11476" aria-labelledby="pdf-2502.11476">pdf</a>, <a href="https://arxiv.org/html/2502.11476v1" title="View HTML" id="html-2502.11476" aria-labelledby="html-2502.11476" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11476" title="Other formats" id="oth-2502.11476" aria-labelledby="oth-2502.11476">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FastMCTS: A Simple Sampling Strategy for Data Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peiji Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+K">Kai Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yunfan Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yichuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xiaoqing Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Q">Qipeng Guo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Synthetic high-quality multi-step reasoning data can significantly enhance the performance of large language models on various tasks. However, most existing methods rely on rejection sampling, which generates trajectories independently and suffers from inefficiency and imbalanced sampling across problems of varying difficulty. 
In this work, we introduce FastMCTS, an innovative data synthesis strategy inspired by Monte Carlo Tree Search. FastMCTS provides a more efficient sampling method for multi-step reasoning data, offering step-level evaluation signals and promoting balanced sampling across problems of different difficulty levels. Experiments on both English and Chinese reasoning datasets demonstrate that FastMCTS generates over 30\% more correct reasoning paths compared to rejection sampling as the number of generated tokens scales up. Furthermore, under comparable synthetic data budgets, models trained on FastMCTS-generated data outperform those trained on rejection sampling data by 3.9\% across multiple benchmarks. As a lightweight sampling strategy, FastMCTS offers a practical and efficient alternative for synthesizing high-quality reasoning data. Our code will be released soon. </p> </div> </dd> <dt> <a name='item122'>[122]</a> <a href ="/abs/2502.11491" title="Abstract" id="2502.11491"> arXiv:2502.11491 </a> [<a href="/pdf/2502.11491" title="Download PDF" id="pdf-2502.11491" aria-labelledby="pdf-2502.11491">pdf</a>, <a href="https://arxiv.org/html/2502.11491v1" title="View HTML" id="html-2502.11491" aria-labelledby="html-2502.11491" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11491" title="Other formats" id="oth-2502.11491" aria-labelledby="oth-2502.11491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ontology-Guided Reverse Thinking Makes Large Language Models Stronger on Knowledge Graph Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+R">Runxuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+B">Bei Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiaqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Baoxin Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Ming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Dayong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shijin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable capabilities in natural language processing. However, in knowledge graph question answering tasks (KGQA), there remains the issue of answering questions that require multi-hop reasoning. Existing methods rely on entity vector matching, but the purpose of the question is abstract and difficult to match with specific entities. As a result, it is difficult to establish reasoning paths to the purpose, which leads to information loss and redundancy. To address this issue, inspired by human reverse thinking, we propose Ontology-Guided Reverse Thinking (ORT), a novel framework that constructs reasoning paths from purposes back to conditions. ORT operates in three key phases: (1) using LLM to extract purpose labels and condition labels, (2) constructing label reasoning paths based on the KG ontology, and (3) using the label reasoning paths to guide knowledge retrieval. Experiments on the WebQSP and CWQ datasets show that ORT achieves state-of-the-art performance and significantly enhances the capability of LLMs for KGQA. 
</p> </div> </dd> <dt> <a name='item123'>[123]</a> <a href ="/abs/2502.11493" title="Abstract" id="2502.11493"> arXiv:2502.11493 </a> [<a href="/pdf/2502.11493" title="Download PDF" id="pdf-2502.11493" aria-labelledby="pdf-2502.11493">pdf</a>, <a href="https://arxiv.org/html/2502.11493v1" title="View HTML" id="html-2502.11493" aria-labelledby="html-2502.11493" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11493" title="Other formats" id="oth-2502.11493" aria-labelledby="oth-2502.11493">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DAST: Context-Aware Compression in LLMs via Dynamic Allocation of Soft Tokens </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shaoshen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangning Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+X">Xin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+Z">Zifei Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-tao Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) face computational inefficiencies and redundant processing when handling long context inputs, prompting a focus on compression techniques. While existing semantic vector-based compression methods achieve promising performance, these methods fail to account for the intrinsic information density variations between context chunks, instead allocating soft tokens uniformly across context chunks. 
This uniform distribution inevitably diminishes allocation to information-critical regions. To address this, we propose Dynamic Allocation of Soft Tokens (DAST), a simple yet effective method that leverages the LLM's intrinsic understanding of contextual relevance to guide compression. DAST combines perplexity-based local information with attention-driven global information to dynamically allocate soft tokens to the informative-rich chunks, enabling effective, context-aware compression. Experimental results across multiple benchmarks demonstrate that DAST surpasses state-of-the-art methods. </p> </div> </dd> <dt> <a name='item124'>[124]</a> <a href ="/abs/2502.11494" title="Abstract" id="2502.11494"> arXiv:2502.11494 </a> [<a href="/pdf/2502.11494" title="Download PDF" id="pdf-2502.11494" aria-labelledby="pdf-2502.11494">pdf</a>, <a href="https://arxiv.org/html/2502.11494v1" title="View HTML" id="html-2502.11494" aria-labelledby="html-2502.11494" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11494" title="Other formats" id="oth-2502.11494" aria-labelledby="oth-2502.11494">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stop Looking for Important Tokens in Multimodal Language Models: Duplication Matters More </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zichen Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yifeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shaobo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Junyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qintong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Weijia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Conghui He</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linfeng Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision tokens in multimodal large language models often dominate huge computational overhead due to their excessive length compared to linguistic modality. Abundant recent methods aim to solve this problem with token pruning, which first defines an importance criterion for tokens and then prunes the unimportant vision tokens during inference. However, in this paper, we show that the importance is not an ideal indicator to decide whether a token should be pruned. Surprisingly, it usually results in inferior performance than random token pruning and leads to incompatibility with efficient attention computation operators. Instead, we propose DART (Duplication-Aware Reduction of Tokens), which prunes tokens based on their duplication with other tokens, leading to significant and training-free acceleration. Concretely, DART selects a small subset of pivot tokens and then retains the tokens with low duplication to the pivots, ensuring minimal information loss during token pruning. Experiments demonstrate that DART can prune 88.9% vision tokens while maintaining comparable performance, leading to a 1.99$\times$ and 2.99$\times$ speed-up in total time and prefilling stage, respectively, with good compatibility to efficient attention operators. Our codes are available at <a href="https://github.com/ZichenWen1/DART" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item125'>[125]</a> <a href ="/abs/2502.11495" title="Abstract" id="2502.11495"> arXiv:2502.11495 </a> [<a href="/pdf/2502.11495" title="Download PDF" id="pdf-2502.11495" aria-labelledby="pdf-2502.11495">pdf</a>, <a href="https://arxiv.org/html/2502.11495v1" title="View HTML" id="html-2502.11495" aria-labelledby="html-2502.11495" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11495" title="Other formats" id="oth-2502.11495" aria-labelledby="oth-2502.11495">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Balanced Multi-Factor In-Context Learning for Multilingual Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baldwin,+T">Timothy Baldwin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual large language models (MLLMs) are able to leverage in-context learning (ICL) to achieve high performance by leveraging cross-lingual knowledge transfer without parameter updates. However, their effectiveness is highly sensitive to example selection, particularly in multilingual settings. Based on the findings of existing work, three key factors influence multilingual ICL: (1) semantic similarity, (2) linguistic alignment, and (3) language-specific performance. However, existing approaches address these factors independently, without explicitly disentangling their combined impact, leaving optimal example selection underexplored. 
To address this gap, we propose balanced multi-factor ICL (\textbf{BMF-ICL}), a method that quantifies and optimally balances these factors for improved example selection. Experiments on mCSQA and TYDI across four MLLMs demonstrate that BMF-ICL outperforms existing methods. Further analysis highlights the importance of incorporating all three factors and the importance of selecting examples from multiple languages. </p> </div> </dd> <dt> <a name='item126'>[126]</a> <a href ="/abs/2502.11501" title="Abstract" id="2502.11501"> arXiv:2502.11501 </a> [<a href="/pdf/2502.11501" title="Download PDF" id="pdf-2502.11501" aria-labelledby="pdf-2502.11501">pdf</a>, <a href="https://arxiv.org/html/2502.11501v1" title="View HTML" id="html-2502.11501" aria-labelledby="html-2502.11501" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11501" title="Other formats" id="oth-2502.11501" aria-labelledby="oth-2502.11501">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Token Pruning in Multimodal Large Language Models: Are We Solving the Right Problem? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zichen Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yifeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Weijia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Conghui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linfeng Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Multimodal large language models (MLLMs) have shown remarkable performance for cross-modal understanding and generation, yet still suffer from severe inference costs. Recently, abundant works have been proposed to solve this problem with token pruning, which identifies the redundant tokens in MLLMs and then prunes them to reduce the computation and KV storage costs, leading to significant acceleration without training. While these methods claim efficiency gains, critical questions about their fundamental design and evaluation remain unanswered: Why do many existing approaches underperform even compared to naive random token selection? Are attention-based scoring sufficient for reliably identifying redundant tokens? Is language information really helpful during token pruning? What makes a good trade-off between token importance and duplication? Are current evaluation protocols comprehensive and unbiased? The ignorance of previous research on these problems hinders the long-term development of token pruning. In this paper, we answer these questions one by one, providing insights into the design of future token pruning methods. 
</p> </div> </dd> <dt> <a name='item127'>[127]</a> <a href ="/abs/2502.11508" title="Abstract" id="2502.11508"> arXiv:2502.11508 </a> [<a href="/pdf/2502.11508" title="Download PDF" id="pdf-2502.11508" aria-labelledby="pdf-2502.11508">pdf</a>, <a href="https://arxiv.org/html/2502.11508v1" title="View HTML" id="html-2502.11508" aria-labelledby="html-2502.11508" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11508" title="Other formats" id="oth-2502.11508" aria-labelledby="oth-2502.11508">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Chinese Spelling Correction: A Comprehensive Survey of Progress, Challenges, and Opportunities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Changchun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Junzhe Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+Z">Zixiao Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Chinese Spelling Correction (CSC) is a critical task in natural language processing, aimed at detecting and correcting spelling errors in Chinese text. This survey provides a comprehensive overview of CSC, tracing its evolution from pre-trained language models to large language models, and critically analyzing their respective strengths and weaknesses in this domain. Moreover, we further present a detailed examination of existing benchmark datasets, highlighting their inherent challenges and limitations. 
Finally, we propose promising future research directions, particularly focusing on leveraging the potential of LLMs and their reasoning capabilities for improved CSC performance. To the best of our knowledge, this is the first comprehensive survey dedicated to the field of CSC. We believe this work will serve as a valuable resource for researchers, fostering a deeper understanding of the field and inspiring future advancements. </p> </div> </dd> <dt> <a name='item128'>[128]</a> <a href ="/abs/2502.11514" title="Abstract" id="2502.11514"> arXiv:2502.11514 </a> [<a href="/pdf/2502.11514" title="Download PDF" id="pdf-2502.11514" aria-labelledby="pdf-2502.11514">pdf</a>, <a href="https://arxiv.org/html/2502.11514v1" title="View HTML" id="html-2502.11514" aria-labelledby="html-2502.11514" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11514" title="Other formats" id="oth-2502.11514" aria-labelledby="oth-2502.11514">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Investigating Inference-time Scaling for Chain of Multi-modal Thought: A Preliminary Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yujie Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+A">Ante Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Moye Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jingyao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinsong Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xinyan Xiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, inference-time scaling of 
chain-of-thought (CoT) has been demonstrated as a promising approach for addressing multi-modal reasoning tasks. While existing studies have predominantly centered on text-based thinking, the integration of both visual and textual modalities within the reasoning process remains unexplored. In this study, we pioneer the exploration of inference-time scaling with multi-modal thought, aiming to bridge this gap. To provide a comprehensive analysis, we systematically investigate popular sampling-based and tree search-based inference-time scaling methods on 10 challenging tasks spanning various domains. Besides, we uniformly adopt a consistency-enhanced verifier to ensure effective guidance for both methods across different thought paradigms. Results show that multi-modal thought promotes better performance against conventional text-only thought, and blending the two types of thought fosters more diverse thinking. Despite these advantages, multi-modal thoughts necessitate higher token consumption for processing richer visual inputs, which raises concerns in practical applications. We hope that our findings on the merits and drawbacks of this research line will inspire future works in the field. 
</p> </div> </dd> <dt> <a name='item129'>[129]</a> <a href ="/abs/2502.11517" title="Abstract" id="2502.11517"> arXiv:2502.11517 </a> [<a href="/pdf/2502.11517" title="Download PDF" id="pdf-2502.11517" aria-labelledby="pdf-2502.11517">pdf</a>, <a href="https://arxiv.org/html/2502.11517v1" title="View HTML" id="html-2502.11517" aria-labelledby="html-2502.11517" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11517" title="Other formats" id="oth-2502.11517" aria-labelledby="oth-2502.11517">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Keep a Promise: Scaling Language Model Decoding Parallelism with Learned Asynchronous Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+T">Tian Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+E+Y">Ellie Y. Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ankner,+Z">Zack Ankner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saunshi,+N">Nikunj Saunshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elias,+B+M">Blake M. 
Elias</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yazdanbakhsh,+A">Amir Yazdanbakhsh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ragan-Kelley,+J">Jonathan Ragan-Kelley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Subramanian,+S">Suvinay Subramanian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carbin,+M">Michael Carbin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Distributed, Parallel, and Cluster Computing (cs.DC); Machine Learning (cs.LG) </div> <p class='mathjax'> Decoding with autoregressive large language models (LLMs) traditionally occurs sequentially, generating one token after another. An emerging line of work explored parallel decoding by identifying and simultaneously generating semantically independent chunks of LLM responses. However, these techniques rely on hand-crafted heuristics tied to syntactic structures like lists and paragraphs, making them rigid and imprecise. We present PASTA, a learning-based system that teaches LLMs to identify semantic independence and express parallel decoding opportunities in their own responses. At its core are PASTA-LANG and its interpreter: PASTA-LANG is an annotation language that enables LLMs to express semantic independence in their own responses; the language interpreter acts on these annotations to orchestrate parallel decoding on-the-fly at inference time. Through a two-stage finetuning process, we train LLMs to generate PASTA-LANG annotations that optimize both response quality and decoding speed. 
Evaluation on AlpacaEval, an instruction following benchmark, shows that our approach Pareto-dominates existing methods in terms of decoding speed and response quality; our results demonstrate geometric mean speedups ranging from 1.21x to 1.93x with corresponding quality changes of +2.2% to -7.1%, measured by length-controlled win rates against sequential decoding baseline. </p> </div> </dd> <dt> <a name='item130'>[130]</a> <a href ="/abs/2502.11520" title="Abstract" id="2502.11520"> arXiv:2502.11520 </a> [<a href="/pdf/2502.11520" title="Download PDF" id="pdf-2502.11520" aria-labelledby="pdf-2502.11520">pdf</a>, <a href="/format/2502.11520" title="Other formats" id="oth-2502.11520" aria-labelledby="oth-2502.11520">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AURORA:Automated Training Framework of Universal Process Reward Models via Ensemble Prompting and Reverse Verification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+X">Xiaoyu Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+T">Tianchu Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Minghao Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+D">Dakuan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haozhe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xihe Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+W">Wei Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yinghui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Y">Yuan Qi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under 
Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The reasoning capabilities of advanced large language models (LLMs) like o1 have revolutionized artificial intelligence applications. Nevertheless, evaluating and optimizing complex reasoning processes remain significant challenges due to diverse policy distributions and the inherent limitations of human effort and accuracy. In this paper, we present AURORA, a novel automated framework for training universal process reward models (PRMs) using ensemble prompting and reverse verification. The framework employs a two-phase approach: First, it uses diverse prompting strategies and ensemble methods to perform automated annotation and evaluation of processes, ensuring robust assessments for reward learning. Second, it leverages practical reference answers for reverse verification, enhancing the model's ability to validate outputs and improving training accuracy. To assess the framework's performance, we extend beyond the existing ProcessBench benchmark by introducing UniversalBench, which evaluates reward predictions across full trajectories under diverse policy distributions with long Chain-of-Thought (CoT) outputs. Experimental results demonstrate that AURORA enhances process evaluation accuracy, improves PRMs' accuracy for diverse policy distributions and long-CoT responses. The project will be open-sourced at <a href="https://auroraprm.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. The Universal-PRM-7B is available at <a href="https://huggingface.co/infly/Universal-PRM-7B" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item131'>[131]</a> <a href ="/abs/2502.11525" title="Abstract" id="2502.11525"> arXiv:2502.11525 </a> [<a href="/pdf/2502.11525" title="Download PDF" id="pdf-2502.11525" aria-labelledby="pdf-2502.11525">pdf</a>, <a href="https://arxiv.org/html/2502.11525v1" title="View HTML" id="html-2502.11525" aria-labelledby="html-2502.11525" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11525" title="Other formats" id="oth-2502.11525" aria-labelledby="oth-2502.11525">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Training Large Language Models to be Better Rule Followers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yi Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+S">Shijia Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Haotong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haotian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Muhan Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown impressive performance across a wide range of tasks. However, they often exhibit unexpected failures in seemingly straightforward tasks, suggesting a reliance on case-based reasoning rather than rule-based reasoning. While the vast training corpus of LLMs contains numerous textual "rules", current training methods fail to leverage these rules effectively. Crucially, the relationships between these "rules" and their corresponding "instances" are not explicitly modeled. As a result, while LLMs can often recall rules with ease, they fail to apply these rules strictly and consistently in relevant reasoning scenarios. 
In this paper, we investigate the rule-following capabilities of LLMs and propose Meta Rule-Following Fine-Tuning (Meta-RFFT) to enhance the cross-task transferability of rule-following abilities. We first construct a dataset of 88 tasks requiring following rules, encompassing diverse reasoning domains. We demonstrate through extensive experiments that models trained on large-scale rule-following tasks are better rule followers, outperforming the baselines in both downstream fine-tuning and few-shot prompting scenarios. This highlights the cross-task transferability of models with the aid of Meta-RFFT. Furthermore, we examine the influence of factors such as dataset size, rule formulation, and in-context learning. </p> </div> </dd> <dt> <a name='item132'>[132]</a> <a href ="/abs/2502.11533" title="Abstract" id="2502.11533"> arXiv:2502.11533 </a> [<a href="/pdf/2502.11533" title="Download PDF" id="pdf-2502.11533" aria-labelledby="pdf-2502.11533">pdf</a>, <a href="https://arxiv.org/html/2502.11533v1" title="View HTML" id="html-2502.11533" aria-labelledby="html-2502.11533" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11533" title="Other formats" id="oth-2502.11533" aria-labelledby="oth-2502.11533">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Be Cautious When Merging Unfamiliar LLMs: A Phishing Model Capable of Stealing Privacy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhenyuan Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+W">Wenlong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+C">Chen Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+C">Chengkun Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wenzhi Chen</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Model merging is a widespread technology in large language models (LLMs) that integrates multiple task-specific LLMs into a unified one, enabling the merged model to inherit the specialized capabilities of these LLMs. Most task-specific LLMs are sourced from open-source communities and have not undergone rigorous auditing, potentially imposing risks in model merging. This paper highlights an overlooked privacy risk: \textit{an unsafe model could compromise the privacy of other LLMs involved in the model merging.} Specifically, we propose PhiMM, a privacy attack approach that trains a phishing model capable of stealing privacy using a crafted privacy phishing instruction dataset. Furthermore, we introduce a novel model cloaking method that mimics a specialized capability to conceal attack intent, luring users into merging the phishing model. Once victims merge the phishing model, the attacker can extract personally identifiable information (PII) or infer membership information (MI) by querying the merged model with the phishing instruction. Experimental results show that merging a phishing model increases the risk of privacy breaches. Compared to the results before merging, PII leakage increased by 3.9\% and MI leakage increased by 17.4\% on average. We release the code of PhiMM through a link. 
</p> </div> </dd> <dt> <a name='item133'>[133]</a> <a href ="/abs/2502.11541" title="Abstract" id="2502.11541"> arXiv:2502.11541 </a> [<a href="/pdf/2502.11541" title="Download PDF" id="pdf-2502.11541" aria-labelledby="pdf-2502.11541">pdf</a>, <a href="https://arxiv.org/html/2502.11541v1" title="View HTML" id="html-2502.11541" aria-labelledby="html-2502.11541" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11541" title="Other formats" id="oth-2502.11541" aria-labelledby="oth-2502.11541">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MuSC: Improving Complex Instruction Following with Multi-granularity Self-Contrastive Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Hui Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yancheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shilong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Bing Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Conghui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Complex instruction-following with elaborate constraints is imperative for Large Language Models (LLMs). While existing methods have constructed data for complex instruction alignment, they all rely on a more advanced model, especially GPT-4, limiting their application. 
In this paper, we propose a Multi-granularity Self-Contrastive Training (MuSC) framework, to improve the complex instruction alignment without relying on a stronger model. Our method is conducted on both coarse and fine granularity. On coarse-granularity, we construct constraint-aware preference data based on instruction decomposition and recombination. On fine-granularity, we perform token-aware preference optimization with dynamic token-level supervision. Our method is evaluated on open-sourced models, and experiment results show our method achieves significant improvement on both complex and general instruction-following benchmarks, surpassing previous self-alignment methods. </p> </div> </dd> <dt> <a name='item134'>[134]</a> <a href ="/abs/2502.11544" title="Abstract" id="2502.11544"> arXiv:2502.11544 </a> [<a href="/pdf/2502.11544" title="Download PDF" id="pdf-2502.11544" aria-labelledby="pdf-2502.11544">pdf</a>, <a href="https://arxiv.org/html/2502.11544v1" title="View HTML" id="html-2502.11544" aria-labelledby="html-2502.11544" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11544" title="Other formats" id="oth-2502.11544" aria-labelledby="oth-2502.11544">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating o1-Like LLMs: Unlocking Reasoning for Translation through Comprehensive Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+A">Andong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuchen Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+W">Wenxin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=zhang,+M">Min zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The o1-Like LLMs are transforming AI by simulating human cognitive processes, but their performance in multilingual machine translation (MMT) remains underexplored. This study examines: (1) how o1-Like LLMs perform in MMT tasks and (2) what factors influence their translation quality. We evaluate multiple o1-Like LLMs and compare them with traditional models like ChatGPT and GPT-4o. Results show that o1-Like LLMs establish new multilingual translation benchmarks, with DeepSeek-R1 surpassing GPT-4o in contextless tasks. They demonstrate strengths in historical and cultural translation but exhibit a tendency for rambling issues in Chinese-centric outputs. Further analysis reveals three key insights: (1) High inference costs and slower processing speeds make complex translation tasks more resource-intensive. (2) Translation quality improves with model size, enhancing commonsense reasoning and cultural translation. (3) The temperature parameter significantly impacts output quality-lower temperatures yield more stable and accurate translations, while higher temperatures reduce coherence and precision. 
</p> </div> </dd> <dt> <a name='item135'>[135]</a> <a href ="/abs/2502.11546" title="Abstract" id="2502.11546"> arXiv:2502.11546 </a> [<a href="/pdf/2502.11546" title="Download PDF" id="pdf-2502.11546" aria-labelledby="pdf-2502.11546">pdf</a>, <a href="https://arxiv.org/html/2502.11546v1" title="View HTML" id="html-2502.11546" aria-labelledby="html-2502.11546" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11546" title="Other formats" id="oth-2502.11546" aria-labelledby="oth-2502.11546">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DCAD-2000: A Multilingual Dataset across 2000+ Languages with Data Cleaning as Anomaly Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yingli Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+W">Wen Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xueren Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fraser,+A">Alexander Fraser</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid development of multilingual large language models (LLMs) highlights the need for high-quality, diverse, and clean multilingual datasets. In this paper, we introduce DCAD-2000 (Data Cleaning as Anomaly Detection), a large-scale multilingual corpus built using newly extracted Common Crawl data and existing multilingual datasets. 
DCAD-2000 includes over 2,282 languages, 46.72TB of data, and 8.63 billion documents, spanning 155 high- and medium-resource languages and 159 writing scripts. To overcome the limitations of current data cleaning methods, which rely on manual heuristic thresholds, we propose reframing data cleaning as an anomaly detection task. This dynamic filtering approach significantly enhances data quality by identifying and removing noisy or anomalous content. We evaluate the quality of DCAD-2000 on the FineTask benchmark, demonstrating substantial improvements in multilingual dataset quality and task performance. </p> </div> </dd> <dt> <a name='item136'>[136]</a> <a href ="/abs/2502.11559" title="Abstract" id="2502.11559"> arXiv:2502.11559 </a> [<a href="/pdf/2502.11559" title="Download PDF" id="pdf-2502.11559" aria-labelledby="pdf-2502.11559">pdf</a>, <a href="https://arxiv.org/html/2502.11559v1" title="View HTML" id="html-2502.11559" aria-labelledby="html-2502.11559" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11559" title="Other formats" id="oth-2502.11559" aria-labelledby="oth-2502.11559">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Auto-Search and Refinement: An Automated Framework for Gender Bias Mitigation in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yue Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+C">Chengyan Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+L">Li Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Sibei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p 
class='mathjax'> Pre-training large language models (LLMs) on vast text corpora enhances natural language processing capabilities but risks encoding social biases, particularly gender bias. While parameter-modification methods like fine-tuning mitigate bias, they are resource-intensive, unsuitable for closed-source models, and lack adaptability to evolving societal norms. Instruction-based approaches offer flexibility but often compromise task performance. To address these limitations, we propose $\textit{FaIRMaker}$, an automated and model-independent framework that employs an $\textbf{auto-search and refinement}$ paradigm to adaptively generate Fairwords, which act as instructions integrated into input queries to reduce gender bias and enhance response quality. Extensive experiments demonstrate that $\textit{FaIRMaker}$ automatically searches for and dynamically refines Fairwords, effectively mitigating gender bias while preserving task integrity and ensuring compatibility with both API-based and open-source LLMs. 
</p> </div> </dd> <dt> <a name='item137'>[137]</a> <a href ="/abs/2502.11562" title="Abstract" id="2502.11562"> arXiv:2502.11562 </a> [<a href="/pdf/2502.11562" title="Download PDF" id="pdf-2502.11562" aria-labelledby="pdf-2502.11562">pdf</a>, <a href="https://arxiv.org/html/2502.11562v1" title="View HTML" id="html-2502.11562" aria-labelledby="html-2502.11562" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11562" title="Other formats" id="oth-2502.11562" aria-labelledby="oth-2502.11562">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reinforced Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chaofan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jianlyv Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+D">Defu Lian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yingxia Shao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While retrieval techniques are widely used in practice, they still face significant challenges in cross-domain scenarios. Recently, generation-augmented methods have emerged as a promising solution to this problem. These methods enhance raw queries by incorporating additional information from an LLM-based generator, facilitating more direct retrieval of relevant documents. However, existing methods struggle with highly specialized situations that require extensive domain expertise. To address this problem, we present \textbf{Reinforced-IR}, a novel approach that jointly adapts a pre-trained retriever and generator for precise cross-domain retrieval. 
A key innovation of Reinforced-IR is its \textbf{Self-Boosting} framework, which enables retriever and generator to learn from each other's feedback. Specifically, the generator is reinforced to generate query augmentations that enhance the retriever's performance, while the retriever is trained to better discriminate the relevant documents identified by the generator. This iterative process allows the end-to-end retrieval performance to be progressively optimized using an unlabeled corpus from the target domain. In our experiment, Reinforced-IR outperforms existing domain adaptation methods by a large margin, leading to substantial improvements in retrieval quality across a wide range of application scenarios. </p> </div> </dd> <dt> <a name='item138'>[138]</a> <a href ="/abs/2502.11569" title="Abstract" id="2502.11569"> arXiv:2502.11569 </a> [<a href="/pdf/2502.11569" title="Download PDF" id="pdf-2502.11569" aria-labelledby="pdf-2502.11569">pdf</a>, <a href="/format/2502.11569" title="Other formats" id="oth-2502.11569" aria-labelledby="oth-2502.11569">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Reasoning Ability of Small Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gaurav">Gaurav Srivastava</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shuxiang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xuan Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Reasoning has long been viewed as an emergent property of large language models (LLMs), appearing at or above a certain scale ($\sim$100B parameters). 
However, recent studies challenge this assumption, showing that small language models (SLMs) can also achieve competitive reasoning performance. SLMs are increasingly favored for their efficiency and deployability. However, there is a lack of systematic study on the reasoning abilities of diverse SLMs, including those trained from scratch or derived from LLMs through quantization, pruning, and distillation. This raises a critical question: Can SLMs achieve reasoning abilities comparable to LLMs? In this work, we systematically survey, benchmark, and analyze 72 SLMs from six model families across 14 reasoning benchmarks. For reliable evaluation, we examine four evaluation methods and compare four LLM judges against human evaluations on 800 data points. We repeat all experiments three times to ensure a robust performance assessment. Additionally, we analyze the impact of different prompting strategies in small models. Beyond accuracy, we also evaluate model robustness under adversarial conditions and intermediate reasoning steps. Our findings challenge the assumption that scaling is the only way to achieve strong reasoning. Instead, we foresee a future where SLMs with strong reasoning capabilities can be developed through structured training or post-training compression. They can serve as efficient alternatives to LLMs for reasoning-intensive tasks. 
</p> </div> </dd> <dt> <a name='item139'>[139]</a> <a href ="/abs/2502.11571" title="Abstract" id="2502.11571"> arXiv:2502.11571 </a> [<a href="/pdf/2502.11571" title="Download PDF" id="pdf-2502.11571" aria-labelledby="pdf-2502.11571">pdf</a>, <a href="https://arxiv.org/html/2502.11571v1" title="View HTML" id="html-2502.11571" aria-labelledby="html-2502.11571" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11571" title="Other formats" id="oth-2502.11571" aria-labelledby="oth-2502.11571">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FaMTEB: Massive Text Embedding Benchmark in Persian Language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zinvandi,+E">Erfan Zinvandi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alikhani,+M">Morteza Alikhani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarmadi,+M">Mehran Sarmadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pourbahman,+Z">Zahra Pourbahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arvin,+S">Sepehr Arvin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kazemi,+R">Reza Kazemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Amini,+A">Arash Amini</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> to appear in ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> In this paper, we introduce a comprehensive benchmark for Persian (Farsi) text embeddings, built upon the Massive Text Embedding Benchmark (MTEB). 
Our benchmark includes 63 datasets spanning seven different tasks: classification, clustering, pair classification, reranking, retrieval, summary retrieval, and semantic textual similarity. The datasets are formed as a combination of existing, translated, and newly generated data, offering a diverse evaluation framework for Persian language models. Given the increasing use of text embedding models in chatbots, evaluation datasets are becoming inseparable ingredients in chatbot challenges and Retrieval-Augmented Generation systems. As a contribution, we include chatbot evaluation datasets in the MTEB benchmark for the first time. In addition, in this paper, we introduce the new task of summary retrieval which is not part of the tasks included in standard MTEB. Another contribution of this paper is the introduction of a substantial number of new Persian language NLP datasets suitable for training and evaluation, some of which have no previous counterparts in Persian. We evaluate the performance of several Persian and multilingual embedding models in a range of tasks. This work introduces an open-source benchmark with datasets, code and a public leaderboard. 
</p> </div> </dd> <dt> <a name='item140'>[140]</a> <a href ="/abs/2502.11573" title="Abstract" id="2502.11573"> arXiv:2502.11573 </a> [<a href="/pdf/2502.11573" title="Download PDF" id="pdf-2502.11573" aria-labelledby="pdf-2502.11573">pdf</a>, <a href="https://arxiv.org/html/2502.11573v1" title="View HTML" id="html-2502.11573" aria-labelledby="html-2502.11573" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11573" title="Other formats" id="oth-2502.11573" aria-labelledby="oth-2502.11573">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InfiR : Crafting Effective Small Language Models and Multimodal Small Language Models in Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+C">Congkai Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+S">Shuo Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Pengxiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sang,+Z">Zhijie Sang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+K">Kejing Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yiming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+G">Guanghao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zeyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+S">Su Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Baoyi He</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Q">Qi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiaotian Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jianbo Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shengyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) have made significant advancements in reasoning capabilities. However, they still face challenges such as high computational demands and privacy concerns. This paper focuses on developing efficient Small Language Models (SLMs) and Multimodal Small Language Models (MSLMs) that retain competitive reasoning abilities. We introduce a novel training pipeline that enhances reasoning capabilities and facilitates deployment on edge devices, achieving state-of-the-art performance while minimizing development costs. InfiR aims to advance AI systems by improving reasoning, reducing adoption barriers, and addressing privacy concerns through smaller model sizes. Resources are available at https://github.com/Reallm-Labs/InfiR. 
</p> </div> </dd> <dt> <a name='item141'>[141]</a> <a href ="/abs/2502.11578" title="Abstract" id="2502.11578"> arXiv:2502.11578 </a> [<a href="/pdf/2502.11578" title="Download PDF" id="pdf-2502.11578" aria-labelledby="pdf-2502.11578">pdf</a>, <a href="https://arxiv.org/html/2502.11578v1" title="View HTML" id="html-2502.11578" aria-labelledby="html-2502.11578" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11578" title="Other formats" id="oth-2502.11578" aria-labelledby="oth-2502.11578">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Complexity Measurement as a Noisy Zero-Shot Proxy for Evaluating LLM Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moell,+B">Birger Moell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boye,+J">Johan Boye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have made significant strides in natural language generation but often face challenges in tasks requiring precise calculations and structural analysis. This paper investigates the performance of state-of-the-art LLMs on language complexity measurement tasks, through the computation of the LIX readability metric and Average Dependency Distance (ADD). Using Swedish high school and university-level essays, we evaluate the models' abilities to compute LIX scores and perform dependency parsing, comparing their results to established ground truths. 
Our findings reveal that while all models demonstrate some capacity for these tasks, ChatGPT-o1-mini performs most consistently, achieving the highest accuracy in both LIX computation and dependency parsing. Additionally, we observe a strong, significant correlation (r = -0.875, p = 0.026, N = 6) between the models' accuracy in computing LIX and their overall performance on the Massive Multitask Language Understanding (MMLU) benchmark. These results suggest that language complexity measurement abilities can serve as noisy zero-shot proxies for assessing the general capabilities of LLMs, providing a practical method for model evaluation without the need for extensive benchmarking datasets. </p> </div> </dd> <dt> <a name='item142'>[142]</a> <a href ="/abs/2502.11598" title="Abstract" id="2502.11598"> arXiv:2502.11598 </a> [<a href="/pdf/2502.11598" title="Download PDF" id="pdf-2502.11598" aria-labelledby="pdf-2502.11598">pdf</a>, <a href="https://arxiv.org/html/2502.11598v1" title="View HTML" id="html-2502.11598" aria-labelledby="html-2502.11598" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11598" title="Other formats" id="oth-2502.11598" aria-labelledby="oth-2502.11598">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLM Watermarks Robustly Prevent Unauthorized Knowledge Distillation? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Leyi Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shiyu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yijian Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+L">Lijie Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P+S">Philip S. Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 12 figures, 13 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The radioactive nature of Large Language Model (LLM) watermarking enables the detection of watermarks inherited by student models when trained on the outputs of watermarked teacher models, making it a promising tool for preventing unauthorized knowledge distillation. However, the robustness of watermark radioactivity against adversarial actors remains largely unexplored. In this paper, we investigate whether student models can acquire the capabilities of teacher models through knowledge distillation while avoiding watermark inheritance. We propose two categories of watermark removal approaches: pre-distillation removal through untargeted and targeted training data paraphrasing (UP and TP), and post-distillation removal through inference-time watermark neutralization (WN). 
Extensive experiments across multiple model pairs, watermarking schemes and hyper-parameter settings demonstrate that both TP and WN thoroughly eliminate inherited watermarks, with WN achieving this while maintaining knowledge transfer efficiency and low computational overhead. Given the ongoing deployment of watermarking techniques in production LLMs, these findings emphasize the urgent need for more robust defense strategies. Our code is available at <a href="https://github.com/THU-BPM/Watermark-Radioactivity-Attack" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item143'>[143]</a> <a href ="/abs/2502.11603" title="Abstract" id="2502.11603"> arXiv:2502.11603 </a> [<a href="/pdf/2502.11603" title="Download PDF" id="pdf-2502.11603" aria-labelledby="pdf-2502.11603">pdf</a>, <a href="https://arxiv.org/html/2502.11603v1" title="View HTML" id="html-2502.11603" aria-labelledby="html-2502.11603" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11603" title="Other formats" id="oth-2502.11603" aria-labelledby="oth-2502.11603">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DR.GAP: Mitigating Bias in Large Language Models using Gender-Aware Prompting with Demonstration and Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+H">Hongye Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yue Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+M">Meikang Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit strong natural language processing 
capabilities but also inherit and amplify societal biases, including gender bias, raising fairness concerns. Existing debiasing methods face significant limitations: parameter tuning requires access to model weights, prompt-based approaches often degrade model utility, and optimization-based techniques lack generalizability. To address these challenges, we propose <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> (Demonstration and Reasoning for Gender-Aware Prompting), an automated and model-agnostic approach that mitigates gender bias while preserving model performance. <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> selects bias-revealing examples and generates structured reasoning to guide models toward more impartial responses. Extensive experiments on coreference resolution and QA tasks across multiple LLMs (GPT-3.5, Llama3, and Llama2-Alpaca) demonstrate its effectiveness, generalization ability, and robustness. <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> can generalize to vision-language models (VLMs), achieving significant bias reduction. 
</p> </div> </dd> <dt> <a name='item144'>[144]</a> <a href ="/abs/2502.11611" title="Abstract" id="2502.11611"> arXiv:2502.11611 </a> [<a href="/pdf/2502.11611" title="Download PDF" id="pdf-2502.11611" aria-labelledby="pdf-2502.11611">pdf</a>, <a href="https://arxiv.org/html/2502.11611v1" title="View HTML" id="html-2502.11611" aria-labelledby="html-2502.11611" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11611" title="Other formats" id="oth-2502.11611" aria-labelledby="oth-2502.11611">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Identifying Gender Stereotypes and Biases in Automated Translation from English to Italian using Similarity Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammadi,+F">Fatemeh Mohammadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tamborini,+M+A">Marta Annamaria Tamborini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ceravolo,+P">Paolo Ceravolo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nardocci,+C">Costanza Nardocci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Maghool,+S">Samira Maghool</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper is a collaborative effort between Linguistics, Law, and Computer Science to evaluate stereotypes and biases in automated translation systems. We advocate gender-neutral translation as a means to promote gender inclusion and improve the objectivity of machine translation. Our approach focuses on identifying gender bias in English-to-Italian translations. First, we define gender bias following human rights law and linguistics literature. 
Then we proceed by identifying gender-specific terms such as she/lei and he/lui as key elements. We then evaluate the cosine similarity between these target terms and others in the dataset to reveal the model's perception of semantic relations. Using numerical features, we effectively evaluate the intensity and direction of the bias. Our findings provide tangible insights for developing and training gender-neutral translation algorithms. </p> </div> </dd> <dt> <a name='item145'>[145]</a> <a href ="/abs/2502.11614" title="Abstract" id="2502.11614"> arXiv:2502.11614 </a> [<a href="/pdf/2502.11614" title="Download PDF" id="pdf-2502.11614" aria-labelledby="pdf-2502.11614">pdf</a>, <a href="https://arxiv.org/html/2502.11614v1" title="View HTML" id="html-2502.11614" aria-labelledby="html-2502.11614" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11614" title="Other formats" id="oth-2502.11614" aria-labelledby="oth-2502.11614">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Human-Like Text Liked by Humans? 
Multilingual Human Detection and Preference Against AI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuxia Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+R">Rui Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mansurov,+J">Jonibek Mansurov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Puccetti,+G">Giovanni Puccetti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhuohan Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ta,+M+N">Minh Ngoc Ta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Geng,+J">Jiahui Geng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinyan Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abassy,+M">Mervat Abassy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+S+E+D">Saad El Dine Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elozeiri,+K">Kareem Elozeiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laiyk,+N">Nurkhan Laiyk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goloburda,+M">Maiya Goloburda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahmoud,+T">Tarek Mahmoud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tomar,+R+V">Raj Vardhan Tomar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aziz,+A">Alexander Aziz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koike,+R">Ryuto Koike</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shelmanov,+A">Artem Shelmanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Artemova,+E">Ekaterina Artemova</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mikhailov,+V">Vladislav Mikhailov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsvigun,+A">Akim Tsvigun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Habash,+N">Nizar Habash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gurevych,+I">Iryna Gurevych</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakov,+P">Preslav Nakov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Prior studies have shown that distinguishing text generated by large language models (LLMs) from human-written one is highly challenging, and often no better than random guessing. To verify the generalizability of this finding across languages and domains, we perform an extensive case study to identify the upper bound of human detection accuracy. Across 16 datasets covering 9 languages and 9 domains, 19 annotators achieved an average detection accuracy of 87.6%, thus challenging previous conclusions. We find that major gaps between human and machine text lie in concreteness, cultural nuances, and diversity. Prompting by explicitly explaining the distinctions in the prompts can partially bridge the gaps in over 50% of the cases. However, we also find that humans do not always prefer human-written text, particularly when they cannot clearly identify its source. 
</p> </div> </dd> <dt> <a name='item146'>[146]</a> <a href ="/abs/2502.11633" title="Abstract" id="2502.11633"> arXiv:2502.11633 </a> [<a href="/pdf/2502.11633" title="Download PDF" id="pdf-2502.11633" aria-labelledby="pdf-2502.11633">pdf</a>, <a href="https://arxiv.org/html/2502.11633v1" title="View HTML" id="html-2502.11633" aria-labelledby="html-2502.11633" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11633" title="Other formats" id="oth-2502.11633" aria-labelledby="oth-2502.11633">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLASS: Enhancing Cross-Modal Text-Molecule Retrieval Performance and Training Efficiency </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hongyan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+P">Peijian Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+W">Weixiong Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lianxi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+N">Nankai Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shengyi Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+A">Aimin Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Cross-modal text-molecule retrieval task bridges molecule structures and natural language descriptions. Existing methods predominantly focus on aligning text modality and molecule modality, yet they overlook adaptively adjusting the learning states at different training stages and enhancing training efficiency. 
To tackle these challenges, this paper proposes a Curriculum Learning-bAsed croSS-modal text-molecule training framework (CLASS), which can be integrated with any backbone to yield promising performance improvement. Specifically, we quantify the sample difficulty considering both text modality and molecule modality, and design a sample scheduler to introduce training samples via an easy-to-difficult paradigm as the training advances, remarkably reducing the scale of training samples at the early stage of training and improving training efficiency. Moreover, we introduce adaptive intensity learning to increase the training intensity as the training progresses, which adaptively controls the learning intensity across all curriculum stages. Experimental results on the ChEBI-20 dataset demonstrate that our proposed method gains superior performance, simultaneously achieving prominent time savings. </p> </div> </dd> <dt> <a name='item147'>[147]</a> <a href ="/abs/2502.11656" title="Abstract" id="2502.11656"> arXiv:2502.11656 </a> [<a href="/pdf/2502.11656" title="Download PDF" id="pdf-2502.11656" aria-labelledby="pdf-2502.11656">pdf</a>, <a href="https://arxiv.org/html/2502.11656v1" title="View HTML" id="html-2502.11656" aria-labelledby="html-2502.11656" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11656" title="Other formats" id="oth-2502.11656" aria-labelledby="oth-2502.11656">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncovering the Impact of Chain-of-Thought Reasoning for Direct Preference Optimization: Lessons from Text-to-SQL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hanbing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaokang Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruotong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haiyong Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+T">Tian Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Q">Qi Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jing Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Databases (cs.DB) </div> <p class='mathjax'> Direct Preference Optimization (DPO) has proven effective in complex reasoning tasks like math word problems and code generation. However, when applied to Text-to-SQL datasets, it often fails to improve performance and can even degrade it. Our investigation reveals the root cause: unlike math and code tasks, which naturally integrate Chain-of-Thought (CoT) reasoning with DPO, Text-to-SQL datasets typically include only final answers (gold SQL queries) without detailed CoT solutions. By augmenting Text-to-SQL datasets with synthetic CoT solutions, we achieve, for the first time, consistent and significant performance improvements using DPO. Our analysis shows that CoT reasoning is crucial for unlocking DPO's potential, as it mitigates reward hacking, strengthens discriminative capabilities, and improves scalability. These findings offer valuable insights for building more robust Text-to-SQL models. To support further research, we publicly release the code and CoT-enhanced datasets. 
</p> </div> </dd> <dt> <a name='item148'>[148]</a> <a href ="/abs/2502.11671" title="Abstract" id="2502.11671"> arXiv:2502.11671 </a> [<a href="/pdf/2502.11671" title="Download PDF" id="pdf-2502.11671" aria-labelledby="pdf-2502.11671">pdf</a>, <a href="https://arxiv.org/html/2502.11671v1" title="View HTML" id="html-2502.11671" aria-labelledby="html-2502.11671" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11671" title="Other formats" id="oth-2502.11671" aria-labelledby="oth-2502.11671">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Diversity-Oriented Data Augmentation with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zaitian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jinghan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xinhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kunpeng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengfei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuanchun Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Data augmentation is an essential technique in natural language processing (NLP) for enriching training datasets by generating diverse samples. This process is crucial for improving the robustness and generalization capabilities of NLP models. However, a significant challenge remains: \textit{Insufficient Attention to Sample Distribution Diversity}. Most existing methods focus on increasing the sample numbers while neglecting the sample distribution diversity, which can lead to model overfitting. 
In response, we explore data augmentation's impact on dataset diversity and propose a \textbf{\underline{D}}iversity-\textbf{\underline{o}}riented data \textbf{\underline{Aug}}mentation framework (\textbf{DoAug}). Specifically, we utilize a diversity-oriented fine-tuning approach to train an LLM as a diverse paraphraser, which is capable of augmenting textual datasets by generating diversified paraphrases. Then, we apply the LLM paraphraser to a selected coreset of highly informative samples and integrate the paraphrases with the original data to create a more diverse augmented dataset. Finally, we conduct extensive experiments on 12 real-world textual datasets. The results show that our fine-tuned LLM augmenter improves diversity while preserving label consistency, thereby enhancing the robustness and performance of downstream tasks. Specifically, it achieves an average performance gain of \(10.52\%\), surpassing the runner-up baseline by more than three percentage points. 
</p> </div> </dd> <dt> <a name='item149'>[149]</a> <a href ="/abs/2502.11677" title="Abstract" id="2502.11677"> arXiv:2502.11677 </a> [<a href="/pdf/2502.11677" title="Download PDF" id="pdf-2502.11677" aria-labelledby="pdf-2502.11677">pdf</a>, <a href="https://arxiv.org/html/2502.11677v1" title="View HTML" id="html-2502.11677" aria-labelledby="html-2502.11677" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11677" title="Other formats" id="oth-2502.11677" aria-labelledby="oth-2502.11677">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Fully Exploiting LLM Internal States to Enhance Knowledge Boundary Perception </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+S">Shiyu Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+K">Keping Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jiafeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+L">Lulu Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+B">Baolong Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) exhibit impressive performance across diverse tasks but often struggle to accurately gauge their knowledge boundaries, leading to confident yet incorrect responses. This paper explores leveraging LLMs' internal states to enhance their perception of knowledge boundaries from efficiency and risk perspectives. We investigate whether LLMs can estimate their confidence using internal states before response generation, potentially saving computational resources. 
Our experiments on datasets like Natural Questions, HotpotQA, and MMLU reveal that LLMs demonstrate significant pre-generation perception, which is further refined post-generation, with perception gaps remaining stable across varying conditions. To mitigate risks in critical domains, we introduce Consistency-based Confidence Calibration ($C^3$), which assesses confidence consistency through question reformulation. $C^3$ significantly improves LLMs' ability to recognize their knowledge gaps, enhancing the unknown perception rate by 5.6\% on NQ and 4.9\% on HotpotQA. Our findings suggest that pre-generation confidence estimation can optimize efficiency, while $C^3$ effectively controls output risks, advancing the reliability of LLMs in practical applications. </p> </div> </dd> <dt> <a name='item150'>[150]</a> <a href ="/abs/2502.11681" title="Abstract" id="2502.11681"> arXiv:2502.11681 </a> [<a href="/pdf/2502.11681" title="Download PDF" id="pdf-2502.11681" aria-labelledby="pdf-2502.11681">pdf</a>, <a href="https://arxiv.org/html/2502.11681v1" title="View HTML" id="html-2502.11681" aria-labelledby="html-2502.11681" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11681" title="Other formats" id="oth-2502.11681" aria-labelledby="oth-2502.11681">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RIDE: Enhancing Large Language Model Alignment through Restyled In-Context Learning Demonstration Exemplars </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+H">Hao Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Salim,+F+D">Flora D. 
Salim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 37 pages, 1 figure, 20 tables; The paper is under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Alignment tuning is crucial for ensuring large language models (LLMs) behave ethically and helpfully. Current alignment approaches require high-quality annotations and significant training resources. This paper proposes a low-cost, tuning-free method using in-context learning (ICL) to enhance LLM alignment. Through an analysis of high-quality ICL demos, we identified style as a key factor influencing LLM alignment capabilities and explicitly restyled ICL exemplars based on this stylistic framework. Additionally, we combined the restyled demos to achieve a balance between the two conflicting aspects of LLM alignment--factuality and safety. We packaged the restyled examples as prompts to trigger few-shot learning, improving LLM alignment. Compared to the best baseline approach, with an average score of 5.00 as the maximum, our method achieves a maximum 0.10 increase on the Alpaca task (from 4.50 to 4.60), a 0.22 enhancement on the Just-eval benchmark (from 4.34 to 4.56), and a maximum improvement of 0.32 (from 3.53 to 3.85) on the MT-Bench dataset. We release the code and data at <a href="https://github.com/AnonymousCode-ComputerScience/RIDE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item151'>[151]</a> <a href ="/abs/2502.11684" title="Abstract" id="2502.11684"> arXiv:2502.11684 </a> [<a href="/pdf/2502.11684" title="Download PDF" id="pdf-2502.11684" aria-labelledby="pdf-2502.11684">pdf</a>, <a href="https://arxiv.org/html/2502.11684v1" title="View HTML" id="html-2502.11684" aria-labelledby="html-2502.11684" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11684" title="Other formats" id="oth-2502.11684" aria-labelledby="oth-2502.11684">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MathFimer: Enhancing Mathematical Reasoning by Expanding Reasoning Steps through Fill-in-the-Middle Task </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yongliang Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mengdi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+J">Jian Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuang,+Y">Yueting Zhuang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Mathematical reasoning represents a critical frontier in advancing large language models (LLMs). While step-by-step approaches have emerged as the dominant paradigm for mathematical problem-solving in LLMs, the quality of reasoning steps in training data fundamentally constrains the performance of the models. 
Recent studies have demonstrated that more detailed intermediate steps can enhance model performance, yet existing methods for step expansion either require more powerful external models or incur substantial computational costs. In this paper, we introduce MathFimer, a novel framework for mathematical reasoning step expansion inspired by the "Fill-in-the-middle" task from code completion. By decomposing solution chains into prefix-suffix pairs and training models to reconstruct missing intermediate steps, we develop a specialized model, MathFimer-7B, on our carefully curated NuminaMath-FIM dataset. We then apply these models to enhance existing mathematical reasoning datasets by inserting detailed intermediate steps into their solution chains, creating MathFimer-expanded versions. Through comprehensive experiments on multiple mathematical reasoning datasets, including MathInstruct and MetaMathQA, among others, we demonstrate that models trained on MathFimer-expanded data consistently outperform their counterparts trained on original data across various benchmarks such as GSM8K and MATH. Our approach offers a practical, scalable solution for enhancing mathematical reasoning capabilities in LLMs without relying on powerful external models or expensive inference procedures. 
</p> </div> </dd> <dt> <a name='item152'>[152]</a> <a href ="/abs/2502.11688" title="Abstract" id="2502.11688"> arXiv:2502.11688 </a> [<a href="/pdf/2502.11688" title="Download PDF" id="pdf-2502.11688" aria-labelledby="pdf-2502.11688">pdf</a>, <a href="https://arxiv.org/html/2502.11688v1" title="View HTML" id="html-2502.11688" aria-labelledby="html-2502.11688" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11688" title="Other formats" id="oth-2502.11688" aria-labelledby="oth-2502.11688">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Isolates to Families: Using Neural Networks for Automated Language Affiliation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Blum,+F">Frederic Blum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Herbold,+S">Steffen Herbold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=List,+J">Johann-Mattis List</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to the 63rd Annual Meeting of the Association for Computational Linguistics, Vienna, Austria </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In historical linguistics, the affiliation of languages to a common language family is traditionally carried out using a complex workflow that relies on manually comparing individual languages. Large-scale standardized collections of multilingual wordlists and grammatical language structures might help to improve this and open new avenues for developing automated language affiliation workflows. Here, we present neural network models that use lexical and grammatical data from a worldwide sample of more than 1,000 languages with known affiliations to classify individual languages into families. 
In line with the traditional assumption of most linguists, our results show that models trained on lexical data alone outperform models solely based on grammatical data, whereas combining both types of data yields even better performance. In additional experiments, we show how our models can identify long-ranging relations between entire subgroups, how they can be employed to investigate potential relatives of linguistic isolates, and how they can help us to obtain first hints on the affiliation of so far unaffiliated languages. We conclude that models for automated language affiliation trained on lexical and grammatical data provide comparative linguists with a valuable tool for evaluating hypotheses about deep and unknown language relations. </p> </div> </dd> <dt> <a name='item153'>[153]</a> <a href ="/abs/2502.11689" title="Abstract" id="2502.11689"> arXiv:2502.11689 </a> [<a href="/pdf/2502.11689" title="Download PDF" id="pdf-2502.11689" aria-labelledby="pdf-2502.11689">pdf</a>, <a href="https://arxiv.org/html/2502.11689v1" title="View HTML" id="html-2502.11689" aria-labelledby="html-2502.11689" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11689" title="Other formats" id="oth-2502.11689" aria-labelledby="oth-2502.11689">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improve LLM-as-a-Judge Ability as a General Ability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jiachen Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Shaoning Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xiaohui Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+J">Jiaxu Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+K">Kaidong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xuelong Li</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> LLM-as-a-Judge leverages the generative and reasoning capabilities of large language models (LLMs) to evaluate LLM responses across diverse scenarios, providing accurate preference signals. This approach plays a vital role in aligning LLMs with human values, ensuring ethical and reliable AI outputs that align with societal norms. Recent studies have proposed many methods to train LLMs as generative judges, but most of them are data consuming or lack accuracy, and only focus on LLM's judge ability. In this work, we regard judge ability as a general ability of LLM and implement a two-stage training approach, comprising supervised fine-tuning (SFT) warm-up and direct preference optimization (DPO) enhancement, to achieve judge style adaptation and improve judgment accuracy. Additionally, we introduce an efficient data synthesis method to generate judgmental content. Experimental results demonstrate that our approach, utilizing only about 2% to 40% of the data required by other methods, achieves SOTA performance on RewardBench. Furthermore, our training method enhances the general capabilities of the model by constructing complicated judge tasks, and the judge signals provided by our model have significantly enhanced the downstream DPO training performance of our internal models in our test to optimize policy model with Judge Model. We also open-source our model weights and training data to facilitate further research. 
</p> </div> </dd> <dt> <a name='item154'>[154]</a> <a href ="/abs/2502.11703" title="Abstract" id="2502.11703"> arXiv:2502.11703 </a> [<a href="/pdf/2502.11703" title="Download PDF" id="pdf-2502.11703" aria-labelledby="pdf-2502.11703">pdf</a>, <a href="https://arxiv.org/html/2502.11703v1" title="View HTML" id="html-2502.11703" aria-labelledby="html-2502.11703" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11703" title="Other formats" id="oth-2502.11703" aria-labelledby="oth-2502.11703">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CMQCIC-Bench: A Chinese Benchmark for Evaluating Large Language Models in Medical Quality Control Indicator Calculation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Guangya Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanhao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zongying Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Y">Yuxiong Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+L">Li Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yupian Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+R">Ruihui Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weiyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Y">Yongqi Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Q">Qi Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jingping Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+T">Tong Ruan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Medical quality control indicators are essential to assess the qualifications of healthcare institutions for medical services. With the impressive performance of large language models (LLMs) like GPT-4 in the medical field, leveraging these technologies for the Medical Quality Control Indicator Calculation (MQCIC) presents a promising approach. In this work, (1) we introduce a real-world task MQCIC and propose an open-source Chinese electronic medical records (EMRs)-based dataset (CMQCIC-Bench) comprising 785 instances and 76 indicators. (2) We propose a semi-automatic method to enhance the rule representation. Then we propose the Clinical Facts-based Inferential Rule (CF-IR) method that disentangles the clinical fact verification and inferential rule reasoning actions. (3) We conduct comprehensive experiments on 20 representative LLMs, covering general and medical models. Our findings reveal that CF-IR outperforms Chain-of-Thought methods in MQCIC tasks. (4) We conduct an error analysis and investigate the capabilities of clinical fact verification and inferential rule reasoning, providing insights to improve performance in the MQCIC further. The dataset and code are available in this repo <a href="https://anonymous.4open.science/r/C-MQCIC-1151" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item155'>[155]</a> <a href ="/abs/2502.11705" title="Abstract" id="2502.11705"> arXiv:2502.11705 </a> [<a href="/pdf/2502.11705" title="Download PDF" id="pdf-2502.11705" aria-labelledby="pdf-2502.11705">pdf</a>, <a href="/format/2502.11705" title="Other formats" id="oth-2502.11705" aria-labelledby="oth-2502.11705">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLM Agents Making Agent Tools </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=W%C3%B6lflein,+G">Georg Wölflein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferber,+D">Dyke Ferber</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Truhn,+D">Daniel Truhn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arandjelovi%C4%87,+O">Ognjen Arandjelović</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kather,+J+N">Jakob Nikolas Kather</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Tool use has turned large language models (LLMs) into powerful agents that can perform complex multi-step tasks by dynamically utilising external software components. However, these tools must be implemented in advance by human developers, hindering the applicability of LLM agents in domains which demand large numbers of highly specialised tools, like in life sciences and medicine. Motivated by the growing trend of scientific studies accompanied by public code repositories, we propose ToolMaker, a novel agentic framework that autonomously transforms papers with code into LLM-compatible tools. 
Given a short task description and a repository URL, ToolMaker autonomously installs required dependencies and generates code to perform the task, using a closed-loop self-correction mechanism to iteratively diagnose and rectify errors. To evaluate our approach, we introduce a benchmark comprising 15 diverse and complex computational tasks spanning both medical and non-medical domains with over 100 unit tests to objectively assess tool correctness and robustness. ToolMaker correctly implements 80% of the tasks, substantially outperforming current state-of-the-art software engineering agents. ToolMaker therefore is a step towards fully autonomous agent-based scientific workflows. </p> </div> </dd> <dt> <a name='item156'>[156]</a> <a href ="/abs/2502.11707" title="Abstract" id="2502.11707"> arXiv:2502.11707 </a> [<a href="/pdf/2502.11707" title="Download PDF" id="pdf-2502.11707" aria-labelledby="pdf-2502.11707">pdf</a>, <a href="https://arxiv.org/html/2502.11707v1" title="View HTML" id="html-2502.11707" aria-labelledby="html-2502.11707" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11707" title="Other formats" id="oth-2502.11707" aria-labelledby="oth-2502.11707">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hakimov,+S">Sherzod Hakimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pfennigschmidt,+L">Lara Pfennigschmidt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schlangen,+D">David Schlangen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study utilizes the game Codenames as a benchmarking tool to evaluate large language 
models (LLMs) with respect to specific linguistic and cognitive skills. LLMs play each side of the game, where one side generates a clue word covering several target words and the other guesses those target words. We designed various experiments by controlling the choice of words (abstract vs. concrete words, ambiguous vs. monosemic) or the opponent (programmed to be faster or slower in revealing words). Recent commercial and open-weight models were compared side-by-side to find out factors affecting their performance. The evaluation reveals details about their strategies, challenging cases, and limitations of LLMs. </p> </div> </dd> <dt> <a name='item157'>[157]</a> <a href ="/abs/2502.11718" title="Abstract" id="2502.11718"> arXiv:2502.11718 </a> [<a href="/pdf/2502.11718" title="Download PDF" id="pdf-2502.11718" aria-labelledby="pdf-2502.11718">pdf</a>, <a href="https://arxiv.org/html/2502.11718v1" title="View HTML" id="html-2502.11718" aria-labelledby="html-2502.11718" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11718" title="Other formats" id="oth-2502.11718" aria-labelledby="oth-2502.11718">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> "See the World, Discover Knowledge": A Chinese Factuality Evaluation for Large Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jihao Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingyao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bu,+P">Pi Bu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+T">Tengtao Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+D">Donglai Wei</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiale Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yingxiu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yancheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shilong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+M">Meng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jun Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Y">Yingshui Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+W">Wenbo Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhicheng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xiaoyong Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+B">Bo Zheng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages, 21 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The evaluation of factual accuracy in large vision language models (LVLMs) has lagged behind their rapid development, making it challenging to fully reflect these models' knowledge capacity and reliability. In this paper, we introduce the first factuality-based visual question-answering benchmark in Chinese, named ChineseSimpleVQA, aimed at assessing the visual factuality of LVLMs across 8 major topics and 56 subtopics. 
The key features of this benchmark include a focus on the Chinese language, diverse knowledge types, a multi-hop question construction, high-quality data, static consistency, and easy-to-evaluate through short answers. Moreover, we contribute a rigorous data construction pipeline and decouple the visual factuality into two parts: seeing the world (i.e., object recognition) and discovering knowledge. This decoupling allows us to analyze the capability boundaries and execution mechanisms of LVLMs. Subsequently, we evaluate 34 advanced open-source and closed-source models, revealing critical performance gaps within this field. </p> </div> </dd> <dt> <a name='item158'>[158]</a> <a href ="/abs/2502.11733" title="Abstract" id="2502.11733"> arXiv:2502.11733 </a> [<a href="/pdf/2502.11733" title="Download PDF" id="pdf-2502.11733" aria-labelledby="pdf-2502.11733">pdf</a>, <a href="https://arxiv.org/html/2502.11733v1" title="View HTML" id="html-2502.11733" aria-labelledby="html-2502.11733" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11733" title="Other formats" id="oth-2502.11733" aria-labelledby="oth-2502.11733">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Plant in Cupboard, Orange on Table, Book on Shelf. 
Benchmarking Practical Reasoning and Situation Modelling in a Text-Simulated Situated Environment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jordan,+J">Jonathan Jordan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakimov,+S">Sherzod Hakimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schlangen,+D">David Schlangen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have risen to prominence as 'chatbots' for users to interact via natural language. However, their abilities to capture common-sense knowledge make them seem promising as language-based planners of situated or embodied action as well. We have implemented a simple text-based environment -- similar to others that have before been used for reinforcement-learning of agents -- that simulates, very abstractly, a household setting. We use this environment and the detailed error-tracking capabilities we implemented for targeted benchmarking of LLMs on the problem of practical reasoning: Going from goals and observations to actions. Our findings show that environmental complexity and game restrictions hamper performance, and concise action planning is demanding for current LLMs. 
</p> </div> </dd> <dt> <a name='item159'>[159]</a> <a href ="/abs/2502.11735" title="Abstract" id="2502.11735"> arXiv:2502.11735 </a> [<a href="/pdf/2502.11735" title="Download PDF" id="pdf-2502.11735" aria-labelledby="pdf-2502.11735">pdf</a>, <a href="https://arxiv.org/html/2502.11735v1" title="View HTML" id="html-2502.11735" aria-labelledby="html-2502.11735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11735" title="Other formats" id="oth-2502.11735" aria-labelledby="oth-2502.11735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MT-RAIG: Novel Benchmark and Evaluation Framework for Retrieval-Augmented Insight Generation over Multiple Tables </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+K">Kwangwook Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kwon,+D">Donguk Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongha Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in table-based reasoning have expanded beyond factoid-level QA to address insight-level tasks, where systems should synthesize implicit knowledge in the table to provide explainable analyses. Although effective, existing studies remain confined to scenarios where a single gold table is given alongside the user query, failing to address cases where users seek comprehensive insights from multiple unknown tables. To bridge these gaps, we propose MT-RAIG Bench, designed to evaluate systems on Retrieval-Augmented Insight Generation over Multiple Tables. 
Additionally, to tackle the suboptimality of existing automatic evaluation methods in the table domain, we further introduce a fine-grained evaluation framework MT-RAIG Eval, which achieves better alignment with human quality judgments on the generated insights. We conduct extensive experiments and reveal that even frontier LLMs still struggle with complex multi-table reasoning, establishing our MT-RAIG Bench as a challenging testbed for future research. </p> </div> </dd> <dt> <a name='item160'>[160]</a> <a href ="/abs/2502.11736" title="Abstract" id="2502.11736"> arXiv:2502.11736 </a> [<a href="/pdf/2502.11736" title="Download PDF" id="pdf-2502.11736" aria-labelledby="pdf-2502.11736">pdf</a>, <a href="https://arxiv.org/html/2502.11736v1" title="View HTML" id="html-2502.11736" aria-labelledby="html-2502.11736" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11736" title="Other formats" id="oth-2502.11736" aria-labelledby="oth-2502.11736">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ReviewEval: An Evaluation Framework for AI-Generated Reviews </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kirtani,+C">Chavvi Kirtani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garg,+M+K">Madhav Krishan Garg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prasad,+T">Tejash Prasad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singhal,+T">Tanmay Singhal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+M">Murari Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+D">Dhruv Kumar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review: 8 pages, 2 figures, 2 tables, 3 pages for appendix </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The escalating volume of academic research, coupled with a shortage of qualified reviewers, necessitates innovative approaches to peer review. While large language models (LLMs) offer potential for automating this process, their current limitations include superficial critiques, hallucinations, and a lack of actionable insights. This research addresses these challenges by introducing a comprehensive evaluation framework for AI-generated reviews that measures alignment with human evaluations, verifies factual accuracy, assesses analytical depth, and identifies actionable insights. We also propose a novel alignment mechanism that tailors LLM-generated reviews to the unique evaluation priorities of individual conferences and journals. To enhance the quality of these reviews, we introduce a self-refinement loop that iteratively optimizes the LLM's review prompts. Our framework establishes standardized metrics for evaluating AI-based review systems, thereby bolstering the reliability of AI-generated reviews in academic research. 
</p> </div> </dd> <dt> <a name='item161'>[161]</a> <a href ="/abs/2502.11766" title="Abstract" id="2502.11766"> arXiv:2502.11766 </a> [<a href="/pdf/2502.11766" title="Download PDF" id="pdf-2502.11766" aria-labelledby="pdf-2502.11766">pdf</a>, <a href="https://arxiv.org/html/2502.11766v1" title="View HTML" id="html-2502.11766" aria-labelledby="html-2502.11766" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11766" title="Other formats" id="oth-2502.11766" aria-labelledby="oth-2502.11766">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Warmup-Distill: Bridge the Distribution Mismatch between Teacher and Student before Knowledge Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zengkui Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yijin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fandong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yufeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 Pages, 4 figures, Code at <a href="https://github.com/Acerkoo/WarmupDistill" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The widespread deployment of Large Language Models (LLMs) is hindered by the high computational demands, making knowledge distillation (KD) crucial for developing compact smaller ones. 
However, the conventional KD methods endure the distribution mismatch issue between the teacher and student models, leading to the poor performance of distillation. For instance, the widely-used KL-based methods suffer the mode-averaging and mode-collapsing problems, due to the mismatched probability distribution between the two models. Previous studies mainly optimize this issue via different distance calculations towards the distribution of both models. Unfortunately, the distribution mismatch issue still exists in the early stage of the distillation. Hence, to reduce the impact of distribution mismatch, we propose a simple yet efficient method, named Warmup-Distill, which aligns the distillation of the student to that of the teacher in advance of distillation. Specifically, we first detect the distribution of the student model in practical scenarios with its internal knowledge, and then modify the knowledge with low probability via the teacher as the checker. Consequently, Warmup-Distill aligns the internal student's knowledge to that of the teacher, which expands the distribution of the student with the teacher's, and assists the student model to learn better in the subsequent distillation. Experiments on the seven benchmarks demonstrate that Warmup-Distill could provide a warmup student more suitable for distillation, which outperforms the vanilla student by at least +0.4 averaged score among all benchmarks. Notably, with the assistance of Warmup-Distill, the distillation on the math task could yield a further improvement, at most +1.9% accuracy. 
</p> </div> </dd> <dt> <a name='item162'>[162]</a> <a href ="/abs/2502.11771" title="Abstract" id="2502.11771"> arXiv:2502.11771 </a> [<a href="/pdf/2502.11771" title="Download PDF" id="pdf-2502.11771" aria-labelledby="pdf-2502.11771">pdf</a>, <a href="https://arxiv.org/html/2502.11771v1" title="View HTML" id="html-2502.11771" aria-labelledby="html-2502.11771" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11771" title="Other formats" id="oth-2502.11771" aria-labelledby="oth-2502.11771">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Validation Gap: A Mechanistic Analysis of How Language Models Compute Arithmetic but Fail to Validate It </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bertolazzi,+L">Leonardo Bertolazzi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mondorf,+P">Philipp Mondorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bernardi,+R">Raffaella Bernardi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 34 pages, 31 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The ability of large language models (LLMs) to validate their output and identify potential errors is crucial for ensuring robustness and reliability. However, current research indicates that LLMs struggle with self-correction, encountering significant challenges in detecting errors. While studies have explored methods to enhance self-correction in LLMs, relatively little attention has been given to understanding the models' internal mechanisms underlying error detection. 
In this paper, we present a mechanistic analysis of error detection in LLMs, focusing on simple arithmetic problems. Through circuit analysis, we identify the computational subgraphs responsible for detecting arithmetic errors across four smaller-sized LLMs. Our findings reveal that all models heavily rely on $\textit{consistency heads}$--attention heads that assess surface-level alignment of numerical values in arithmetic solutions. Moreover, we observe that the models' internal arithmetic computation primarily occurs in higher layers, whereas validation takes place in middle layers, before the final arithmetic results are fully encoded. This structural dissociation between arithmetic computation and validation seems to explain why current LLMs struggle to detect even simple arithmetic errors. </p> </div> </dd> <dt> <a name='item163'>[163]</a> <a href ="/abs/2502.11779" title="Abstract" id="2502.11779"> arXiv:2502.11779 </a> [<a href="/pdf/2502.11779" title="Download PDF" id="pdf-2502.11779" aria-labelledby="pdf-2502.11779">pdf</a>, <a href="https://arxiv.org/html/2502.11779v1" title="View HTML" id="html-2502.11779" aria-labelledby="html-2502.11779" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11779" title="Other formats" id="oth-2502.11779" aria-labelledby="oth-2502.11779">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient Response Generation Method Selection for Fine-Tuning Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+X">Xuan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lingqiao Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The training data 
for fine-tuning large language models (LLMs) is typically structured as input-output pairs. However, for many tasks, there can be multiple equally valid output variations for the same input. Recent studies have observed that the choice of output variation used in training can affect the model's performance. This raises an important question: how can we generate the most effective output from the many possible response generation strategy options? Rather than relying on the traditional but resource-intensive train-and-evaluate approach, this paper proposes a scalable, approximate method for estimating the quality of a small subset of generated training data derived from the same input. We then evaluate how well this small subset of generated output fits the target model we are trying to train. We present a large-scale benchmark covering diverse reasoning-based datasets to support our study. <br>The central idea is that a good output should closely resemble the output generated by the target LLM. We formalize this 'closeness' as the expected alignment score between a candidate output and the output sampled from the target LLM. We connect this measurement to the perplexity metric used in previous literature and demonstrate that leveraging an alignment-based metric can provide better predictions of model performance. Using this strategy, we can evaluate a small subset of the generated output from each response generation strategy option, then select the most effective strategy. We show that an LLM trained on data generated by the selected strategy could lead to a significant performance gain in many cases. 
</p> </div> </dd> <dt> <a name='item164'>[164]</a> <a href ="/abs/2502.11789" title="Abstract" id="2502.11789"> arXiv:2502.11789 </a> [<a href="/pdf/2502.11789" title="Download PDF" id="pdf-2502.11789" aria-labelledby="pdf-2502.11789">pdf</a>, <a href="https://arxiv.org/html/2502.11789v1" title="View HTML" id="html-2502.11789" aria-labelledby="html-2502.11789" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11789" title="Other formats" id="oth-2502.11789" aria-labelledby="oth-2502.11789">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Personality Editing for Language Models through Relevant Knowledge Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S">Seojin Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Yumin Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Byeongjeong Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 3 figures, 16 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) play a vital role in applications like conversational agents and content creation, where controlling a model's personality is crucial for maintaining tone, consistency, and engagement. However, traditional prompt-based techniques for controlling personality often fall short, as they do not effectively mitigate the model's inherent biases. In this paper, we introduce a novel method PALETTE that enhances personality control through knowledge editing. 
By generating adjustment queries inspired by psychological assessments, our approach systematically adjusts responses to personality-related queries similar to modifying factual knowledge, thereby achieving controlled shifts in personality traits. Experimental results from both automatic and human evaluations demonstrate that our method enables more stable and well-balanced personality control in LLMs. </p> </div> </dd> <dt> <a name='item165'>[165]</a> <a href ="/abs/2502.11806" title="Abstract" id="2502.11806"> arXiv:2502.11806 </a> [<a href="/pdf/2502.11806" title="Download PDF" id="pdf-2502.11806" aria-labelledby="pdf-2502.11806">pdf</a>, <a href="https://arxiv.org/html/2502.11806v1" title="View HTML" id="html-2502.11806" aria-labelledby="html-2502.11806" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11806" title="Other formats" id="oth-2502.11806" aria-labelledby="oth-2502.11806">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Translation Mechanism of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongbin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xuefeng Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiucheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have succeeded remarkably in multilingual translation tasks. However, the inherent translation mechanisms of LLMs remain poorly understood, largely due to sophisticated architectures and vast parameter scales. 
In response to this issue, this study explores the translation mechanism of LLM from the perspective of computational components (e.g., attention heads and MLPs). Path patching is utilized to explore causal relationships between components, detecting those crucial for translation tasks and subsequently analyzing their behavioral patterns in human-interpretable terms. Comprehensive analysis reveals that translation is predominantly facilitated by a sparse subset of specialized attention heads (less than 5\%), which extract source language, indicator, and positional features. MLPs subsequently integrate and process these features by transiting towards English-centric latent representations. Notably, building on the above findings, targeted fine-tuning of only 64 heads achieves translation improvement comparable to full-parameter tuning while preserving general capabilities. </p> </div> </dd> <dt> <a name='item166'>[166]</a> <a href ="/abs/2502.11811" title="Abstract" id="2502.11811"> arXiv:2502.11811 </a> [<a href="/pdf/2502.11811" title="Download PDF" id="pdf-2502.11811" aria-labelledby="pdf-2502.11811">pdf</a>, <a href="https://arxiv.org/html/2502.11811v1" title="View HTML" id="html-2502.11811" aria-labelledby="html-2502.11811" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11811" title="Other formats" id="oth-2502.11811" aria-labelledby="oth-2502.11811">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FineFilter: A Fine-grained Noise Filtering Mechanism for Retrieval-Augmented Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qianchi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hainan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hongwei 
Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+Y">Yongxin Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhiming Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieved documents containing noise will hinder Retrieval-Augmented Generation (RAG) from detecting answer clues, necessitating noise filtering mechanisms to enhance accuracy. Existing methods use re-ranking or summarization to identify the most relevant sentences, but directly and accurately locating answer clues from these large-scale and complex documents remains challenging. Unlike these document-level operations, we treat noise filtering as a sentence-level MinMax optimization problem: first identifying the potential clues from multiple documents using contextual information, then ranking them by relevance, and finally retaining the least clues through truncation. In this paper, we propose FineFilter, a novel fine-grained noise filtering mechanism for RAG consisting of a clue extractor, a re-ranker, and a truncator. We optimize each module to tackle complex reasoning challenges: (1) Clue extractor firstly uses sentences containing the answer and similar ones as fine-tuned targets, aiming at extracting sufficient potential clues; (2) Re-ranker is trained to prioritize effective clues based on the real feedback from generation module, with clues capable of generating correct answer as positive samples and others as negative; (3) Truncator takes the minimum clues needed to answer the question (truncation point) as fine-tuned targets, and performs truncation on the re-ranked clues to achieve fine-grained noise filtering. 
Experiments on three QA datasets demonstrate that FineFilter significantly outperforms baselines in terms of performance and inference cost. Further analysis on each module shows the effectiveness of our optimizations for complex reasoning. </p> </div> </dd> <dt> <a name='item167'>[167]</a> <a href ="/abs/2502.11812" title="Abstract" id="2502.11812"> arXiv:2502.11812 </a> [<a href="/pdf/2502.11812" title="Download PDF" id="pdf-2502.11812" aria-labelledby="pdf-2502.11812">pdf</a>, <a href="https://arxiv.org/html/2502.11812v1" title="View HTML" id="html-2502.11812" aria-labelledby="html-2502.11812" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11812" title="Other formats" id="oth-2502.11812" aria-labelledby="oth-2502.11812">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Understanding Fine-Tuning Mechanisms of LLMs via Circuit Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+W">Wenyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+R">Reynold Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+D">Difan Zou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 25 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Fine-tuning significantly improves the performance of Large Language Models (LLMs), yet its underlying mechanisms remain poorly understood. 
This paper aims to provide an in-depth interpretation of the fine-tuning process through circuit analysis, a popular tool in Mechanistic Interpretability (MI). Unlike previous studies \cite{prakash2024finetuningenhancesexistingmechanisms,chhabra2024neuroplasticity} that focus on tasks where pre-trained models already perform well, we develop a set of mathematical tasks where fine-tuning yields substantial performance gains, which are closer to the practical setting. In our experiments, we identify circuits at various checkpoints during fine-tuning and examine the interplay between circuit analysis, fine-tuning methods, and task complexities. First, we find that while circuits maintain high node similarity before and after fine-tuning, their edges undergo significant changes, which is in contrast to the previous work \cite{prakash2024finetuningenhancesexistingmechanisms,chhabra2024neuroplasticity} that show circuits only add some additional components after fine-tuning. Based on these observations, we develop a circuit-aware Low-Rank Adaptation (LoRA) method, which assigns ranks to layers based on edge changes in the circuits. Experimental results demonstrate that our circuit-based LoRA algorithm achieves an average performance improvement of 2.46\% over standard LoRA with similar parameter sizes. Furthermore, we explore how combining circuits from subtasks can enhance fine-tuning in compositional tasks, providing new insights into the design of such tasks and deepening the understanding of circuit dynamics and fine-tuning mechanisms. 
</p> </div> </dd> <dt> <a name='item168'>[168]</a> <a href ="/abs/2502.11824" title="Abstract" id="2502.11824"> arXiv:2502.11824 </a> [<a href="/pdf/2502.11824" title="Download PDF" id="pdf-2502.11824" aria-labelledby="pdf-2502.11824">pdf</a>, <a href="/format/2502.11824" title="Other formats" id="oth-2502.11824" aria-labelledby="oth-2502.11824">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> M-ABSA: A Multilingual Dataset for Aspect-Based Sentiment Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chengyan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Bolei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yihong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zheyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+N">Ningyuan Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanshu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+B">Baolan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+Y">Yun Xue</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Aspect-based sentiment analysis (ABSA) is a crucial task in information extraction and sentiment analysis, aiming to identify aspects with associated sentiment elements in text. However, existing ABSA datasets are predominantly English-centric, limiting the scope for multilingual evaluation and research. 
To bridge this gap, we present M-ABSA, a comprehensive dataset spanning 7 domains and 21 languages, making it the most extensive multilingual parallel dataset for ABSA to date. Our primary focus is on triplet extraction, which involves identifying aspect terms, aspect categories, and sentiment polarities. The dataset is constructed through an automatic translation process with human review to ensure quality. We perform extensive experiments using various baselines to assess performance and compatibility on M-ABSA. Our empirical findings highlight that the dataset enables diverse evaluation tasks, such as multilingual and multi-domain transfer learning, and large language model evaluation, underscoring its inclusivity and its potential to drive advancements in multilingual ABSA research. </p> </div> </dd> <dt> <a name='item169'>[169]</a> <a href ="/abs/2502.11829" title="Abstract" id="2502.11829"> arXiv:2502.11829 </a> [<a href="/pdf/2502.11829" title="Download PDF" id="pdf-2502.11829" aria-labelledby="pdf-2502.11829">pdf</a>, <a href="https://arxiv.org/html/2502.11829v1" title="View HTML" id="html-2502.11829" aria-labelledby="html-2502.11829" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11829" title="Other formats" id="oth-2502.11829" aria-labelledby="oth-2502.11829">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Code-Vision: Evaluating Multimodal LLMs Logic Understanding and Code Generation Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hanbin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaoxuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhipeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+K">Keyuan Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zuo,+Y">Yuxin Zuo</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+K">Kai Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jingwei Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junting Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Wenhui Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xueyang Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Software Engineering (cs.SE) </div> <p class='mathjax'> This paper introduces Code-Vision, a benchmark designed to evaluate the logical understanding and code generation capabilities of Multimodal Large Language Models (MLLMs). It challenges MLLMs to generate a correct program that fulfills specific functionality requirements based on a given flowchart, which visually represents the desired algorithm or process. Code-Vision comprises three subsets: HumanEval-V, Algorithm, and MATH, which evaluate MLLMs' coding abilities across basic programming, algorithmic, and mathematical problem-solving domains. Our experiments evaluate 12 MLLMs on Code-Vision. Experimental results demonstrate that there is a large performance difference between proprietary and open-source models. On Hard problems, GPT-4o can achieve 79.3% pass@1, but the best open-source model only achieves 15%. Further experiments reveal that Code-Vision can pose unique challenges compared to other multimodal reasoning benchmarks MMCode and MathVista. We also explore the reason for the poor performance of the open-source models. All data and codes are available at <a href="https://github.com/wanghanbinpanda/CodeVision" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item170'>[170]</a> <a href ="/abs/2502.11830" title="Abstract" id="2502.11830"> arXiv:2502.11830 </a> [<a href="/pdf/2502.11830" title="Download PDF" id="pdf-2502.11830" aria-labelledby="pdf-2502.11830">pdf</a>, <a href="https://arxiv.org/html/2502.11830v1" title="View HTML" id="html-2502.11830" aria-labelledby="html-2502.11830" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11830" title="Other formats" id="oth-2502.11830" aria-labelledby="oth-2502.11830">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Text Classification in the LLM Era - Where do we stand? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Vajjala,+S">Sowmya Vajjala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shimangaud,+S">Shwetali Shimangaud</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Pre-print </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models revolutionized NLP and showed dramatic performance improvements across several tasks. In this paper, we investigated the role of such language models in text classification and how they compare with other approaches relying on smaller pre-trained language models. Considering 32 datasets spanning 8 languages, we compared zero-shot classification, few-shot fine-tuning and synthetic data based classifiers with classifiers built using the complete human labeled dataset. Our results show that zero-shot approaches do well for sentiment classification, but are outperformed by other approaches for the rest of the tasks, and synthetic data sourced from multiple LLMs can build better classifiers than zero-shot open LLMs. 
We also see wide performance disparities across languages in all the classification scenarios. We expect that these findings would guide practitioners working on developing text classification systems across languages. </p> </div> </dd> <dt> <a name='item171'>[171]</a> <a href ="/abs/2502.11843" title="Abstract" id="2502.11843"> arXiv:2502.11843 </a> [<a href="/pdf/2502.11843" title="Download PDF" id="pdf-2502.11843" aria-labelledby="pdf-2502.11843">pdf</a>, <a href="https://arxiv.org/html/2502.11843v1" title="View HTML" id="html-2502.11843" aria-labelledby="html-2502.11843" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11843" title="Other formats" id="oth-2502.11843" aria-labelledby="oth-2502.11843">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLM Agents Maintain a Persona in Discourse? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhandari,+P">Pranav Bhandari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fay,+N">Nicolas Fay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wise,+M">Michael Wise</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Datta,+A">Amitava Datta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meek,+S">Stephanie Meek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nasim,+M">Mehwish Nasim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Social and Information Networks (cs.SI) </div> <p class='mathjax'> Large Language Models (LLMs) are widely used as conversational agents, exploiting their capabilities in various sectors such as education, law, medicine, and more. 
However, LLMs are often subjected to context-shifting behaviour, resulting in a lack of consistent and interpretable personality-aligned interactions. Adherence to psychological traits lacks comprehensive analysis, especially in the case of dyadic (pairwise) conversations. We examine this challenge from two viewpoints, initially using two conversation agents to generate a discourse on a certain topic with an assigned personality from the OCEAN framework (Openness, Conscientiousness, Extraversion, Agreeableness, and Neuroticism) as High/Low for each trait. This is followed by using multiple judge agents to infer the original traits assigned to explore prediction consistency, inter-model agreement, and alignment with the assigned personality. Our findings indicate that while LLMs can be guided toward personality-driven dialogue, their ability to maintain personality traits varies significantly depending on the combination of models and discourse settings. These inconsistencies emphasise the challenges in achieving stable and interpretable personality-aligned interactions in LLMs. 
</p> </div> </dd> <dt> <a name='item172'>[172]</a> <a href ="/abs/2502.11856" title="Abstract" id="2502.11856"> arXiv:2502.11856 </a> [<a href="/pdf/2502.11856" title="Download PDF" id="pdf-2502.11856" aria-labelledby="pdf-2502.11856">pdf</a>, <a href="https://arxiv.org/html/2502.11856v1" title="View HTML" id="html-2502.11856" aria-labelledby="html-2502.11856" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11856" title="Other formats" id="oth-2502.11856" aria-labelledby="oth-2502.11856">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs as a synthesis between symbolic and continuous approaches to language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Boleda,+G">Gemma Boleda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Since the middle of the 20th century, a fierce battle has been fought between symbolic and continuous approaches to language and cognition. The success of deep learning models, and LLMs in particular, has been alternatively taken as showing that the continuous camp has won, or dismissed as an irrelevant engineering development. However, in this position paper I argue that deep learning models for language actually represent a synthesis between the two traditions. This is because 1) deep learning architectures allow for both continuous/distributed and symbolic/discrete-like representations and computations; 2) models trained on language make use of this flexibility. In particular, I review recent research in mechanistic interpretability that showcases how a substantial part of morphosyntactic knowledge is encoded in a near-discrete fashion in LLMs. 
This line of research suggests that different behaviors arise in an emergent fashion, and models flexibly alternate between the two modes (and everything in between) as needed. This is possibly one of the main reasons for their wild success; and it is also what makes them particularly interesting for the study of language and cognition. Is it time for peace? </p> </div> </dd> <dt> <a name='item173'>[173]</a> <a href ="/abs/2502.11861" title="Abstract" id="2502.11861"> arXiv:2502.11861 </a> [<a href="/pdf/2502.11861" title="Download PDF" id="pdf-2502.11861" aria-labelledby="pdf-2502.11861">pdf</a>, <a href="/format/2502.11861" title="Other formats" id="oth-2502.11861" aria-labelledby="oth-2502.11861">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Large Language Models in Healthcare: Insights into Corpora Sources, Customization Strategies, and Evaluation Metrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shuqi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+M">Mingrui Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kou,+J">Jiaxin Kou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+M">Manfei Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+W">Weijie Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zheng Zhu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 45 pages, 1 figure, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study reviewed the use of Large Language Models 
(LLMs) in healthcare, focusing on their training corpora, customization techniques, and evaluation metrics. A systematic search of studies from 2021 to 2024 identified 61 articles. Four types of corpora were used: clinical resources, literature, open-source datasets, and web-crawled data. Common construction techniques included pre-training, prompt engineering, and retrieval-augmented generation, with 44 studies combining multiple methods. Evaluation metrics were categorized into process, usability, and outcome metrics, with outcome metrics divided into model-based and expert-assessed outcomes. The study identified critical gaps in corpus fairness, which contributed to biases from geographic, cultural, and socio-economic factors. The reliance on unverified or unstructured data highlighted the need for better integration of evidence-based clinical guidelines. Future research should focus on developing a tiered corpus architecture with vetted sources and dynamic weighting, while ensuring model transparency. Additionally, the lack of standardized evaluation frameworks for domain-specific models called for comprehensive validation of LLMs in real-world healthcare settings. 
</p> </div> </dd> <dt> <a name='item174'>[174]</a> <a href ="/abs/2502.11862" title="Abstract" id="2502.11862"> arXiv:2502.11862 </a> [<a href="/pdf/2502.11862" title="Download PDF" id="pdf-2502.11862" aria-labelledby="pdf-2502.11862">pdf</a>, <a href="https://arxiv.org/html/2502.11862v1" title="View HTML" id="html-2502.11862" aria-labelledby="html-2502.11862" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11862" title="Other formats" id="oth-2502.11862" aria-labelledby="oth-2502.11862">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding In-Context Machine Translation for Low-Resource Languages: A Case Study on Manchu </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pei,+R">Renhao Pei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yihong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+P">Peiqin Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yvon,+F">François Yvon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sch%C3%BCtze,+H">Hinrich Schütze</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In-context machine translation (MT) with large language models (LLMs) is a promising approach for low-resource MT, as it can readily take advantage of linguistic resources such as grammar books and dictionaries. Such resources are usually selectively integrated into the prompt so that LLMs can directly perform translation without any specific training, via their in-context learning capability (ICL). 
However, the relative importance of each type of resource e.g., dictionary, grammar book, and retrieved parallel examples, is not entirely clear. To address this gap, this study systematically investigates how each resource and its quality affects the translation performance, with the Manchu language as our case study. To remove any prior knowledge of Manchu encoded in the LLM parameters and single out the effect of ICL, we also experiment with an encrypted version of Manchu texts. Our results indicate that high-quality dictionaries and good parallel examples are very helpful, while grammars hardly help. In a follow-up study, we showcase a promising application of in-context MT: parallel data augmentation as a way to bootstrap the conventional MT model. When monolingual data abound, generating synthetic parallel data through in-context MT offers a pathway to mitigate data scarcity and build effective and efficient low-resource neural MT systems. </p> </div> </dd> <dt> <a name='item175'>[175]</a> <a href ="/abs/2502.11866" title="Abstract" id="2502.11866"> arXiv:2502.11866 </a> [<a href="/pdf/2502.11866" title="Download PDF" id="pdf-2502.11866" aria-labelledby="pdf-2502.11866">pdf</a>, <a href="https://arxiv.org/html/2502.11866v1" title="View HTML" id="html-2502.11866" aria-labelledby="html-2502.11866" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11866" title="Other formats" id="oth-2502.11866" aria-labelledby="oth-2502.11866">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Southern Newswire Corpus: A Large-Scale Dataset of Mid-Century Wire Articles Beyond the Front Page </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=McRae,+M">Michael McRae</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> I introduce a 
new large-scale dataset of historical wire articles from U.S. Southern newspapers, spanning 1960-1975 and covering multiple wire services: The Associated Press, United Press International, Newspaper Enterprise Association. Unlike prior work focusing on front-page content, this dataset captures articles across the entire newspaper, offering broader insight into mid-century Southern coverage. The dataset includes a version that has undergone an LLM-based text cleanup pipeline to reduce OCR noise, enhancing its suitability for quantitative text analysis. Additionally, duplicate versions of articles are retained to enable analysis of editorial differences in language and framing across newspapers. Each article is tagged by wire service, facilitating comparative studies of editorial patterns across agencies. This resource opens new avenues for research in computational social science, digital humanities, and historical linguistics, providing a detailed perspective on how Southern newspapers relayed national and international news during a transformative period in American history. The dataset will be made available upon publication or request for research purposes. </p> </div> </dd> <dt> <a name='item176'>[176]</a> <a href ="/abs/2502.11874" title="Abstract" id="2502.11874"> arXiv:2502.11874 </a> [<a href="/pdf/2502.11874" title="Download PDF" id="pdf-2502.11874" aria-labelledby="pdf-2502.11874">pdf</a>, <a href="https://arxiv.org/html/2502.11874v1" title="View HTML" id="html-2502.11874" aria-labelledby="html-2502.11874" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11874" title="Other formats" id="oth-2502.11874" aria-labelledby="oth-2502.11874">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VAQUUM: Are Vague Quantifiers Grounded in Visual Data? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+H+M">Hugh Mee Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nouwen,+R">Rick Nouwen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gatt,+A">Albert Gatt</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ARR ACL 2025, 12 pages for main paper (5 figures), 15 pages including appendix (2 figures) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Vague quantifiers such as "a few" and "many" are influenced by many contextual factors, including how many objects are present in a given context. In this work, we evaluate the extent to which vision-and-language models (VLMs) are compatible with humans when producing or judging the appropriateness of vague quantifiers in visual contexts. We release a novel dataset, VAQUUM, containing 20300 human ratings on quantified statements across a total of 1089 images. Using this dataset, we compare human judgments and VLM predictions using three different evaluation methods. Our findings show that VLMs, like humans, are influenced by object counts in vague quantifier use. However, we find significant inconsistencies across models in different evaluation settings, suggesting that judging and producing vague quantifiers rely on two different processes. 
</p> </div> </dd> <dt> <a name='item177'>[177]</a> <a href ="/abs/2502.11890" title="Abstract" id="2502.11890"> arXiv:2502.11890 </a> [<a href="/pdf/2502.11890" title="Download PDF" id="pdf-2502.11890" aria-labelledby="pdf-2502.11890">pdf</a>, <a href="/format/2502.11890" title="Other formats" id="oth-2502.11890" aria-labelledby="oth-2502.11890">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting Classification Taxonomy for Grammatical Errors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+D">Deqing Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yulu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+B">Bingxu An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yong Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 4 figures and 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Grammatical error classification plays a crucial role in language learning systems, but existing classification taxonomies often lack rigorous validation, leading to inconsistencies and unreliable feedback. 
In this paper, we revisit previous classification taxonomies for grammatical errors by introducing a systematic and qualitative evaluation framework. Our approach examines four aspects of a taxonomy, i.e., exclusivity, coverage, balance, and usability. Then, we construct a high-quality grammatical error classification dataset annotated with multiple classification taxonomies and evaluate them grounded in our proposed evaluation framework. Our experiments reveal the drawbacks of existing taxonomies. Our contributions aim to improve the precision and effectiveness of error analysis, providing more understandable and actionable feedback for language learners. </p> </div> </dd> <dt> <a name='item178'>[178]</a> <a href ="/abs/2502.11901" title="Abstract" id="2502.11901"> arXiv:2502.11901 </a> [<a href="/pdf/2502.11901" title="Download PDF" id="pdf-2502.11901" aria-labelledby="pdf-2502.11901">pdf</a>, <a href="https://arxiv.org/html/2502.11901v1" title="View HTML" id="html-2502.11901" aria-labelledby="html-2502.11901" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11901" title="Other formats" id="oth-2502.11901" aria-labelledby="oth-2502.11901">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Building A Proof-Oriented Programmer That Is 64% Better Than GPT-4o Under Data Scarsity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Dylan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Justin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianran Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Programming Languages (cs.PL); Software Engineering (cs.SE) </div> <p class='mathjax'> Existing LMs struggle with proof-oriented programming due to data scarcity, which 
manifests in two key ways: (1) a lack of sufficient corpora for proof-oriented programming languages such as F*, and (2) the absence of large-scale, project-level proof-oriented implementations that can teach the model the intricate reasoning process when performing proof-oriented programming. We present the first work on synthetic data augmentation for project-level proof-oriented programming for both generation and repair. Our method addresses data scarcity by synthesizing basic proof-oriented programming problems for proficiency in that language; incorporating diverse coding data for reasoning capability elicitation and creating new proofs and repair data within existing repositories. This approach enables language models to both synthesize and repair proofs for function- and repository-level code. We show that our fine-tuned 14B parameter model, PoPilot, can exceed the performance of the models that outperform GPT-4o in project-level proof-oriented programming by a 64% relative margin, and can improve GPT-4o's performance by 54% by repairing its outputs over GPT-4o's self-repair. 
</p> </div> </dd> <dt> <a name='item179'>[179]</a> <a href ="/abs/2502.11903" title="Abstract" id="2502.11903"> arXiv:2502.11903 </a> [<a href="/pdf/2502.11903" title="Download PDF" id="pdf-2502.11903" aria-labelledby="pdf-2502.11903">pdf</a>, <a href="https://arxiv.org/html/2502.11903v1" title="View HTML" id="html-2502.11903" aria-labelledby="html-2502.11903" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11903" title="Other formats" id="oth-2502.11903" aria-labelledby="oth-2502.11903">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMRC: A Large-Scale Benchmark for Understanding Multimodal Large Language Model in Real-World Conversation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+H">Haochen Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+F">Feilong Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Ming Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yexin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Q">Qidong Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yulong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengzhi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhongxing Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chun-Mei Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yutong Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Razzak,+I">Imran Razzak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zongyuan Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jionglong Su</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Junjun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+Y">Yu Qiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent multimodal large language models (MLLMs) have demonstrated significant potential in open-ended conversation, generating more accurate and personalized responses. However, their abilities to memorize, recall, and reason in sustained interactions within real-world scenarios remain underexplored. This paper introduces MMRC, a Multi-Modal Real-world Conversation benchmark for evaluating six core open-ended abilities of MLLMs: information extraction, multi-turn reasoning, information update, image management, memory recall, and answer refusal. With data collected from real-world scenarios, MMRC comprises 5,120 conversations and 28,720 corresponding manually labeled questions, posing a significant challenge to existing MLLMs. Evaluations on 20 MLLMs in MMRC indicate an accuracy drop during open-ended interactions. We identify four common failure patterns: long-term memory degradation, inadequacies in updating factual knowledge, accumulated assumption of error propagation, and reluctance to say no. To mitigate these issues, we propose a simple yet effective NOTE-TAKING strategy, which can record key information from the conversation and remind the model during its responses, enhancing conversational capabilities. Experiments across six MLLMs demonstrate significant performance improvements. 
</p> </div> </dd> <dt> <a name='item180'>[180]</a> <a href ="/abs/2502.11916" title="Abstract" id="2502.11916"> arXiv:2502.11916 </a> [<a href="/pdf/2502.11916" title="Download PDF" id="pdf-2502.11916" aria-labelledby="pdf-2502.11916">pdf</a>, <a href="https://arxiv.org/html/2502.11916v1" title="View HTML" id="html-2502.11916" aria-labelledby="html-2502.11916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11916" title="Other formats" id="oth-2502.11916" aria-labelledby="oth-2502.11916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EssayJudge: A Multi-Granular Benchmark for Assessing Automated Essay Scoring Capabilities of Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jiamin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yibo Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+F">Fangteng Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Han Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jiahao Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Huiyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> JS and YY are co-first authors. 
XH is the corresponding author </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Automated Essay Scoring (AES) plays a crucial role in educational assessment by providing scalable and consistent evaluations of writing tasks. However, traditional AES systems face three major challenges: (1) reliance on handcrafted features that limit generalizability, (2) difficulty in capturing fine-grained traits like coherence and argumentation, and (3) inability to handle multimodal contexts. In the era of Multimodal Large Language Models (MLLMs), we propose EssayJudge, the first multimodal benchmark to evaluate AES capabilities across lexical-, sentence-, and discourse-level traits. By leveraging MLLMs' strengths in trait-specific scoring and multimodal context understanding, EssayJudge aims to offer precise, context-rich evaluations without manual feature engineering, addressing longstanding AES limitations. Our experiments with 18 representative MLLMs reveal gaps in AES performance compared to human evaluation, particularly in discourse-level traits, highlighting the need for further advancements in MLLM-based AES research. Our dataset and code will be available upon acceptance. 
</p> </div> </dd> <dt> <a name='item181'>[181]</a> <a href ="/abs/2502.11926" title="Abstract" id="2502.11926"> arXiv:2502.11926 </a> [<a href="/pdf/2502.11926" title="Download PDF" id="pdf-2502.11926" aria-labelledby="pdf-2502.11926">pdf</a>, <a href="https://arxiv.org/html/2502.11926v1" title="View HTML" id="html-2502.11926" aria-labelledby="html-2502.11926" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11926" title="Other formats" id="oth-2502.11926" aria-labelledby="oth-2502.11926">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BRIGHTER: BRIdging the Gap in Human-Annotated Textual Emotion Recognition Datasets for 28 Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Muhammad,+S+H">Shamsuddeen Hassan Muhammad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ousidhoum,+N">Nedjma Ousidhoum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdulmumin,+I">Idris Abdulmumin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wahle,+J+P">Jan Philip Wahle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruas,+T">Terry Ruas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beloucif,+M">Meriem Beloucif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=de+Kock,+C">Christine de Kock</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Surange,+N">Nirmal Surange</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teodorescu,+D">Daniela Teodorescu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmad,+I+S">Ibrahim Said Ahmad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adelani,+D+I">David Ifeoluwa Adelani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ali,+F+D+M+A">Felermino D. M. A. Ali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alimova,+I">Ilseyar Alimova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Araujo,+V">Vladimir Araujo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Babakov,+N">Nikolay Babakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baes,+N">Naomi Baes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bucur,+A">Ana-Maria Bucur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bukula,+A">Andiswa Bukula</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+G">Guanqun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cardenas,+R+T">Rodrigo Tufino Cardenas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chevi,+R">Rendi Chevi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chukwuneke,+C+I">Chiamaka Ijeoma Chukwuneke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ciobotaru,+A">Alexandra Ciobotaru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dementieva,+D">Daryna Dementieva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gadanya,+M+S">Murja Sani Gadanya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Geislinger,+R">Robert Geislinger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gipp,+B">Bela Gipp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hourrane,+O">Oumaima Hourrane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ignat,+O">Oana Ignat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lawan,+F+I">Falalu Ibrahim Lawan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mabuya,+R">Rooweither Mabuya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahendra,+R">Rahmad Mahendra</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Marivate,+V">Vukosi Marivate</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Piper,+A">Andrew Piper</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panchenko,+A">Alexander Panchenko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferreira,+C+H+P">Charles Henrique Porto Ferreira</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Protasov,+V">Vitaly Protasov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rutunda,+S">Samuel Rutunda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shrivastava,+M">Manish Shrivastava</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Udrea,+A+C">Aura Cristina Udrea</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wanzare,+L+D+A">Lilian Diana Awuor Wanzare</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Sophie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wunderlich,+F+V">Florian Valentin Wunderlich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhafran,+H+M">Hanif Muhammad Zhafran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tianhui Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammad,+S+M">Saif M. Mohammad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> People worldwide use language in subtle and complex ways to express emotions. 
While emotion recognition -- an umbrella term for several NLP tasks -- significantly impacts different applications in NLP and other fields, most work in the area is focused on high-resource languages. This has led to major disparities in research and proposed solutions, especially for low-resource languages that suffer from the lack of high-quality datasets. In this paper, we present BRIGHTER -- a collection of multilabeled emotion-annotated datasets in 28 different languages. BRIGHTER covers predominantly low-resource languages from Africa, Asia, Eastern Europe, and Latin America, with instances from various domains annotated by fluent speakers. We describe the data collection and annotation processes and the challenges of building these datasets. Then, we report different experimental results for monolingual and crosslingual multi-label emotion identification, as well as intensity-level emotion recognition. We investigate results with and without using LLMs and analyse the large variability in performance across languages and text domains. We show that BRIGHTER datasets are a step towards bridging the gap in text-based emotion recognition and discuss their impact and utility. 
</p> </div> </dd> <dt> <a name='item182'>[182]</a> <a href ="/abs/2502.11932" title="Abstract" id="2502.11932"> arXiv:2502.11932 </a> [<a href="/pdf/2502.11932" title="Download PDF" id="pdf-2502.11932" aria-labelledby="pdf-2502.11932">pdf</a>, <a href="https://arxiv.org/html/2502.11932v1" title="View HTML" id="html-2502.11932" aria-labelledby="html-2502.11932" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11932" title="Other formats" id="oth-2502.11932" aria-labelledby="oth-2502.11932">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On Representational Dissociation of Language and Arithmetic in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kisako,+R">Riku Kisako</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasano,+R">Ryohei Sasano</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The association between language and (non-linguistic) thinking ability in humans has long been debated, and recently, neuroscientific evidence of brain activity patterns has been considered. Such a scientific context naturally raises an interdisciplinary question -- what about such a language-thought dissociation in large language models (LLMs)? In this paper, as an initial foray, we explore this question by focusing on simple arithmetic skills (e.g., $1+2=$ ?) as a thinking ability and analyzing the geometry of their encoding in LLMs' representation space. 
Our experiments with linear classifiers and cluster separability tests demonstrate that simple arithmetic equations and general language input are encoded in completely separated regions in LLMs' internal representation space across all the layers, which is also supported with more controlled stimuli (e.g., spelled-out equations). These tentatively suggest that arithmetic reasoning is mapped into a distinct region from general language input, which is in line with the neuroscientific observations of human brain activations, while we also point out their somewhat cognitively implausible geometric properties. </p> </div> </dd> <dt> <a name='item183'>[183]</a> <a href ="/abs/2502.11946" title="Abstract" id="2502.11946"> arXiv:2502.11946 </a> [<a href="/pdf/2502.11946" title="Download PDF" id="pdf-2502.11946" aria-labelledby="pdf-2502.11946">pdf</a>, <a href="https://arxiv.org/html/2502.11946v1" title="View HTML" id="html-2502.11946" aria-labelledby="html-2502.11946" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11946" title="Other formats" id="oth-2502.11946" aria-labelledby="oth-2502.11946">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+A">Ailin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Boyong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bruce Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+C">Chao Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+C">Chen Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+F">Fei Tian</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+F">Feiyu Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingbei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mingrui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Peng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+R">Ruihang Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=You,+W">Wang You</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xuerui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yechang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuxiang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Zheng Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zixin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Brian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Song Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bingxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Buyun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+W">Wei Ji</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xuan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yuankai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yuanwei Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mou,+Y">Yun Mou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmidi,+B">Bahtiyar Ahmidi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Changxin Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengting Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chenrun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Deshan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sai,+D">Dula Sai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gulin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haoyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+J">Jiahao Gong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jianchang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiahong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhen,+J">Jiangjie Zhen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jinguo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+L">Longlong Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+M">Menglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingxiao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+M">Mingyao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+N">Nie Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiling Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Q">Qinyuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+W">Wenjin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wenqing He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Wen Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xin Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+X">Xiaomin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaojia Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yanan Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yanbo Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangguang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yangzhen Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yanming Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yaqiang Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yilei Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Y">Yinmin Zhong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, Step-Audio shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at <a href="https://github.com/stepfun-ai/Step-Audio" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item184'>[184]</a> <a href ="/abs/2502.11948" title="Abstract" id="2502.11948"> arXiv:2502.11948 </a> [<a href="/pdf/2502.11948" title="Download PDF" id="pdf-2502.11948" aria-labelledby="pdf-2502.11948">pdf</a>, <a href="https://arxiv.org/html/2502.11948v1" title="View HTML" id="html-2502.11948" aria-labelledby="html-2502.11948" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11948" title="Other formats" id="oth-2502.11948" aria-labelledby="oth-2502.11948">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Your Uncertainty Scores Detect Hallucinated Entity? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yeh,+M">Min-Hsuan Yeh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamachee,+M">Max Kamachee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+S">Seongheon Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yixuan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To mitigate the impact of hallucination nature of LLMs, many studies propose detecting hallucinated generation through uncertainty estimation. However, these approaches predominantly operate at the sentence or paragraph level, failing to pinpoint specific spans or entities responsible for hallucinated content. This lack of granularity is especially problematic for long-form outputs that mix accurate and fabricated information. To address this limitation, we explore entity-level hallucination detection. We propose a new data set, HalluEntity, which annotates hallucination at the entity level. Based on the dataset, we comprehensively evaluate uncertainty-based hallucination detection approaches across 17 modern LLMs. 
Our experimental results show that uncertainty estimation approaches focusing on individual token probabilities tend to over-predict hallucinations, while context-aware methods show better but still suboptimal performance. Through an in-depth qualitative study, we identify relationships between hallucination tendencies and linguistic properties and highlight important directions for future research. </p> </div> </dd> <dt> <a name='item185'>[185]</a> <a href ="/abs/2502.11962" title="Abstract" id="2502.11962"> arXiv:2502.11962 </a> [<a href="/pdf/2502.11962" title="Download PDF" id="pdf-2502.11962" aria-labelledby="pdf-2502.11962">pdf</a>, <a href="https://arxiv.org/html/2502.11962v1" title="View HTML" id="html-2502.11962" aria-labelledby="html-2502.11962" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11962" title="Other formats" id="oth-2502.11962" aria-labelledby="oth-2502.11962">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Navigating the Helpfulness-Truthfulness Trade-Off with Uncertainty-Aware Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+T">Tianyi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+J">Jingwei Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hooi,+B">Bryan Hooi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiaheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ash,+E">Elliott Ash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+S">See-Kiong Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sachan,+M">Mrinmaya Sachan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leippold,+M">Markus Leippold</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language 
(cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Instruction Fine-tuning (IFT) can enhance the helpfulness of Large Language Models (LLMs), but it may lower their truthfulness. This trade-off arises because IFT steers LLMs to generate responses with long-tail knowledge that is not well covered during pre-training, leading to more informative but less truthful answers when generalizing to unseen tasks. In this paper, we empirically demonstrate this helpfulness-truthfulness trade-off in IFT and propose $\textbf{UNIT}$, a novel IFT paradigm to address it. UNIT teaches LLMs to recognize their uncertainty and explicitly reflect it at the end of their responses. Experimental results show that UNIT-tuned models maintain their helpfulness while distinguishing between certain and uncertain claims, thereby reducing hallucinations. </p> </div> </dd> <dt> <a name='item186'>[186]</a> <a href ="/abs/2502.11973" title="Abstract" id="2502.11973"> arXiv:2502.11973 </a> [<a href="/pdf/2502.11973" title="Download PDF" id="pdf-2502.11973" aria-labelledby="pdf-2502.11973">pdf</a>, <a href="https://arxiv.org/html/2502.11973v1" title="View HTML" id="html-2502.11973" aria-labelledby="html-2502.11973" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11973" title="Other formats" id="oth-2502.11973" aria-labelledby="oth-2502.11973">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Generating Text from Uniform Meaning Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Markle,+E">Emma Markle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Iranmanesh,+R">Reihaneh Iranmanesh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wein,+S">Shira Wein</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> 
</div> <p class='mathjax'> Uniform Meaning Representation (UMR) is a recently developed graph-based semantic representation, which expands on Abstract Meaning Representation (AMR) in a number of ways, in particular through the inclusion of document-level information and multilingual flexibility. In order to effectively adopt and leverage UMR for downstream tasks, efforts must be placed toward developing a UMR technological ecosystem. Though still limited amounts of UMR annotations have been produced to date, in this work, we investigate the first approaches to producing text from multilingual UMR graphs: (1) a pipeline conversion of UMR to AMR, then using AMR-to-text generation models, (2) fine-tuning large language models with UMR data, and (3) fine-tuning existing AMR-to-text generation models with UMR data. Our best performing model achieves a multilingual BERTscore of 0.825 for English and 0.882 for Chinese when compared to the reference, which is a promising indication of the effectiveness of fine-tuning approaches for UMR-to-text generation with even limited amounts of UMR data. 
</p> </div> </dd> <dt> <a name='item187'>[187]</a> <a href ="/abs/2502.11995" title="Abstract" id="2502.11995"> arXiv:2502.11995 </a> [<a href="/pdf/2502.11995" title="Download PDF" id="pdf-2502.11995" aria-labelledby="pdf-2502.11995">pdf</a>, <a href="https://arxiv.org/html/2502.11995v1" title="View HTML" id="html-2502.11995" aria-labelledby="html-2502.11995" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11995" title="Other formats" id="oth-2502.11995" aria-labelledby="oth-2502.11995">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Presumed Cultural Identity: How Names Shape LLM Responses </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pawar,+S">Siddhesh Pawar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arora,+A">Arnav Arora</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaffee,+L">Lucie-Aimée Kaffee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Augenstein,+I">Isabelle Augenstein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 Pages, 13 Figures, 4 Tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Names are deeply tied to human identity. They can serve as markers of individuality, cultural heritage, and personal history. However, using names as a core indicator of identity can lead to over-simplification of complex identities. When interacting with LLMs, user names are an important point of information for personalisation. Names can enter chatbot conversations through direct user input (requested by chatbots), as part of task contexts such as CV reviews, or as built-in memory features that store user information for personalisation. 
We study biases associated with names by measuring cultural presumptions in the responses generated by LLMs when presented with common suggestion-seeking queries, which might involve making assumptions about the user. Our analyses demonstrate strong assumptions about cultural identity associated with names present in LLM generations across multiple cultures. Our work has implications for designing more nuanced personalisation systems that avoid reinforcing stereotypes while maintaining meaningful customisation. </p> </div> </dd> <dt> <a name='item188'>[188]</a> <a href ="/abs/2502.12001" title="Abstract" id="2502.12001"> arXiv:2502.12001 </a> [<a href="/pdf/2502.12001" title="Download PDF" id="pdf-2502.12001" aria-labelledby="pdf-2502.12001">pdf</a>, <a href="https://arxiv.org/html/2502.12001v1" title="View HTML" id="html-2502.12001" aria-labelledby="html-2502.12001" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12001" title="Other formats" id="oth-2502.12001" aria-labelledby="oth-2502.12001">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Merging Language and Domain Specific Models: The Impact on Technical Vocabulary Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rousset,+T">Thibault Rousset</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kakibuchi,+T">Taisei Kakibuchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasaki,+Y">Yusuke Sasaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nomura,+Y">Yoshihide Nomura</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Presented at the 263rd IPSJ-NL Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> This paper investigates the 
integration of technical vocabulary in merged language models. We explore the knowledge transfer mechanisms involved when combining a general-purpose language-specific model with a domain-specific model, focusing on the resulting model's comprehension of technical jargon. Our experiments analyze the impact of this merging process on the target model's proficiency in handling specialized terminology. We present a quantitative evaluation of the performance of the merged model, comparing it with that of the individual constituent models. The findings offer insights into the effectiveness of different model merging methods for enhancing domain-specific knowledge and highlight potential challenges and future directions in leveraging these methods for cross-lingual knowledge transfer in Natural Language Processing. </p> </div> </dd> <dt> <a name='item189'>[189]</a> <a href ="/abs/2502.12007" title="Abstract" id="2502.12007"> arXiv:2502.12007 </a> [<a href="/pdf/2502.12007" title="Download PDF" id="pdf-2502.12007" aria-labelledby="pdf-2502.12007">pdf</a>, <a href="https://arxiv.org/html/2502.12007v1" title="View HTML" id="html-2502.12007" aria-labelledby="html-2502.12007" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12007" title="Other formats" id="oth-2502.12007" aria-labelledby="oth-2502.12007">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Demographic Attributes Prediction from Speech Using WavLM Embeddings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yuchen Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thebaud,+T">Thomas Thebaud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehak,+N">Najim Dehak</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, accepted by The Conference on Information Sciences and Systems (CISS) </div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper introduces a general classifier based on WavLM features, to infer demographic characteristics, such as age, gender, native language, education, and country, from speech. Demographic feature prediction plays a crucial role in applications like language learning, accessibility, and digital forensics, enabling more personalized and inclusive technologies. Leveraging pretrained models for embedding extraction, the proposed framework identifies key acoustic and linguistic features associated with demographic attributes, achieving a Mean Absolute Error (MAE) of 4.94 for age prediction and over 99.81% accuracy for gender classification across various datasets. Our system improves upon existing models by up to relative 30% in MAE and up to relative 10% in accuracy and F1 scores across tasks, leveraging a diverse range of datasets and large pretrained models to ensure robustness and generalizability. This study offers new insights into speaker diversity and provides a strong foundation for future research in speech-based demographic profiling. 
</p> </div> </dd> <dt> <a name='item190'>[190]</a> <a href ="/abs/2502.12018" title="Abstract" id="2502.12018"> arXiv:2502.12018 </a> [<a href="/pdf/2502.12018" title="Download PDF" id="pdf-2502.12018" aria-labelledby="pdf-2502.12018">pdf</a>, <a href="https://arxiv.org/html/2502.12018v1" title="View HTML" id="html-2502.12018" aria-labelledby="html-2502.12018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12018" title="Other formats" id="oth-2502.12018" aria-labelledby="oth-2502.12018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Atom of Thoughts for Markov LLM Test-Time Scaling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fengwei Teng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhaoyang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Q">Quan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiayi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuyu Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) achieve superior performance through training-time scaling, and test-time scaling further enhances their capabilities by conducting effective reasoning during inference. However, as the scale of reasoning increases, existing test-time scaling methods suffer from accumulated historical information, which not only wastes computational resources but also interferes with effective reasoning. 
To address this issue, we observe that complex reasoning progress is often achieved by solving a sequence of independent subquestions, each being self-contained and verifiable. These subquestions are essentially atomic questions, relying primarily on their current state rather than accumulated history, similar to the memoryless transitions in a Markov process. Based on this observation, we propose Atom of Thoughts (AoT), where each state transition in the reasoning process consists of decomposing the current question into a dependency-based directed acyclic graph and contracting its subquestions, forming a new atomic question state. This iterative decomposition-contraction process continues until reaching directly solvable atomic questions, naturally realizing Markov transitions between question states. Furthermore, these atomic questions can be seamlessly integrated into existing test-time scaling methods, enabling AoT to serve as a plug-in enhancement for improving reasoning capabilities. Experiments across six benchmarks demonstrate the effectiveness of AoT both as a standalone framework and a plug-in enhancement. Notably, on HotpotQA, when applied to gpt-4o-mini, AoT achieves an 80.6% F1 score, surpassing o3-mini by 3.4% and DeepSeek-R1 by 10.6%. The code will be available at <a href="https://github.com/qixucen/atom" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item191'>[191]</a> <a href ="/abs/2502.12022" title="Abstract" id="2502.12022"> arXiv:2502.12022 </a> [<a href="/pdf/2502.12022" title="Download PDF" id="pdf-2502.12022" aria-labelledby="pdf-2502.12022">pdf</a>, <a href="https://arxiv.org/html/2502.12022v1" title="View HTML" id="html-2502.12022" aria-labelledby="html-2502.12022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12022" title="Other formats" id="oth-2502.12022" aria-labelledby="oth-2502.12022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Teaching LLMs According to Their Aptitude: Adaptive Reasoning for Mathematical Problem Solving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianhao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengwu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zaoyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yufei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yichun Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+L">Lifeng Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qun Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Existing 
approaches to mathematical reasoning with large language models (LLMs) rely on Chain-of-Thought (CoT) for generalizability or Tool-Integrated Reasoning (TIR) for precise computation. While efforts have been made to combine these methods, they primarily rely on post-selection or predefined strategies, leaving an open question: whether LLMs can autonomously adapt their reasoning strategy based on their inherent capabilities. In this work, we propose TATA (Teaching LLMs According to Their Aptitude), an adaptive framework that enables LLMs to personalize their reasoning strategy spontaneously, aligning it with their intrinsic aptitude. TATA incorporates base-LLM-aware data selection during supervised fine-tuning (SFT) to tailor training data to the model's unique abilities. This approach equips LLMs to autonomously determine and apply the appropriate reasoning strategy at test time. We evaluate TATA through extensive experiments on six mathematical reasoning benchmarks, using both general-purpose and math-specialized LLMs. Empirical results demonstrate that TATA effectively combines the complementary strengths of CoT and TIR, achieving superior or comparable performance with improved inference efficiency compared to TIR alone. Further analysis underscores the critical role of aptitude-aware data selection in enabling LLMs to make effective and adaptive reasoning decisions and align reasoning strategies with model capabilities. 
</p> </div> </dd> <dt> <a name='item192'>[192]</a> <a href ="/abs/2502.12050" title="Abstract" id="2502.12050"> arXiv:2502.12050 </a> [<a href="/pdf/2502.12050" title="Download PDF" id="pdf-2502.12050" aria-labelledby="pdf-2502.12050">pdf</a>, <a href="https://arxiv.org/html/2502.12050v1" title="View HTML" id="html-2502.12050" aria-labelledby="html-2502.12050" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12050" title="Other formats" id="oth-2502.12050" aria-labelledby="oth-2502.12050">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SpeechT: Findings of the First Mentorship in Speech Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moslem,+Y">Yasmin Moslem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mor%C3%A1n,+J+J+C">Juan Julián Cea Morán</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gonzalez-Gomez,+M">Mariano Gonzalez-Gomez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Farouq,+M+H+A">Muhammad Hazim Al Farouq</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdou,+F">Farah Abdou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deb,+S">Satarupa Deb</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD) </div> <p class='mathjax'> This work presents the details and findings of the first mentorship in speech translation (SpeechT), which took place in December 2024 and January 2025. To fulfil the requirements of the mentorship, the participants engaged in key activities, including data preparation, modelling, and advanced research. 
</p> </div> </dd> <dt> <a name='item193'>[193]</a> <a href ="/abs/2502.12051" title="Abstract" id="2502.12051"> arXiv:2502.12051 </a> [<a href="/pdf/2502.12051" title="Download PDF" id="pdf-2502.12051" aria-labelledby="pdf-2502.12051">pdf</a>, <a href="https://arxiv.org/html/2502.12051v1" title="View HTML" id="html-2502.12051" aria-labelledby="html-2502.12051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12051" title="Other formats" id="oth-2502.12051" aria-labelledby="oth-2502.12051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How to Upscale Neural Networks with Scaling Law? A Survey and Practical Guidelines </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sengupta,+A">Ayan Sengupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goel,+Y">Yash Goel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chakraborty,+T">Tanmoy Chakraborty</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 8 tables, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Neural scaling laws have revolutionized the design and optimization of large-scale AI models by revealing predictable relationships between model size, dataset volume, and computational resources. Early research established power-law relationships in model performance, leading to compute-optimal scaling strategies. However, recent studies highlighted their limitations across architectures, modalities, and deployment contexts. Sparse models, mixture-of-experts, retrieval-augmented learning, and multimodal models often deviate from traditional scaling patterns. 
Moreover, scaling behaviors vary across domains such as vision, reinforcement learning, and fine-tuning, underscoring the need for more nuanced approaches. In this survey, we synthesize insights from over 50 studies, examining the theoretical foundations, empirical findings, and practical implications of scaling laws. We also explore key challenges, including data efficiency, inference scaling, and architecture-specific constraints, advocating for adaptive scaling strategies tailored to real-world applications. We suggest that while scaling laws provide a useful guide, they do not always generalize across all architectures and training strategies. </p> </div> </dd> <dt> <a name='item194'>[194]</a> <a href ="/abs/2502.12052" title="Abstract" id="2502.12052"> arXiv:2502.12052 </a> [<a href="/pdf/2502.12052" title="Download PDF" id="pdf-2502.12052" aria-labelledby="pdf-2502.12052">pdf</a>, <a href="https://arxiv.org/html/2502.12052v1" title="View HTML" id="html-2502.12052" aria-labelledby="html-2502.12052" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12052" title="Other formats" id="oth-2502.12052" aria-labelledby="oth-2502.12052">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Dual-Perspective NLG Meta-Evaluation Framework with Automatic Benchmark and Better Interpretability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xinyu Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+M">Mingqi Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+L">Li Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhenghan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiaojun Wan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In NLG meta-evaluation, evaluation metrics are typically assessed based on their consistency with humans. However, we identify some limitations in traditional NLG meta-evaluation approaches, such as issues in handling human ratings and ambiguous selections of correlation measures, which undermine the effectiveness of meta-evaluation. In this work, we propose a dual-perspective NLG meta-evaluation framework that focuses on different evaluation capabilities, thereby providing better interpretability. In addition, we introduce a method of automatically constructing the corresponding benchmarks without requiring new human annotations. Furthermore, we conduct experiments with 16 representative LLMs as the evaluators based on our proposed framework, comprehensively analyzing their evaluation performance from different perspectives. </p> </div> </dd> <dt> <a name='item195'>[195]</a> <a href ="/abs/2502.12055" title="Abstract" id="2502.12055"> arXiv:2502.12055 </a> [<a href="/pdf/2502.12055" title="Download PDF" id="pdf-2502.12055" aria-labelledby="pdf-2502.12055">pdf</a>, <a href="https://arxiv.org/html/2502.12055v1" title="View HTML" id="html-2502.12055" aria-labelledby="html-2502.12055" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12055" title="Other formats" id="oth-2502.12055" aria-labelledby="oth-2502.12055">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Designing Role Vectors to Improve LLM Inference Behaviour </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Potert%C3%AC,+D">Daniele Potertì</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seveso,+A">Andrea Seveso</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mercorio,+F">Fabio Mercorio</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ARR 2025 February cycle </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The influence of personas on Large Language Models (LLMs) has been widely studied, yet their direct impact on performance remains uncertain. This work explores a novel approach to guiding LLM behaviour through role vectors, an alternative to persona-based prompting. We construct 29 role vectors derived from model activations and evaluate their impact on benchmark performance across multiple domains. Our analysis investigates whether these vectors can effectively steer models toward domain-specific expertise. We measure two key interventions: (i) activation addition, which reinforces role-specific directions, and (ii) directional ablation, which removes them. Results on well-established benchmarks indicate that role vectors do, in fact, influence model behaviour, improving task performance in relevant domains while marginally affecting unrelated tasks. This, in turn, suggests that manipulating internal model representations has a greater impact on outcomes than persona-based prompting. 
</p> </div> </dd> <dt> <a name='item196'>[196]</a> <a href ="/abs/2502.12057" title="Abstract" id="2502.12057"> arXiv:2502.12057 </a> [<a href="/pdf/2502.12057" title="Download PDF" id="pdf-2502.12057" aria-labelledby="pdf-2502.12057">pdf</a>, <a href="https://arxiv.org/html/2502.12057v1" title="View HTML" id="html-2502.12057" aria-labelledby="html-2502.12057" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12057" title="Other formats" id="oth-2502.12057" aria-labelledby="oth-2502.12057">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Culture is Not Trivia: Sociocultural Theory for Cultural NLP </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+N">Naitian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bamman,+D">David Bamman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bleaman,+I+L">Isaac L. Bleaman</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In submission </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> The field of cultural NLP has recently experienced rapid growth, driven by a pressing need to ensure that language technologies are effective and safe across a pluralistic user base. This work has largely progressed without a shared conception of culture, instead choosing to rely on a wide array of cultural proxies. However, this leads to a number of recurring limitations: coarse national boundaries fail to capture nuanced differences that lay within them, limited coverage restricts datasets to only a subset of usually highly-represented cultures, and a lack of dynamicity results in static cultural benchmarks that do not change as culture evolves. 
In this position paper, we argue that these methodological limitations are symptomatic of a theoretical gap. We draw on a well-developed theory of culture from sociocultural linguistics to fill this gap by 1) demonstrating in a case study how it can clarify methodological constraints and affordances, 2) offering theoretically-motivated paths forward to achieving cultural competence, and 3) arguing that localization is a more useful framing for the goals of much current work in cultural NLP. </p> </div> </dd> <dt> <a name='item197'>[197]</a> <a href ="/abs/2502.12064" title="Abstract" id="2502.12064"> arXiv:2502.12064 </a> [<a href="/pdf/2502.12064" title="Download PDF" id="pdf-2502.12064" aria-labelledby="pdf-2502.12064">pdf</a>, <a href="https://arxiv.org/html/2502.12064v1" title="View HTML" id="html-2502.12064" aria-labelledby="html-2502.12064" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12064" title="Other formats" id="oth-2502.12064" aria-labelledby="oth-2502.12064">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AI-generated Text Detection with a GLTR-based Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+L+Y">Lucía Yan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Segura-Bedmar,+I">Isabel Segura-Bedmar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The rise of LLMs (Large Language Models) has contributed to the improved performance and development of cutting-edge NLP applications. However, these can also pose risks when used maliciously, such as spreading fake news, harmful content, impersonating individuals, or facilitating school plagiarism, among others. 
This is because LLMs can generate high-quality texts, which are challenging to differentiate from those written by humans. GLTR, which stands for Giant Language Model Test Room and was developed jointly by the MIT-IBM Watson AI Lab and HarvardNLP, is a visual tool designed to help detect machine-generated texts based on GPT-2, that highlights the words in text depending on the probability that they were machine-generated. One limitation of GLTR is that the results it returns can sometimes be ambiguous and lead to confusion. This study aims to explore various ways to improve GLTR's effectiveness for detecting AI-generated texts within the context of the IberLef-AuTexTification 2023 shared task, in both English and Spanish languages. Experiment results show that our GLTR-based GPT-2 model overcomes the state-of-the-art models on the English dataset with a macro F1-score of 80.19%, except for the first ranking model (80.91%). However, for the Spanish dataset, we obtained a macro F1-score of 66.20%, which differs by 4.57% compared to the top-performing model. 
</p> </div> </dd> <dt> <a name='item198'>[198]</a> <a href ="/abs/2502.12065" title="Abstract" id="2502.12065"> arXiv:2502.12065 </a> [<a href="/pdf/2502.12065" title="Download PDF" id="pdf-2502.12065" aria-labelledby="pdf-2502.12065">pdf</a>, <a href="https://arxiv.org/html/2502.12065v1" title="View HTML" id="html-2502.12065" aria-labelledby="html-2502.12065" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12065" title="Other formats" id="oth-2502.12065" aria-labelledby="oth-2502.12065">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Formalizing Complex Mathematical Statements with LLMs: A Study on Mathematical Definitions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">Andre Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Formal Languages and Automata Theory (cs.FL) </div> <p class='mathjax'> Thanks to their linguistic capabilities, LLMs offer an opportunity to bridge the gap between informal mathematics and formal languages through autoformalization. However, it is still unclear how well LLMs generalize to sophisticated and naturally occurring mathematical statements. To address this gap, we investigate the task of autoformalizing real-world mathematical definitions -- a critical component of mathematical discourse. Specifically, we introduce two novel resources for autoformalisation, collecting definitions from Wikipedia (Def_Wiki) and arXiv papers (Def_ArXiv). We then systematically evaluate a range of LLMs, analyzing their ability to formalize definitions into Isabelle/HOL. 
Furthermore, we investigate strategies to enhance LLMs' performance including refinement through external feedback from Proof Assistants, and formal definition grounding, where we guide LLMs through relevant contextual elements from formal mathematical libraries. Our findings reveal that definitions present a greater challenge compared to existing benchmarks, such as miniF2F. In particular, we found that LLMs still struggle with self-correction, and aligning with relevant mathematical libraries. At the same time, structured refinement methods and definition grounding strategies yield notable improvements of up to 16% on self-correction capabilities and 43% on the reduction of undefined errors, highlighting promising directions for enhancing LLM-based autoformalization in real-world scenarios. </p> </div> </dd> <dt> <a name='item199'>[199]</a> <a href ="/abs/2502.12067" title="Abstract" id="2502.12067"> arXiv:2502.12067 </a> [<a href="/pdf/2502.12067" title="Download PDF" id="pdf-2502.12067" aria-labelledby="pdf-2502.12067">pdf</a>, <a href="https://arxiv.org/html/2502.12067v1" title="View HTML" id="html-2502.12067" aria-labelledby="html-2502.12067" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12067" title="Other formats" id="oth-2502.12067" aria-labelledby="oth-2502.12067">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TokenSkip: Controllable Chain-of-Thought Compression in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+H">Heming Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yongqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leong,+C+T">Chak Tou Leong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenjie Li</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Chain-of-Thought (CoT) has been proven effective in enhancing the reasoning capabilities of large language models (LLMs). Recent advancements, such as OpenAI's o1 and DeepSeek-R1, suggest that scaling up the length of CoT sequences during inference could further boost LLM reasoning performance. However, due to the autoregressive nature of LLM decoding, longer CoT outputs lead to a linear increase in inference latency, adversely affecting user experience, particularly when the CoT exceeds 10,000 tokens. To address this limitation, we analyze the semantic importance of tokens within CoT outputs and reveal that their contributions to reasoning vary. Building on this insight, we propose TokenSkip, a simple yet effective approach that enables LLMs to selectively skip less important tokens, allowing for controllable CoT compression. Extensive experiments across various models and tasks demonstrate the effectiveness of TokenSkip in reducing CoT token usage while preserving strong reasoning performance. Notably, when applied to Qwen2.5-14B-Instruct, TokenSkip reduces reasoning tokens by 40% (from 313 to 181) on GSM8K, with less than a 0.4% performance drop. 
</p> </div> </dd> <dt> <a name='item200'>[200]</a> <a href ="/abs/2502.12073" title="Abstract" id="2502.12073"> arXiv:2502.12073 </a> [<a href="/pdf/2502.12073" title="Download PDF" id="pdf-2502.12073" aria-labelledby="pdf-2502.12073">pdf</a>, <a href="https://arxiv.org/html/2502.12073v1" title="View HTML" id="html-2502.12073" aria-labelledby="html-2502.12073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12073" title="Other formats" id="oth-2502.12073" aria-labelledby="oth-2502.12073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLMs Simulate Social Media Engagement? A Study on Action-Guided Response Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Z">Zhongyi Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+H">Hanjia Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+W">Wei Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiebo Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Social media enables dynamic user engagement with trending topics, and recent research has explored the potential of large language models (LLMs) for response generation. While some studies investigate LLMs as agents for simulating user behavior on social media, their focus remains on practical viability and scalability rather than a deeper understanding of how well LLM aligns with human behavior. This paper analyzes LLMs' ability to simulate social media engagement through action guided response generation, where a model first predicts a user's most likely engagement action -- retweet, quote, or rewrite -- towards a trending post before generating a personalized response conditioned on the predicted action. 
We benchmark GPT-4o-mini, O1-mini, and DeepSeek-R1 in social media engagement simulation regarding a major societal event discussed on X. Our findings reveal that zero-shot LLMs underperform BERT in action prediction, while few-shot prompting initially degrades the prediction accuracy of LLMs with limited examples. However, in response generation, few-shot LLMs achieve stronger semantic alignment with ground truth posts. </p> </div> </dd> <dt> <a name='item201'>[201]</a> <a href ="/abs/2502.12082" title="Abstract" id="2502.12082"> arXiv:2502.12082 </a> [<a href="/pdf/2502.12082" title="Download PDF" id="pdf-2502.12082" aria-labelledby="pdf-2502.12082">pdf</a>, <a href="https://arxiv.org/html/2502.12082v1" title="View HTML" id="html-2502.12082" aria-labelledby="html-2502.12082" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12082" title="Other formats" id="oth-2502.12082" aria-labelledby="oth-2502.12082">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AdaSplash: Adaptive Sparse Flash Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gon%C3%A7alves,+N">Nuno Gonçalves</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Treviso,+M">Marcos Treviso</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+A+F+T">André F. T. Martins</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The computational cost of softmax-based attention in transformers limits their applicability to long-context tasks. Adaptive sparsity, of which $\alpha$-entmax attention is an example, offers a flexible data-dependent alternative, but existing implementations are inefficient and do not leverage the sparsity to obtain runtime and memory gains. 
In this work, we propose AdaSplash, which combines the efficiency of GPU-optimized algorithms with the sparsity benefits of $\alpha$-entmax. We first introduce a hybrid Halley-bisection algorithm, resulting in a 7-fold reduction in the number of iterations needed to compute the $\alpha$-entmax transformation. Then, we implement custom Triton kernels to efficiently handle adaptive sparsity. Experiments with RoBERTa and ModernBERT for text classification and single-vector retrieval, along with GPT-2 for language modeling, show that our method achieves substantial improvements in runtime and memory efficiency compared to existing $\alpha$-entmax implementations. It approaches -- and in some cases surpasses -- the efficiency of highly optimized softmax implementations like FlashAttention-2, enabling long-context training while maintaining strong task performance. </p> </div> </dd> <dt> <a name='item202'>[202]</a> <a href ="/abs/2502.12084" title="Abstract" id="2502.12084"> arXiv:2502.12084 </a> [<a href="/pdf/2502.12084" title="Download PDF" id="pdf-2502.12084" aria-labelledby="pdf-2502.12084">pdf</a>, <a href="https://arxiv.org/html/2502.12084v1" title="View HTML" id="html-2502.12084" aria-labelledby="html-2502.12084" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12084" title="Other formats" id="oth-2502.12084" aria-labelledby="oth-2502.12084">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLM$^2$-Bench: A Closer Look at How Well VLMs Implicitly Link Explicit Matching Visual Cues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jianshu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+D">Dongyu Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pi,+R">Renjie Pi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+P+P">Paul Pu Liang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=R.,+Y">Yi R.</a> (May)<a href="https://arxiv.org/search/cs?searchtype=author&query=Fung">Fung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Page: <a href="https://vlm2-bench.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Visually linking matching cues is a crucial ability in daily life, such as identifying the same person in multiple photos based on their cues, even without knowing who they are. Despite the extensive knowledge that vision-language models (VLMs) possess, it remains largely unexplored whether they are capable of performing this fundamental task. To address this, we introduce VLM$^2$-Bench, a benchmark designed to assess whether VLMs can Visually Link Matching cues, with 9 subtasks and over 3,000 test cases. Comprehensive evaluation across eight open-source VLMs and GPT-4o, along with further analysis of various language-side and vision-side prompting methods, leads to a total of eight key findings. We identify critical challenges in models' ability to link visual cues, highlighting a significant performance gap where even GPT-4o lags 34.80% behind humans. Based on these insights, we advocate for (i) enhancing core visual capabilities to improve adaptability and reduce reliance on prior knowledge, (ii) establishing clearer principles for integrating language-based reasoning in vision-centric tasks to prevent unnecessary biases, and (iii) shifting vision-text training paradigms toward fostering models' ability to independently structure and infer relationships among visual cues. 
</p> </div> </dd> <dt> <a name='item203'>[203]</a> <a href ="/abs/2502.12109" title="Abstract" id="2502.12109"> arXiv:2502.12109 </a> [<a href="/pdf/2502.12109" title="Download PDF" id="pdf-2502.12109" aria-labelledby="pdf-2502.12109">pdf</a>, <a href="https://arxiv.org/html/2502.12109v1" title="View HTML" id="html-2502.12109" aria-labelledby="html-2502.12109" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12109" title="Other formats" id="oth-2502.12109" aria-labelledby="oth-2502.12109">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Personality Structured Interview for Large Language Model Simulation in Personality Research </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengda Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+H">Huiqi Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianjun Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Ziang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oswald,+F+L">Frederick L. Oswald</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 41 Pages, 30 Tables, 5 Figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although psychometrics researchers have recently explored the use of large language models (LLMs) as proxies for human participants, LLMs often fail to generate heterogeneous data with human-like diversity, which diminishes their value in advancing social science research. 
To address these challenges, we explored the potential of the theory-informed Personality Structured Interview (PSI) as a tool for simulating human responses in personality research. In this approach, the simulation is grounded in nuanced real-human interview transcripts that target the personality construct of interest. We have provided a growing set of 357 structured interview transcripts from a representative sample, each containing an individual's response to 32 open-ended questions carefully designed to gather theory-based personality evidence. Additionally, grounded in psychometric research, we have summarized an evaluation framework to systematically validate LLM-generated psychometric data. Results from three experiments demonstrate that well-designed structured interviews could improve human-like heterogeneity in LLM-simulated personality data and predict personality-related behavioral outcomes (i.e., organizational citizenship behaviors and counterproductive work behavior). We further discuss the role of theory-informed structured interviews in LLM-based simulation and outline a general framework for designing structured interviews to simulate human-like data for psychometric research. 
</p> </div> </dd> <dt> <a name='item204'>[204]</a> <a href ="/abs/2502.12110" title="Abstract" id="2502.12110"> arXiv:2502.12110 </a> [<a href="/pdf/2502.12110" title="Download PDF" id="pdf-2502.12110" aria-labelledby="pdf-2502.12110">pdf</a>, <a href="https://arxiv.org/html/2502.12110v1" title="View HTML" id="html-2502.12110" aria-labelledby="html-2502.12110" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12110" title="Other formats" id="oth-2502.12110" aria-labelledby="oth-2502.12110">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A-MEM: Agentic Memory for LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wujiang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zujie Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mei,+K">Kai Mei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Hang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Juntao Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> While large language model (LLM) agents can effectively use external tools for complex real-world tasks, they require memory systems to leverage historical experiences. Current memory systems enable basic storage and retrieval but lack sophisticated memory organization, despite recent attempts to incorporate graph databases. Moreover, these systems' fixed operations and structures limit their adaptability across diverse tasks. 
To address this limitation, this paper proposes a novel agentic memory system for LLM agents that can dynamically organize memories in an agentic way. Following the basic principles of the Zettelkasten method, we designed our memory system to create interconnected knowledge networks through dynamic indexing and linking. When a new memory is added, we generate a comprehensive note containing multiple structured attributes, including contextual descriptions, keywords, and tags. The system then analyzes historical memories to identify relevant connections, establishing links where meaningful similarities exist. Additionally, this process enables memory evolution - as new memories are integrated, they can trigger updates to the contextual representations and attributes of existing historical memories, allowing the memory network to continuously refine its understanding. Our approach combines the structured organization principles of Zettelkasten with the flexibility of agent-driven decision making, allowing for more adaptive and context-aware memory management. Empirical experiments on six foundation models show superior improvement against existing SOTA baselines. The source code is available at <a href="https://github.com/WujiangXu/AgenticMemory" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item205'>[205]</a> <a href ="/abs/2502.12123" title="Abstract" id="2502.12123"> arXiv:2502.12123 </a> [<a href="/pdf/2502.12123" title="Download PDF" id="pdf-2502.12123" aria-labelledby="pdf-2502.12123">pdf</a>, <a href="https://arxiv.org/html/2502.12123v1" title="View HTML" id="html-2502.12123" aria-labelledby="html-2502.12123" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12123" title="Other formats" id="oth-2502.12123" aria-labelledby="oth-2502.12123">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Query Complexity of Verifier-Assisted Language Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Botta,+E">Edoardo Botta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuchen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehta,+A">Aashay Mehta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ash,+J+T">Jordan T. Ash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Cyril Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Risteski,+A">Andrej Risteski</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Recently, a plethora of works have proposed inference-time algorithms (e.g. best-of-n), which incorporate verifiers to assist the generation process. Their quality-efficiency trade-offs have been empirically benchmarked on a variety of constrained generation tasks, but the algorithmic design landscape is still largely poorly understood. 
In this paper, we develop a mathematical framework for reasoning about constrained generation using a pre-trained language model generator oracle and a process verifier--which can decide whether a prefix can be extended to a string which satisfies the constraints of choice. We show that even in very simple settings, access to a verifier can render an intractable problem (information-theoretically or computationally) to a tractable one. In fact, we show even simple algorithms, like tokenwise rejection sampling, can enjoy significant benefits from access to a verifier. Empirically, we show that a natural modification of tokenwise rejection sampling, in which the sampler is allowed to "backtrack" (i.e., erase the final few generated tokens) has robust and substantive benefits over natural baselines (e.g. (blockwise) rejection sampling, nucleus sampling)--both in terms of computational efficiency, accuracy and diversity. </p> </div> </dd> <dt> <a name='item206'>[206]</a> <a href ="/abs/2502.12124" title="Abstract" id="2502.12124"> arXiv:2502.12124 </a> [<a href="/pdf/2502.12124" title="Download PDF" id="pdf-2502.12124" aria-labelledby="pdf-2502.12124">pdf</a>, <a href="https://arxiv.org/html/2502.12124v1" title="View HTML" id="html-2502.12124" aria-labelledby="html-2502.12124" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12124" title="Other formats" id="oth-2502.12124" aria-labelledby="oth-2502.12124">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RA-MTR: A Retrieval Augmented Multi-Task Reader based Approach for Inspirational Quote Extraction from Long Documents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at 
COLING2025-MAIN </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> https://aclanthology.org/2025.coling-main.365/ </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Inspirational quotes from famous individuals are often used to convey thoughts in news articles, essays, and everyday conversations. In this paper, we propose a novel context-based quote extraction system that aims to extract the most relevant quote from a long text. We formulate this quote extraction as an open domain question answering problem first by employing a vector-store based retriever and then applying a multi-task reader. We curate three context-based quote extraction datasets and introduce a novel multi-task framework RA-MTR that improves the state-of-the-art performance, achieving a maximum improvement of 5.08% in BoW F1-score. </p> </div> </dd> <dt> <a name='item207'>[207]</a> <a href ="/abs/2502.12134" title="Abstract" id="2502.12134"> arXiv:2502.12134 </a> [<a href="/pdf/2502.12134" title="Download PDF" id="pdf-2502.12134" aria-labelledby="pdf-2502.12134">pdf</a>, <a href="https://arxiv.org/html/2502.12134v1" title="View HTML" id="html-2502.12134" aria-labelledby="html-2502.12134" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12134" title="Other formats" id="oth-2502.12134" aria-labelledby="oth-2502.12134">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SoftCoT: Soft Chain-of-Thought for Efficient Reasoning with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yige Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+X">Xu Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Zhiwei Zeng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Chunyan Miao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Chain-of-Thought (CoT) reasoning enables Large Language Models (LLMs) to solve complex reasoning tasks by generating intermediate reasoning steps. However, most existing approaches focus on hard token decoding, which constrains reasoning within the discrete vocabulary space and may not always be optimal. While recent efforts explore continuous-space reasoning, they often suffer from catastrophic forgetting, limiting their applicability to state-of-the-art LLMs that already perform well in zero-shot settings with a proper instruction. To address this challenge, we propose a novel approach for continuous-space reasoning that does not require modifying the underlying LLM. Specifically, we employ a lightweight assistant model to generate instance-specific soft thought tokens speculatively as the initial chain of thoughts, which are then mapped into the LLM's representation space via a projection module. Experimental results on five reasoning benchmarks demonstrate that our method enhances LLM reasoning performance through supervised, parameter-efficient fine-tuning. 
</p> </div> </dd> <dt> <a name='item208'>[208]</a> <a href ="/abs/2502.12137" title="Abstract" id="2502.12137"> arXiv:2502.12137 </a> [<a href="/pdf/2502.12137" title="Download PDF" id="pdf-2502.12137" aria-labelledby="pdf-2502.12137">pdf</a>, <a href="https://arxiv.org/html/2502.12137v1" title="View HTML" id="html-2502.12137" aria-labelledby="html-2502.12137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12137" title="Other formats" id="oth-2502.12137" aria-labelledby="oth-2502.12137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> REVERSUM: A Multi-staged Retrieval-Augmented Generation Method to Enhance Wikipedia Tail Biographies through Personal Narratives </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meher,+P+M">Pauras Mangesh Meher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Das,+P">Paramita Das</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at COLING2025 Industry Track </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> https://aclanthology.org/2025.coling-industry.61/ </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Wikipedia is an invaluable resource for factual information about a wide range of entities. However, the quality of articles on less-known entities often lags behind that of the well-known ones. This study proposes a novel approach to enhancing Wikipedia's B and C category biography articles by leveraging personal narratives such as autobiographies and biographies. 
By utilizing a multi-staged retrieval-augmented generation technique -- REVerSum -- we aim to enrich the informational content of these lesser-known articles. Our study reveals that personal narratives can significantly improve the quality of Wikipedia articles, providing a rich source of reliable information that has been underutilized in previous studies. Based on crowd-based evaluation, REVerSum generated content outperforms the best performing baseline by 17% in terms of integrability to the original Wikipedia article and 28.5% in terms of informativeness. Code and Data are available at: <a href="https://github.com/sayantan11995/wikipedia_enrichment" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item209'>[209]</a> <a href ="/abs/2502.12150" title="Abstract" id="2502.12150"> arXiv:2502.12150 </a> [<a href="/pdf/2502.12150" title="Download PDF" id="pdf-2502.12150" aria-labelledby="pdf-2502.12150">pdf</a>, <a href="https://arxiv.org/html/2502.12150v1" title="View HTML" id="html-2502.12150" aria-labelledby="html-2502.12150" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12150" title="Other formats" id="oth-2502.12150" aria-labelledby="oth-2502.12150">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Idiosyncrasies in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Mingjie Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yida Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhiqiu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolter,+J+Z">J. 
Zico Kolter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhuang Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Website at <a href="https://eric-mingjie.github.io/llm-idiosyncrasies/index.html" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this work, we unveil and study idiosyncrasies in Large Language Models (LLMs) -- unique patterns in their outputs that can be used to distinguish the models. To do so, we consider a simple classification task: given a particular text output, the objective is to predict the source LLM that generates the text. We evaluate this synthetic task across various groups of LLMs and find that simply fine-tuning existing text embedding models on LLM-generated texts yields excellent classification accuracy. Notably, we achieve 97.1% accuracy on held-out validation data in the five-way classification problem involving ChatGPT, Claude, Grok, Gemini, and DeepSeek. Our further investigation reveals that these idiosyncrasies are rooted in word-level distributions. These patterns persist even when the texts are rewritten, translated, or summarized by an external LLM, suggesting that they are also encoded in the semantic content. Additionally, we leverage LLM as judges to generate detailed, open-ended descriptions of each model's idiosyncrasies. Finally, we discuss the broader implications of our findings, particularly for training on synthetic data and inferring model similarity. Code is available at <a href="https://github.com/locuslab/llm-idiosyncrasies" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 66 of 66 entries)</h3> <dt> <a name='item210'>[210]</a> <a href ="/abs/2502.10394" title="Abstract" id="2502.10394"> arXiv:2502.10394 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10394" title="Download PDF" id="pdf-2502.10394" aria-labelledby="pdf-2502.10394">pdf</a>, <a href="/format/2502.10394" title="Other formats" id="oth-2502.10394" aria-labelledby="oth-2502.10394">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Coordination-based Approach for Focused Learning in Knowledge-Based Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+A">Abhishek Sharma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Recent progress in Learning by Reading and Machine Reading systems has significantly increased the capacity of knowledge-based systems to learn new facts. In this work, we discuss the problem of selecting a set of learning requests for these knowledge-based systems which would lead to maximum Q/A performance. To understand the dynamics of this problem, we simulate the properties of a learning strategy, which sends learning requests to an external knowledge source. We show that choosing an optimal set of facts for these learning systems is similar to a coordination game, and use reinforcement learning to solve this problem. Experiments show that such an approach can significantly improve Q/A performance. 
</p> </div> </dd> <dt> <a name='item211'>[211]</a> <a href ="/abs/2502.10411" title="Abstract" id="2502.10411"> arXiv:2502.10411 </a> (cross-list from cs.CY) [<a href="/pdf/2502.10411" title="Download PDF" id="pdf-2502.10411" aria-labelledby="pdf-2502.10411">pdf</a>, <a href="https://arxiv.org/html/2502.10411v1" title="View HTML" id="html-2502.10411" aria-labelledby="html-2502.10411" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10411" title="Other formats" id="oth-2502.10411" aria-labelledby="oth-2502.10411">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TrueReason: An Exemplar Personalised Learning System Integrating Reasoning with Foundational Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bulathwela,+S">Sahan Bulathwela</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Van+Niekerk,+D">Daniel Van Niekerk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shipton,+J">Jarrod Shipton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Perez-Ortiz,+M">Maria Perez-Ortiz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rosman,+B">Benjamin Rosman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shawe-Taylor,+J">John Shawe-Taylor</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To be published as a book chapter </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Information Retrieval (cs.IR); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Personalised education is one of the domains that can greatly benefit from the most recent advances in Artificial Intelligence (AI) and Large Language Models (LLM). 
However, it is also one of the most challenging applications due to the cognitive complexity of teaching effectively while personalising the learning experience to suit independent learners. We hypothesise that one promising approach to excelling in such demanding use cases is using a \emph{society of minds}. In this chapter, we present TrueReason, an exemplar personalised learning system that integrates a multitude of specialised AI models that can mimic micro skills that are composed together by a LLM to operationalise planning and reasoning. The architecture of the initial prototype is presented while describing two micro skills that have been incorporated in the prototype. The proposed system demonstrates the first step in building sophisticated AI systems that can take up very complex cognitive tasks that are demanded by domains such as education. </p> </div> </dd> <dt> <a name='item212'>[212]</a> <a href ="/abs/2502.10413" title="Abstract" id="2502.10413"> arXiv:2502.10413 </a> (cross-list from cs.CY) [<a href="/pdf/2502.10413" title="Download PDF" id="pdf-2502.10413" aria-labelledby="pdf-2502.10413">pdf</a>, <a href="/format/2502.10413" title="Other formats" id="oth-2502.10413" aria-labelledby="oth-2502.10413">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Machine Learning-Driven Convergence Analysis in Multijurisdictional Compliance Using BERT and K-Means Clustering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sonani,+R">Raj Sonani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prayas,+L">Lohalekar Prayas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 5 figures, 4 tables </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Aitoz Journal of AI Research 3 (2024) 126-141 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computers and Society (cs.CY)</span>; Artificial Intelligence (cs.AI); Computational Engineering, Finance, and Science (cs.CE); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> As digital data continues to grow, there has been a shift towards using effective regulatory mechanisms to safeguard personal information. The CCPA of California and the General Data Protection Regulation (GDPR) of the European Union are two of the most important privacy laws. The regulation is intended to safeguard consumer privacy, but it varies greatly in scope, definitions, and methods of enforcement. This paper presents a fresh approach to adaptive compliance, using machine learning and emphasizing natural language processing (NLP) as the primary focus of comparison between the GDPR and CCPA. Using NLP, this study compares various regulations to identify areas where they overlap or diverge. This includes the "right to be forgotten" provision in the GDPR and the "opt-out of sale" provision under CCPA. International companies can learn valuable lessons from this report, as it outlines strategies for better enforcement of laws across different nations. Additionally, the paper discusses the challenges of utilizing NLP in legal literature and proposes methods to enhance the model-ability of machine learning models for studying regulations. The study's objective is to "bridge the gap between legal knowledge and technical expertise" by developing regulatory compliance strategies that are more efficient in operation and more effective in data protection. 
</p> </div> </dd> <dt> <a name='item213'>[213]</a> <a href ="/abs/2502.10420" title="Abstract" id="2502.10420"> arXiv:2502.10420 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10420" title="Download PDF" id="pdf-2502.10420" aria-labelledby="pdf-2502.10420">pdf</a>, <a href="https://arxiv.org/html/2502.10420v1" title="View HTML" id="html-2502.10420" aria-labelledby="html-2502.10420" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10420" title="Other formats" id="oth-2502.10420" aria-labelledby="oth-2502.10420">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Position: Stop Acting Like Language Model Agents Are Normal Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Perrier,+E">Elija Perrier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bennett,+M+T">Michael Timothy Bennett</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Language Model Agents (LMAs) are increasingly treated as capable of autonomously navigating interactions with humans and tools. Their design and deployment tends to presume they are normal agents capable of sustaining coherent goals, adapting across contexts and acting with a measure of intentionality. These assumptions are critical to prospective use cases in industrial, social and governmental settings. But LMAs are not normal agents. They inherit the structural problems of the large language models (LLMs) around which they are built: hallucinations, jailbreaking, misalignment and unpredictability. 
In this Position paper we argue LMAs should not be treated as normal agents, because doing so leads to problems that undermine their utility and trustworthiness. We enumerate pathologies of agency intrinsic to LMAs. Despite scaffolding such as external memory and tools, they remain ontologically stateless, stochastic, semantically sensitive, and linguistically intermediated. These pathologies destabilise the ontological properties of LMAs including identifiability, continuity, persistence and consistency, problematising their claim to agency. In response, we argue LMA ontological properties should be measured before, during and after deployment so that the negative effects of pathologies can be mitigated. </p> </div> </dd> <dt> <a name='item214'>[214]</a> <a href ="/abs/2502.10440" title="Abstract" id="2502.10440"> arXiv:2502.10440 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10440" title="Download PDF" id="pdf-2502.10440" aria-labelledby="pdf-2502.10440">pdf</a>, <a href="https://arxiv.org/html/2502.10440v1" title="View HTML" id="html-2502.10440" aria-labelledby="html-2502.10440" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10440" title="Other formats" id="oth-2502.10440" aria-labelledby="oth-2502.10440">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Copyright Protection for Knowledge Bases of Retrieval-augmented Language Models via Ownership Verification with Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruibo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chenxi Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yanshuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally to this work. 19 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) are increasingly integrated into real-world applications through retrieval-augmented generation (RAG) mechanisms to supplement their responses with up-to-date and domain-specific knowledge. However, the valuable and often proprietary nature of the knowledge bases used in RAG introduces the risk of unauthorized usage by adversaries. Existing methods that can be generalized as watermarking techniques to protect these knowledge bases typically involve poisoning attacks. However, these methods require to alter the results of verification samples (\eg, generating incorrect outputs), inevitably making them susceptible to anomaly detection and even introduce new security risks. To address these challenges, we propose \name{} for `harmless' copyright protection of knowledge bases. Instead of manipulating LLM's final output, \name{} implants distinct verification behaviors in the space of chain-of-thought (CoT) reasoning, maintaining the correctness of the final answer. 
Our method has three main stages: (1) \textbf{Generating CoTs}: For each verification question, we generate two CoTs, including a target CoT for building watermark behaviors; (2) \textbf{Optimizing Watermark Phrases and Target CoTs}: We optimize them to minimize retrieval errors under the black-box setting of suspicious LLM, ensuring that the watermarked verification queries activate the target CoTs without being activated in non-watermarked ones; (3) \textbf{Ownership Verification}: We exploit a pairwise Wilcoxon test to statistically verify whether a suspicious LLM is augmented with the protected knowledge base by comparing its responses to watermarked and benign verification queries. Our experiments on diverse benchmarks demonstrate that \name{} effectively protects knowledge bases against unauthorized usage while preserving the integrity and performance of the RAG. </p> </div> </dd> <dt> <a name='item215'>[215]</a> <a href ="/abs/2502.10447" title="Abstract" id="2502.10447"> arXiv:2502.10447 </a> (cross-list from eess.AS) [<a href="/pdf/2502.10447" title="Download PDF" id="pdf-2502.10447" aria-labelledby="pdf-2502.10447">pdf</a>, <a href="https://arxiv.org/html/2502.10447v1" title="View HTML" id="html-2502.10447" aria-labelledby="html-2502.10447" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10447" title="Other formats" id="oth-2502.10447" aria-labelledby="oth-2502.10447">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoHAVE: Mixture of Hierarchical Audio-Visual Experts for Robust Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+S">Sungnyun Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jang,+K">Kangwook Jang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bae,+S">Sangmin Bae</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&query=Cho,+S">Sungwoo Cho</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yun,+S">Se-Young Yun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preliminary work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Audio-visual speech recognition (AVSR) has become critical for enhancing speech recognition in noisy environments by integrating both auditory and visual modalities. However, existing AVSR systems struggle to scale up without compromising computational efficiency. In this study, we introduce MoHAVE (Mixture of Hierarchical Audio-Visual Experts), a novel robust AVSR framework designed to address these scalability constraints. By leveraging a Mixture-of-Experts (MoE) architecture, MoHAVE activates modality-specific expert groups, ensuring dynamic adaptation to various audio-visual inputs with minimal computational overhead. Key contributions of MoHAVE include: (1) a sparse MoE framework that efficiently scales AVSR model capacity, (2) a hierarchical gating mechanism that dynamically utilizes the expert groups based on input context, enhancing adaptability and robustness, and (3) remarkable performance across robust AVSR benchmarks, including LRS3 and MuAViC transcription and translation tasks, setting a new standard for scalable speech recognition systems. 
</p> </div> </dd> <dt> <a name='item216'>[216]</a> <a href ="/abs/2502.10450" title="Abstract" id="2502.10450"> arXiv:2502.10450 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10450" title="Download PDF" id="pdf-2502.10450" aria-labelledby="pdf-2502.10450">pdf</a>, <a href="https://arxiv.org/html/2502.10450v1" title="View HTML" id="html-2502.10450" aria-labelledby="html-2502.10450" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10450" title="Other formats" id="oth-2502.10450" aria-labelledby="oth-2502.10450">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Trustworthy AI on Safety, Bias, and Privacy: A Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+X">Xingli Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jianwei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mulchandani,+V">Varun Mulchandani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jung-Eun Kim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> The capabilities of artificial intelligence systems have been advancing to a great extent, but these systems still struggle with failure modes, vulnerabilities, and biases. In this paper, we study the current state of the field, and present promising insights and perspectives regarding concerns that challenge the trustworthiness of AI models. In particular, this paper investigates the issues regarding three thrusts: safety, privacy, and bias, which hurt models' trustworthiness. For safety, we discuss safety alignment in the context of large language models, preventing them from generating toxic or harmful content. 
For bias, we focus on spurious biases that can mislead a network. Lastly, for privacy, we cover membership inference attacks in deep neural networks. The discussions addressed in this paper reflect our own experiments and observations. </p> </div> </dd> <dt> <a name='item217'>[217]</a> <a href ="/abs/2502.10453" title="Abstract" id="2502.10453"> arXiv:2502.10453 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10453" title="Download PDF" id="pdf-2502.10453" aria-labelledby="pdf-2502.10453">pdf</a>, <a href="https://arxiv.org/html/2502.10453v1" title="View HTML" id="html-2502.10453" aria-labelledby="html-2502.10453" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10453" title="Other formats" id="oth-2502.10453" aria-labelledby="oth-2502.10453">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Linking Cryptoasset Attribution Tags to Knowledge Graph Entities: An LLM-based Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Avice,+R">Régnier Avice</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haslhofer,+B">Bernhard Haslhofer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhidong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jianlong Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at Financial Cryptography and Data Security 2025 Conference (FC2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Databases (cs.DB); Machine Learning (cs.LG) </div> <p class='mathjax'> Attribution tags form the foundation of modern cryptoasset forensics. However, inconsistent or incorrect tags can mislead investigations and even result in false accusations. 
To address this issue, we propose a novel computational method based on Large Language Models (LLMs) to link attribution tags with well-defined knowledge graph concepts. We implemented this method in an end-to-end pipeline and conducted experiments showing that our approach outperforms baseline methods by up to 37.4% in F1-score across three publicly available attribution tag datasets. By integrating concept filtering and blocking procedures, we generate candidate sets containing five knowledge graph entities, achieving a recall of 93% without the need for labeled data. Additionally, we demonstrate that local LLM models can achieve F1-scores of 90%, comparable to remote models which achieve 94%. We also analyze the cost-performance trade-offs of various LLMs and prompt templates, showing that selecting the most cost-effective configuration can reduce costs by 90%, with only a 1% decrease in performance. Our method not only enhances attribution tag quality but also serves as a blueprint for fostering more reliable forensic evidence. </p> </div> </dd> <dt> <a name='item218'>[218]</a> <a href ="/abs/2502.10454" title="Abstract" id="2502.10454"> arXiv:2502.10454 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10454" title="Download PDF" id="pdf-2502.10454" aria-labelledby="pdf-2502.10454">pdf</a>, <a href="https://arxiv.org/html/2502.10454v1" title="View HTML" id="html-2502.10454" aria-labelledby="html-2502.10454" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10454" title="Other formats" id="oth-2502.10454" aria-labelledby="oth-2502.10454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> One Example Shown, Many Concepts Known! 
Counterexample-Driven Conceptual Reasoning in Mathematical LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuang,+J">Jiayi Kuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haojing Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhikun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xinnian Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+W">Wenlian Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangning Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+X">Xiaoyu Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Ying Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P+S">Philip S. Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Leveraging mathematical Large Language Models (LLMs) for proof generation is a fundamental topic in LLMs research. We argue that the ability of current LLMs to prove statements largely depends on whether they have encountered the relevant proof process during training. This reliance limits their deeper understanding of mathematical theorems and related concepts. 
Inspired by the pedagogical method of "proof by counterexamples" commonly used in human mathematics education, our work aims to enhance LLMs' ability to conduct mathematical reasoning and proof through counterexamples. Specifically, we manually create a high-quality, university-level mathematical benchmark, CounterMATH, which requires LLMs to prove mathematical statements by providing counterexamples, thereby assessing their grasp of mathematical concepts. Additionally, we develop a data engineering framework to automatically obtain training data for further model improvement. Extensive experiments and detailed analyses demonstrate that CounterMATH is challenging, indicating that LLMs, such as OpenAI o1, have insufficient counterexample-driven proof capabilities. Moreover, our exploration into model training reveals that strengthening LLMs' counterexample-driven conceptual reasoning abilities is crucial for improving their overall mathematical capabilities. We believe that our work offers new perspectives on the community of mathematical LLMs. </p> </div> </dd> <dt> <a name='item219'>[219]</a> <a href ="/abs/2502.10505" title="Abstract" id="2502.10505"> arXiv:2502.10505 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10505" title="Download PDF" id="pdf-2502.10505" aria-labelledby="pdf-2502.10505">pdf</a>, <a href="https://arxiv.org/html/2502.10505v1" title="View HTML" id="html-2502.10505" aria-labelledby="html-2502.10505" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10505" title="Other formats" id="oth-2502.10505" aria-labelledby="oth-2502.10505">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Preference learning made easy: Everything should be understood through win rate </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L+H">Lily H. 
Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ranganath,+R">Rajesh Ranganath</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Machine Learning (stat.ML) </div> <p class='mathjax'> Preference learning, or the task of aligning generative models to preference comparison data, has yet to reach the conceptual maturity of classification, density estimation, etc. To close this gap, this work presents a framework to understand preference learning starting from the sampling distribution of pairwise preference data. First, we prove that the only evaluation of a generative model that respects both preferences and prevalences in the data distribution is a form of win rate, justifying win rate as the focal point to understand preference learning. We then analyze preference learning methods as win rate optimization (WRO) or non-WRO. We present novel instances of WRO beyond existing examples (RLHF, NLHF) and identify two key theoretical benefits of all such methods. We prove that common non-WRO methods like DPO and SFT on preferred samples lack these properties and suggest ways to mitigate such theoretical limitations. We also show that WRO underperforms in practice due to optimization difficulties and that optimization success predicts performance better than choices which affect the objective's solution. Our analysis highlights best practices for existing methods and provides recommendations for future research, guided by the principle that one should either align non-WRO methods more closely with WRO or improve the optimization of WRO objectives. 
</p> </div> </dd> <dt> <a name='item220'>[220]</a> <a href ="/abs/2502.10563" title="Abstract" id="2502.10563"> arXiv:2502.10563 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10563" title="Download PDF" id="pdf-2502.10563" aria-labelledby="pdf-2502.10563">pdf</a>, <a href="https://arxiv.org/html/2502.10563v1" title="View HTML" id="html-2502.10563" aria-labelledby="html-2502.10563" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10563" title="Other formats" id="oth-2502.10563" aria-labelledby="oth-2502.10563">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Accelerating Unbiased LLM Evaluation via Synthetic Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhaoyi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuda Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zanette,+A">Andrea Zanette</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> When developing new large language models (LLMs), a key step is evaluating their final performance, often by computing the win-rate against a reference model based on external feedback. Human feedback is the gold standard, particularly for capturing nuanced qualities like coherence, readability, and alignment with human expectations. However, human evaluations are costly -- even for large tech companies -- and when conducted with active users, they may negatively impact user experience. A promising alternative is synthetic feedback, where evaluations are conducted by other large language models, including reward models. While this eliminates the need for costly human annotations, it introduces biases that may distort the evaluation process. 
In this work, we propose a statistically principled framework that integrates human and synthetic feedback to reduce reliance on human annotations while maintaining unbiased win-rate calculations. Our experiments demonstrate a reduction in human annotations by up to 12.2% with an off-the-shelf synthetic evaluator and up to 24.8% with a finetuned variant. Apart from being generalizable, scalable, and free of hyper-parameter tuning, our method offers predictable annotation savings, which can be estimated based on data-dependent characteristics. </p> </div> </dd> <dt> <a name='item221'>[221]</a> <a href ="/abs/2502.10673" title="Abstract" id="2502.10673"> arXiv:2502.10673 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10673" title="Download PDF" id="pdf-2502.10673" aria-labelledby="pdf-2502.10673">pdf</a>, <a href="https://arxiv.org/html/2502.10673v1" title="View HTML" id="html-2502.10673" aria-labelledby="html-2502.10673" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10673" title="Other formats" id="oth-2502.10673" aria-labelledby="oth-2502.10673">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dataset Protection via Watermarked Canaries in Retrieval-Augmented LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yepeng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xuandong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Dawn Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bu,+Y">Yuheng Bu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) has become an effective method for enhancing large language models (LLMs) with up-to-date knowledge. 
However, it poses a significant risk of IP infringement, as IP datasets may be incorporated into the knowledge database by malicious Retrieval-Augmented LLMs (RA-LLMs) without authorization. To protect the rights of the dataset owner, an effective dataset membership inference algorithm for RA-LLMs is needed. In this work, we introduce a novel approach to safeguard the ownership of text datasets and effectively detect unauthorized use by the RA-LLMs. Our approach preserves the original data completely unchanged while protecting it by inserting specifically designed canary documents into the IP dataset. These canary documents are created with synthetic content and embedded watermarks to ensure uniqueness, stealthiness, and statistical provability. During the detection process, unauthorized usage is identified by querying the canary documents and analyzing the responses of RA-LLMs for statistical evidence of the embedded watermark. Our experimental results demonstrate high query efficiency, detectability, and stealthiness, along with minimal perturbation to the original dataset, all without compromising the performance of the RAG system. 
</p> </div> </dd> <dt> <a name='item222'>[222]</a> <a href ="/abs/2502.10762" title="Abstract" id="2502.10762"> arXiv:2502.10762 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10762" title="Download PDF" id="pdf-2502.10762" aria-labelledby="pdf-2502.10762">pdf</a>, <a href="https://arxiv.org/html/2502.10762v1" title="View HTML" id="html-2502.10762" aria-labelledby="html-2502.10762" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10762" title="Other formats" id="oth-2502.10762" aria-labelledby="oth-2502.10762">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bone Soups: A Seek-and-Soup Model Merging Approach for Controllable Multi-Objective Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+G">Guofu Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+T">Ting Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yunsheng Shi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> User information needs are often highly diverse and varied. A key challenge in current research is how to achieve controllable multi-objective generation while enabling rapid adaptation to accommodate diverse user demands during test time. Existing solutions, such as Rewarded Soup, focus on merging language models individually tuned on single objectives. While easy to implement and widely used, these approaches face limitations in achieving optimal performance due to their disregard for the impacts of competing objectives on model tuning. 
To address this issue, we propose Bone Soup, a novel model merging approach that first seeks a series of backbone models by considering the impacts of multiple objectives and then makes the soup (i.e., merge the backbone models). Specifically, Bone Soup begins by training multiple backbone models for different objectives using multi-objective reinforcement learning. Each backbone model is guided by a combination of backbone reward signals. To ensure that these models are optimal for the Pareto front, the backbone rewards are crafted by combining standard reward functions into basis vectors, which can then be modified through a rule-based construction method. Bone Soup leverages a symmetric circulant matrix mapping to generate the merging coefficients, which are used to merge the backbone models according to user preferences. Extensive experimental results demonstrate that Bone Soup exhibits strong controllability and Pareto optimality in controllable multi-objective generation, providing a more effective and efficient approach to addressing diverse user needs at test time. 
</p> </div> </dd> <dt> <a name='item223'>[223]</a> <a href ="/abs/2502.10768" title="Abstract" id="2502.10768"> arXiv:2502.10768 </a> (cross-list from cs.IR) [<a href="/pdf/2502.10768" title="Download PDF" id="pdf-2502.10768" aria-labelledby="pdf-2502.10768">pdf</a>, <a href="/format/2502.10768" title="Other formats" id="oth-2502.10768" aria-labelledby="oth-2502.10768">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating improvements on using Large Language Models (LLMs) for property extraction in the Open Research Knowledge Graph (ORKG) </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schaftner,+S">Sandra Schaftner</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Current research highlights the great potential of Large Language Models (LLMs) for constructing Scholarly Knowledge Graphs (SKGs). One particularly complex step in this process is relation extraction, aimed at identifying suitable properties to describe the content of research. This study builds directly on previous research by three Open Research Knowledge Graph (ORKG) team members who assessed the readiness of LLMs such as GPT-3.5, Llama 2, and Mistral for property extraction in scientific literature. Given the moderate performance observed, the previous work concluded that fine-tuning is needed to improve these models' alignment with scientific tasks and their emulation of human expertise. Expanding on this prior experiment, this study evaluates the impact of advanced prompt engineering techniques and demonstrates that these techniques can significantly enhance the results. 
Additionally, this study extends the property extraction process to include property matching to existing ORKG properties, which are retrieved via the API. The evaluation reveals that results generated through advanced prompt engineering achieve a higher proportion of matches with ORKG properties, further emphasizing the enhanced alignment achieved. Moreover, this lays the groundwork for addressing challenges such as the inconsistency of ORKG properties, an issue highlighted in prior studies. By assigning unique URIs and using standardized terminology, this work increases the consistency of the properties, fulfilling a crucial aspect of Linked Data and FAIR principles - core commitments of ORKG. This, in turn, significantly enhances the applicability of ORKG content for subsequent tasks such as comparisons of research publications. Finally, the study concludes with recommendations for future improvements in the overall property extraction process. </p> </div> </dd> <dt> <a name='item224'>[224]</a> <a href ="/abs/2502.10858" title="Abstract" id="2502.10858"> arXiv:2502.10858 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10858" title="Download PDF" id="pdf-2502.10858" aria-labelledby="pdf-2502.10858">pdf</a>, <a href="https://arxiv.org/html/2502.10858v1" title="View HTML" id="html-2502.10858" aria-labelledby="html-2502.10858" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10858" title="Other formats" id="oth-2502.10858" aria-labelledby="oth-2502.10858">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Depth All You Need? 
An Exploration of Iterative Reasoning in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zongqian Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tianyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiaying Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+M">Mengmeng Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xiaofeng Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+L">Lei Feng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Deep iterative chain-of-thought (CoT) reasoning enables LLMs to tackle complex tasks by progressively activating relevant pre-trained knowledge. However, it faces challenges in ensuring continual improvement and determining a stopping criterion. In this paper, we investigate whether the relevant knowledge that contributes directly to solving the given question can be activated from the initial reasoning path, thus circumventing the need for iterative refinement. Our experiments reveal that increasing the diversity of initial reasoning paths can achieve comparable or superior performance, a concept we term \textit{breadth reasoning}. However, existing breadth reasoning approaches, such as self-consistency, offer limited diversity. To address this limitation, we propose a simple yet effective method that enhances reasoning breadth by integrating contextual exploration with reduced sampling randomness. Extensive experiments demonstrate that our approach significantly outperforms deep iterative reasoning. 
Our code is provided in <a href="https://github.com/zongqianwu/breadth" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item225'>[225]</a> <a href ="/abs/2502.10867" title="Abstract" id="2502.10867"> arXiv:2502.10867 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10867" title="Download PDF" id="pdf-2502.10867" aria-labelledby="pdf-2502.10867">pdf</a>, <a href="https://arxiv.org/html/2502.10867v1" title="View HTML" id="html-2502.10867" aria-labelledby="html-2502.10867" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10867" title="Other formats" id="oth-2502.10867" aria-labelledby="oth-2502.10867">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Tutorial on LLM Reasoning: Relevant Methods behind ChatGPT o1 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> OpenAI o1 has shown that applying reinforcement learning to integrate reasoning steps directly during inference can significantly improve a model's reasoning capabilities. This result is exciting as the field transitions from the conventional autoregressive method of generating answers to a more deliberate approach that models the slow-thinking process through step-by-step reasoning training. Reinforcement learning plays a key role in both the model's training and decoding processes. In this article, we present a comprehensive formulation of reasoning problems and investigate the use of both model-based and model-free approaches to better support this slow-thinking framework. 
</p> </div> </dd> <dt> <a name='item226'>[226]</a> <a href ="/abs/2502.10928" title="Abstract" id="2502.10928"> arXiv:2502.10928 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10928" title="Download PDF" id="pdf-2502.10928" aria-labelledby="pdf-2502.10928">pdf</a>, <a href="/format/2502.10928" title="Other formats" id="oth-2502.10928" aria-labelledby="oth-2502.10928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Semantic Specialization in MoE Appears with Scale: A Study of DeepSeek R1 Expert Specialization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Olson,+M+L">Matthew Lyle Olson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ratzlaff,+N">Neale Ratzlaff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hinck,+M">Musashi Hinck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+M">Man Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+S">Sungduk Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+C">Chendi Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lal,+V">Vasudev Lal</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> DeepSeek-R1, the largest open-source Mixture-of-Experts (MoE) model, has demonstrated reasoning capabilities comparable to proprietary frontier models. Prior research has explored expert routing in MoE models, but findings suggest that expert selection is often token-dependent rather than semantically driven. Given DeepSeek-R1's enhanced reasoning abilities, we investigate whether its routing mechanism exhibits greater semantic specialization than previous MoE models. 
To explore this, we conduct two key experiments: (1) a word sense disambiguation task, where we examine expert activation patterns for words with differing senses, and (2) a cognitive reasoning analysis, where we assess DeepSeek-R1's structured thought process in an interactive task setting of DiscoveryWorld. We conclude that DeepSeek-R1's routing mechanism is more semantically aware and it engages in structured cognitive processes. </p> </div> </dd> <dt> <a name='item227'>[227]</a> <a href ="/abs/2502.10937" title="Abstract" id="2502.10937"> arXiv:2502.10937 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10937" title="Download PDF" id="pdf-2502.10937" aria-labelledby="pdf-2502.10937">pdf</a>, <a href="/format/2502.10937" title="Other formats" id="oth-2502.10937" aria-labelledby="oth-2502.10937">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SCALE: Towards Collaborative Content Analysis in Social Science with Large Language Model Agents and Human Intervention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+C">Chengshuai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Z">Zhen Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+C">Chau-Wai Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xinyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianlong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huan Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Content analysis breaks down complex and unstructured texts into theory-informed numerical categories. 
Particularly, in social science, this process usually relies on multiple rounds of manual annotation, domain expert discussion, and rule-based refinement. In this paper, we introduce SCALE, a novel multi-agent framework that effectively $\underline{\textbf{S}}$imulates $\underline{\textbf{C}}$ontent $\underline{\textbf{A}}$nalysis via $\underline{\textbf{L}}$arge language model (LLM) ag$\underline{\textbf{E}}$nts. SCALE imitates key phases of content analysis, including text coding, collaborative discussion, and dynamic codebook evolution, capturing the reflective depth and adaptive discussions of human researchers. Furthermore, by integrating diverse modes of human intervention, SCALE is augmented with expert input to further enhance its performance. Extensive evaluations on real-world datasets demonstrate that SCALE achieves human-approximated performance across various complex content analysis tasks, offering an innovative potential for future social science research. </p> </div> </dd> <dt> <a name='item228'>[228]</a> <a href ="/abs/2502.10976" title="Abstract" id="2502.10976"> arXiv:2502.10976 </a> (cross-list from cs.IR) [<a href="/pdf/2502.10976" title="Download PDF" id="pdf-2502.10976" aria-labelledby="pdf-2502.10976">pdf</a>, <a href="https://arxiv.org/html/2502.10976v1" title="View HTML" id="html-2502.10976" aria-labelledby="html-2502.10976" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10976" title="Other formats" id="oth-2502.10976" aria-labelledby="oth-2502.10976">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QuOTE: Question-Oriented Text Embeddings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Neeser,+A">Andrew Neeser</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Latimer,+K">Kaylen Latimer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khatri,+A">Aadyant Khatri</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Latimer,+C">Chris Latimer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+N">Naren Ramakrishnan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> We present QuOTE (Question-Oriented Text Embeddings), a novel enhancement to retrieval-augmented generation (RAG) systems, aimed at improving document representation for accurate and nuanced retrieval. Unlike traditional RAG pipelines, which rely on embedding raw text chunks, QuOTE augments chunks with hypothetical questions that the chunk can potentially answer, enriching the representation space. This better aligns document embeddings with user query semantics, and helps address issues such as ambiguity and context-dependent relevance. Through extensive experiments across diverse benchmarks, we demonstrate that QuOTE significantly enhances retrieval accuracy, including in multi-hop question-answering tasks. Our findings highlight the versatility of question generation as a fundamental indexing strategy, opening new avenues for integrating question generation into retrieval-based AI pipelines. 
</p> </div> </dd> <dt> <a name='item229'>[229]</a> <a href ="/abs/2502.10999" title="Abstract" id="2502.10999"> arXiv:2502.10999 </a> (cross-list from cs.CV) [<a href="/pdf/2502.10999" title="Download PDF" id="pdf-2502.10999" aria-labelledby="pdf-2502.10999">pdf</a>, <a href="https://arxiv.org/html/2502.10999v1" title="View HTML" id="html-2502.10999" aria-labelledby="html-2502.10999" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10999" title="Other formats" id="oth-2502.10999" aria-labelledby="oth-2502.10999">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ControlText: Unlocking Controllable Fonts in Multilingual Text Rendering without Font Annotations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+B">Bowen Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xinyi Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhuoqun Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+A">Alyson Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yaojie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+W">Wenyu Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ungar,+L">Lyle Ungar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taylor,+C+J">Camillo J. 
Taylor</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is preliminary work and code will be released at <a href="http://github.com/bowen-upenn/ControlText" rel="external noopener nofollow" class="link-external link-http">this http URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM) </div> <p class='mathjax'> This work demonstrates that diffusion models can achieve font-controllable multilingual text rendering using just raw images without font label annotations. Visual text rendering remains a significant challenge. While recent methods condition diffusion on glyphs, it is impossible to retrieve exact font annotations from large-scale, real-world datasets, which prevents user-specified font control. To address this, we propose a data-driven solution that integrates the conditional diffusion model with a text segmentation model, utilizing segmentation masks to capture and represent fonts in pixel space in a self-supervised manner, thereby eliminating the need for any ground-truth labels and enabling users to customize text rendering with any multilingual font of their choice. The experiment provides a proof of concept of our algorithm in zero-shot text and font editing across diverse fonts and languages, providing valuable insights for the community and industry toward achieving generalized visual text rendering. 
</p> </div> </dd> <dt> <a name='item230'>[230]</a> <a href ="/abs/2502.11021" title="Abstract" id="2502.11021"> arXiv:2502.11021 </a> (cross-list from cs.NI) [<a href="/pdf/2502.11021" title="Download PDF" id="pdf-2502.11021" aria-labelledby="pdf-2502.11021">pdf</a>, <a href="https://arxiv.org/html/2502.11021v1" title="View HTML" id="html-2502.11021" aria-labelledby="html-2502.11021" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11021" title="Other formats" id="oth-2502.11021" aria-labelledby="oth-2502.11021">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Uncertainty Estimation for Efficient LLM Routing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tuo Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehradfar,+A">Asal Mehradfar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dimitriadis,+D">Dimitrios Dimitriadis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Avestimehr,+S">Salman Avestimehr</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Networking and Internet Architecture (cs.NI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Deploying large language models (LLMs) in edge-cloud environments requires an efficient routing strategy to balance cost and response quality. Traditional approaches prioritize either human-preference data or accuracy metrics from benchmark datasets as routing criteria, but these methods suffer from rigidity and subjectivity. Moreover, existing routing frameworks primarily focus on accuracy and cost, neglecting response quality from a human preference perspective. In this work, we propose the Confidence-Driven LLM Router, a novel framework that leverages uncertainty estimation to optimize routing decisions. 
To comprehensively assess routing performance, we evaluate both system cost efficiency and response quality. In particular, we introduce the novel use of LLM-as-a-Judge to simulate human rating preferences, providing the first systematic assessment of response quality across different routing strategies. Extensive experiments on MT-Bench, GSM8K, and MMLU demonstrate that our approach outperforms state-of-the-art routing methods, achieving superior response quality while maintaining cost efficiency. </p> </div> </dd> <dt> <a name='item231'>[231]</a> <a href ="/abs/2502.11026" title="Abstract" id="2502.11026"> arXiv:2502.11026 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11026" title="Download PDF" id="pdf-2502.11026" aria-labelledby="pdf-2502.11026">pdf</a>, <a href="https://arxiv.org/html/2502.11026v1" title="View HTML" id="html-2502.11026" aria-labelledby="html-2502.11026" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11026" title="Other formats" id="oth-2502.11026" aria-labelledby="oth-2502.11026">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Simplify RLHF as Reward-Weighted SFT: A Variational Method </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yuhao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+P">Pengyu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhihong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yuejiao Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiang Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial 
Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Reinforcement Learning from Human Feedback (RLHF) is crucial for aligning Large Language Models (LLMs) with human values. However, RLHF has been continuously challenged by its high complexity in implementation and computation consumption. Even with recent simplifications, such as Direct Preference Optimization (DPO) and Advantage Leftover Lunch (A-LoL), the problems of over-fitting and training instability remain hindering the alignment process from the expected optimal performance. To address the existing challenges, we propose a novel simplification of RLHF from the perspective of variational inference, called $\textbf{V}$ariational $\textbf{A}$lignment with $\textbf{R}$e-weighting ($\textbf{VAR}$). More specifically, by directly minimizing the distribution gap between the learning LLM policy and the optimal solution of RLHF, we transform the alignment objective into a reward-driven re-weighted supervised fine-tuning (SFT) form, which only requires minor adjustment on the SFT loss to obtain noticeable improvement on training stability and effectiveness. On comprehensive alignment and generation benchmarks, our VAR method has numerically achieved competitive performance in LLM alignment helpfulness and harmlessness. 
</p> </div> </dd> <dt> <a name='item232'>[232]</a> <a href ="/abs/2502.11096" title="Abstract" id="2502.11096"> arXiv:2502.11096 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11096" title="Download PDF" id="pdf-2502.11096" aria-labelledby="pdf-2502.11096">pdf</a>, <a href="https://arxiv.org/html/2502.11096v1" title="View HTML" id="html-2502.11096" aria-labelledby="html-2502.11096" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11096" title="Other formats" id="oth-2502.11096" aria-labelledby="oth-2502.11096">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixture of Tunable Experts - Behavior Modification of DeepSeek-R1 at Inference Time </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dahlke,+R">Robert Dahlke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klagges,+H">Henrik Klagges</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zecha,+D">Dan Zecha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Merkel,+B">Benjamin Merkel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rohr,+S">Sven Rohr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klemm,+F">Fabian Klemm</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We present the Mixture-of-Tunable-Experts (MoTE), a method that extends the Mixture-of-Experts architecture of Large Language Models (LLMs). Without additional training, MoTE enables meaningful and focused behavior changes in LLMs on-the-fly during inference time. 
<br>By analyzing the digital LLM brain of DeepSeek-R1 using a technique we dub 'functional Token Resonance Imaging' (fTRI) - inspired by fMRI and using prompts designed to elicit specific behavior (e.g., 'What happened {time}{place}?') - we empirically identify distinctive experts associated with behaviors like refusal responses. <br>Using MoTE we are able to intervene and control such specific behavior. We switched off the top 10 most refusal-relevant experts (0.07% of R1's 14,848 routed experts), achieving a 52% refusal reduction on sensitive reference prompts without performance degradation on MT-Bench. Random expert deactivation resulted in smaller behavioral shifts with increased noise, whereas forced expert activation led to significantly higher refusal rates. <br>Our approach shares similarities with sparse autoencoders (SAEs) in terms of explainability and steerability. Unlike SAEs, MoTE does not require large training efforts, as within MoEs with a vast number of experts, specialization already emerged naturally during pretraining. <br>Our findings suggest that significant functional mechanisms in Mixture-of-Experts architectures can at least partially be localized in a small number of specific experts, rather than being distributed throughout the model's weights. Expert subgroups can be tuned to trigger significant behavior variations, providing insights into the inner workings of LLMs. 
</p> </div> </dd> <dt> <a name='item233'>[233]</a> <a href ="/abs/2502.11140" title="Abstract" id="2502.11140"> arXiv:2502.11140 </a> (cross-list from cs.SE) [<a href="/pdf/2502.11140" title="Download PDF" id="pdf-2502.11140" aria-labelledby="pdf-2502.11140">pdf</a>, <a href="https://arxiv.org/html/2502.11140v1" title="View HTML" id="html-2502.11140" aria-labelledby="html-2502.11140" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11140" title="Other formats" id="oth-2502.11140" aria-labelledby="oth-2502.11140">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VisPath: Automated Visualization Code Synthesis via Multi-Path Reasoning and Feedback-Driven Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+W">Wonduk Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seungyong Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+D">Daye Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Z">Zonghao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seunghyun Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 3 figures, 4 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Unprecedented breakthroughs in Large Language Models (LLMs) have amplified their penetration into the application of automated visualization code generation. 
Few-shot prompting and query expansion techniques have notably enhanced data visualization performance; however, they still fail to overcome the ambiguity and complexity of natural language queries - imposing an inherent burden for manual human intervention. To mitigate such limitations, we propose a holistic framework VisPath : A Multi-Path Reasoning and Feedback-Driven Optimization Framework for Visualization Code Generation, which systematically enhances code quality through structured reasoning and refinement. VisPath is a multi-stage framework, specially designed to handle underspecified queries. To generate a robust final visualization code, it first utilizes the initial query to generate diverse reformulated queries via Chain-of-Thought (CoT) prompting, each representing a distinct reasoning path. Refined queries are used to produce candidate visualization scripts, consequently executed to generate multiple images. Comprehensively assessing correctness and quality of outputs, VisPath generates feedback for each image, which are then fed to an aggregation module to generate the optimal result. Extensive experiments on benchmarks including MatPlotBench and the Qwen-Agent Code Interpreter Benchmark show that VisPath significantly outperforms state-of-the-art (SOTA) methods, with improvements of up to 17% on average, offering a more reliable solution for AI-driven visualization code generation. 
</p> </div> </dd> <dt> <a name='item234'>[234]</a> <a href ="/abs/2502.11142" title="Abstract" id="2502.11142"> arXiv:2502.11142 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11142" title="Download PDF" id="pdf-2502.11142" aria-labelledby="pdf-2502.11142">pdf</a>, <a href="https://arxiv.org/html/2502.11142v1" title="View HTML" id="html-2502.11142" aria-labelledby="html-2502.11142" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11142" title="Other formats" id="oth-2502.11142" aria-labelledby="oth-2502.11142">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NavRAG: Generating User Demand Instructions for Embodied Navigation through Retrieval-Augmented LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zihan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yaohui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+G+H">Gim Hee Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Y">Yachun Fan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision-and-Language Navigation (VLN) is an essential skill for embodied agents, allowing them to navigate in 3D environments following natural language instructions. High-performance navigation models require a large amount of training data, but the high cost of manually annotating data has seriously hindered this field. Therefore, some previous methods translate trajectory videos into step-by-step instructions for expanding data, but such instructions do not match well with users' communication styles that briefly describe destinations or state specific needs. 
Moreover, local navigation trajectories overlook global context and high-level task planning. To address these issues, we propose NavRAG, a retrieval-augmented generation (RAG) framework that generates user demand instructions for VLN. NavRAG leverages LLM to build a hierarchical scene description tree for 3D scene understanding from global layout to local details, then simulates various user roles with specific demands to retrieve from the scene tree, generating diverse instructions with LLM. We annotate over 2 million navigation instructions across 861 scenes and evaluate the data quality and navigation performance of trained models. </p> </div> </dd> <dt> <a name='item235'>[235]</a> <a href ="/abs/2502.11155" title="Abstract" id="2502.11155"> arXiv:2502.11155 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11155" title="Download PDF" id="pdf-2502.11155" aria-labelledby="pdf-2502.11155">pdf</a>, <a href="https://arxiv.org/html/2502.11155v1" title="View HTML" id="html-2502.11155" aria-labelledby="html-2502.11155" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11155" title="Other formats" id="oth-2502.11155" aria-labelledby="oth-2502.11155">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Aware Search and Value Models: Mitigating Search Scaling Flaws in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yingru Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Value model-guided search is effective in steering the generation but suffers from scaling flaws: Its superiority diminishes with 
larger sample sizes, underperforming non-search baselines. This limitation arises from reliability degradation in value models in unseen reasoning paths. To address this, we propose an uncertainty-aware search framework that includes two key components: (1) uncertainty-aware value models that incorporate uncertainty into predictions, and (2) an uncertainty-aware selection process using the proposed efficient Group Thompson Sampling algorithm. Experiments on GSM8K show that our method mitigates search scaling flaws, achieving 90.5% coverage at 16 samples compared to 85.8% for conventional value-guided search. This work establishes the first systematic integration of uncertainty quantification in LLM search paradigms. </p> </div> </dd> <dt> <a name='item236'>[236]</a> <a href ="/abs/2502.11163" title="Abstract" id="2502.11163"> arXiv:2502.11163 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11163" title="Download PDF" id="pdf-2502.11163" aria-labelledby="pdf-2502.11163">pdf</a>, <a href="https://arxiv.org/html/2502.11163v1" title="View HTML" id="html-2502.11163" aria-labelledby="html-2502.11163" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11163" title="Other formats" id="oth-2502.11163" aria-labelledby="oth-2502.11163">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLMs as GeoGuessr Masters: Exceptional Performance, Hidden Biases, and Privacy Risks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jingyuan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Ziyi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jieyu Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Visual-Language Models (VLMs) have shown remarkable performance across various tasks, particularly in recognizing geographic information from images. However, significant challenges remain, including biases and privacy concerns. To systematically address these issues in the context of geographic information recognition, we introduce a benchmark dataset consisting of 1,200 images paired with detailed geographic metadata. Evaluating four VLMs, we find that while these models demonstrate the ability to recognize geographic information from images, achieving up to $53.8\%$ accuracy in city prediction, they exhibit significant regional biases. Specifically, performance is substantially higher for economically developed and densely populated regions compared to less developed ($-12.5\%$) and sparsely populated ($-17.0\%$) areas. Moreover, the models exhibit regional biases, frequently overpredicting certain locations; for instance, they consistently predict Sydney for images taken in Australia. The strong performance of VLMs also raises privacy concerns, particularly for users who share images online without the intent of being identified. Our code and dataset are publicly available at <a href="https://github.com/uscnlp-lime/FairLocator" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item237'>[237]</a> <a href ="/abs/2502.11167" title="Abstract" id="2502.11167"> arXiv:2502.11167 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11167" title="Download PDF" id="pdf-2502.11167" aria-labelledby="pdf-2502.11167">pdf</a>, <a href="/format/2502.11167" title="Other formats" id="oth-2502.11167" aria-labelledby="oth-2502.11167">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SURGE: On the Potential of Large Language Models as General-Purpose Surrogate Code Executors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+B">Bohan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Siqiao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zichen Liang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated remarkable capabilities in code-related tasks, such as code understanding and code generation. However, an equally important yet underexplored question is whether LLMs can serve as general-purpose surrogate code executors, to predict the output and behavior of a program without actually running it. To systematically investigate this capability, we introduce SURGE, a comprehensive benchmark covering eight key aspects: multi-language programming tasks, competition-level programming problems, repository-level code analysis, high-cost scientific computing, time-complexity-intensive algorithms, buggy code analysis, programs dependent on specific compilers or execution environments, and formal mathematical proof verification. 
We evaluate multiple open-source and proprietary LLMs on SURGE and conduct a scaling study to analyze the impact of model size and training data scale on surrogate execution accuracy. Additionally, we categorize model prediction errors and explore potential areas for improvement. Our findings indicate that while LLMs can predict code execution results in certain cases, they exhibit limitations in general-purpose surrogate execution. This study provides empirical insights into the feasibility of using LLMs as surrogate code executors. Code and dataset are released at <a href="https://github.com/Imbernoulli/SURGE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item238'>[238]</a> <a href ="/abs/2502.11191" title="Abstract" id="2502.11191"> arXiv:2502.11191 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11191" title="Download PDF" id="pdf-2502.11191" aria-labelledby="pdf-2502.11191">pdf</a>, <a href="https://arxiv.org/html/2502.11191v1" title="View HTML" id="html-2502.11191" aria-labelledby="html-2502.11191" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11191" title="Other formats" id="oth-2502.11191" aria-labelledby="oth-2502.11191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Primus: A Pioneering Collection of Open-Source Datasets for Cybersecurity LLM Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yao-Ching Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chiang,+T">Tsun-Han Chiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsai,+C">Cheng-Wei Tsai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chien-Ming Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsao,+W">Wen-Kwang Tsao</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality cybersecurity pretraining corpora, even though much research indicates that LLMs acquire their knowledge during pretraining. To address this, we present a comprehensive suite of datasets covering all major training stages, including pretraining, instruction fine-tuning, and reasoning distillation with cybersecurity-specific self-reflection data. Extensive ablation studies demonstrate their effectiveness on public cybersecurity benchmarks. In particular, continual pre-training on our dataset yields a 15.88% improvement in the aggregate score, while reasoning distillation leads to a 10% gain in security certification (CISSP). We will release all datasets and trained cybersecurity LLMs under the ODC-BY and MIT licenses to encourage further research in the community. For access to all datasets and model weights, please refer to <a href="https://huggingface.co/collections/trendmicro-ailab/primus-67b1fd27052b802b4af9d243" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item239'>[239]</a> <a href ="/abs/2502.11196" title="Abstract" id="2502.11196"> arXiv:2502.11196 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11196" title="Download PDF" id="pdf-2502.11196" aria-labelledby="pdf-2502.11196">pdf</a>, <a href="https://arxiv.org/html/2502.11196v1" title="View HTML" id="html-2502.11196" aria-labelledby="html-2502.11196" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11196" title="Other formats" id="oth-2502.11196" aria-labelledby="oth-2502.11196">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Do LLMs Acquire New Knowledge? A Knowledge Circuits Perspective on Continual Pre-Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+Y">Yixin Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Y">Yunzhi Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+N">Ningyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+H">Hui Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jiacheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Shumin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Huajun Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Despite exceptional capabilities in knowledge-intensive tasks, Large Language Models (LLMs) face a critical gap in 
understanding how they internalize new knowledge, particularly how to structurally embed acquired knowledge in their neural computations. We address this issue through the lens of knowledge circuit evolution, identifying computational subgraphs that facilitate knowledge storage and processing. Our systematic analysis of circuit evolution throughout continual pre-training reveals several key findings: (1) the acquisition of new knowledge is influenced by its relevance to pre-existing knowledge; (2) the evolution of knowledge circuits exhibits a distinct phase shift from formation to optimization; (3) the evolution of knowledge circuits follows a deep-to-shallow pattern. These insights not only advance our theoretical understanding of the mechanisms of new knowledge acquisition in LLMs, but also provide potential implications for improving continual pre-training strategies to enhance model performance. Code and data will be available at <a href="https://github.com/zjunlp/DynamicKnowledgeCircuits" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item240'>[240]</a> <a href ="/abs/2502.11221" title="Abstract" id="2502.11221"> arXiv:2502.11221 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11221" title="Download PDF" id="pdf-2502.11221" aria-labelledby="pdf-2502.11221">pdf</a>, <a href="https://arxiv.org/html/2502.11221v1" title="View HTML" id="html-2502.11221" aria-labelledby="html-2502.11221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11221" title="Other formats" id="oth-2502.11221" aria-labelledby="oth-2502.11221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PlanGenLLMs: A Modern Survey of LLM Planning Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+H">Hui Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zihao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shenghua He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+T">Tian Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+S">Shijia Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fei Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> LLMs have immense potential for generating plans, transforming an initial world state into a desired goal state. A large body of research has explored the use of LLMs for various planning tasks, from web navigation to travel planning and database querying. However, many of these systems are tailored to specific problems, making it challenging to compare them or determine the best approach for new tasks. 
There is also a lack of clear and consistent evaluation criteria. Our survey aims to offer a comprehensive overview of current LLM planners to fill this gap. It builds on foundational work by Kartam and Wilkins (1990) and examines six key performance criteria: completeness, executability, optimality, representation, generalization, and efficiency. For each, we provide a thorough analysis of representative works and highlight their strengths and weaknesses. Our paper also identifies crucial future directions, making it a valuable resource for both practitioners and newcomers interested in leveraging LLM planning to support agentic workflows. </p> </div> </dd> <dt> <a name='item241'>[241]</a> <a href ="/abs/2502.11246" title="Abstract" id="2502.11246"> arXiv:2502.11246 </a> (cross-list from cs.IR) [<a href="/pdf/2502.11246" title="Download PDF" id="pdf-2502.11246" aria-labelledby="pdf-2502.11246">pdf</a>, <a href="https://arxiv.org/html/2502.11246v1" title="View HTML" id="html-2502.11246" aria-labelledby="html-2502.11246" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11246" title="Other formats" id="oth-2502.11246" aria-labelledby="oth-2502.11246">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MemeSense: An Adaptive In-Context Framework for Social Commonsense Driven Meme Moderation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+R">Rajarshi Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Halder,+A">Avik Halder</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code and data available at: <a href="https://github.com/sayantan11995/MemeSense" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Computers and Society (cs.CY) </div> <p class='mathjax'> Memes present unique moderation challenges due to their subtle, multimodal interplay of images, text, and social context. Standard systems relying predominantly on explicit textual cues often overlook harmful content camouflaged by irony, symbolism, or cultural references. To address this gap, we introduce MemeSense, an adaptive in-context learning framework that fuses social commonsense reasoning with visually and semantically related reference examples. By encoding crucial task information into a learnable cognitive shift vector, MemeSense effectively balances lexical, visual, and ethical considerations, enabling precise yet context-aware meme intervention. Extensive evaluations on a curated set of implicitly harmful memes demonstrate that MemeSense substantially outperforms strong baselines, paving the way for safer online communities. 
Code and data available at: <a href="https://github.com/sayantan11995/MemeSense" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item242'>[242]</a> <a href ="/abs/2502.11256" title="Abstract" id="2502.11256"> arXiv:2502.11256 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11256" title="Download PDF" id="pdf-2502.11256" aria-labelledby="pdf-2502.11256">pdf</a>, <a href="https://arxiv.org/html/2502.11256v1" title="View HTML" id="html-2502.11256" aria-labelledby="html-2502.11256" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11256" title="Other formats" id="oth-2502.11256" aria-labelledby="oth-2502.11256">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling Environmental Impacts of Large Language Model Serving: A Functional Unit View </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yanran Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+I">Inez Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+Y">Yi Ding</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, 38 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Hardware Architecture (cs.AR); Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLMs) offer powerful capabilities but come with significant environmental costs, particularly in carbon emissions. Existing studies benchmark these emissions but lack a standardized basis for comparison across models. To address this, we introduce the concept of a functional unit (FU) and develop FUEL, the first FU-based framework for evaluating LLM serving's environmental impact. 
Through case studies on model size, quantization, and hardware, we uncover key trade-offs in sustainability. Our findings highlight the potential for reducing carbon emissions by optimizing model selection, deployment strategies, and hardware choices, paving the way for more sustainable AI infrastructure. </p> </div> </dd> <dt> <a name='item243'>[243]</a> <a href ="/abs/2502.11267" title="Abstract" id="2502.11267"> arXiv:2502.11267 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11267" title="Download PDF" id="pdf-2502.11267" aria-labelledby="pdf-2502.11267">pdf</a>, <a href="https://arxiv.org/html/2502.11267v1" title="View HTML" id="html-2502.11267" aria-labelledby="html-2502.11267" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11267" title="Other formats" id="oth-2502.11267" aria-labelledby="oth-2502.11267">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Prompting in the Dark: Assessing Human Performance in Prompt Engineering for Data Labeling When Gold Labels Are Absent </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zeyu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naphade,+S">Saniya Naphade</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+'">Ting-Hao 'Kenneth' Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted By CHI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Millions of users prompt large language models (LLMs) for various tasks, but how good are people at prompt engineering? Do users actually get closer to their desired outcome over multiple iterations of their prompts? 
These questions are crucial when no gold-standard labels are available to measure progress. This paper investigates a scenario in LLM-powered data labeling, "prompting in the dark," where users iteratively prompt LLMs to label data without using manually-labeled benchmarks. We developed PromptingSheet, a Google Sheets add-on that enables users to compose, revise, and iteratively label data through spreadsheets. Through a study with 20 participants, we found that prompting in the dark was highly unreliable-only 9 participants improved labeling accuracy after four or more iterations. Automated prompt optimization tools like DSPy also struggled when few gold labels were available. Our findings highlight the importance of gold labels and the needs, as well as the risks, of automated support in human prompt engineering, providing insights for future tool design. </p> </div> </dd> <dt> <a name='item244'>[244]</a> <a href ="/abs/2502.11271" title="Abstract" id="2502.11271"> arXiv:2502.11271 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11271" title="Download PDF" id="pdf-2502.11271" aria-labelledby="pdf-2502.11271">pdf</a>, <a href="https://arxiv.org/html/2502.11271v1" title="View HTML" id="html-2502.11271" aria-labelledby="html-2502.11271" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11271" title="Other formats" id="oth-2502.11271" aria-labelledby="oth-2502.11271">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OctoTools: An Agentic Framework with Extensible Tools for Complex Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+P">Pan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+B">Bowen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thapa,+R">Rahul Thapa</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Boen,+J">Joseph Boen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+J">James Zou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 89 pages, 18 figures. Project website: <a href="https://octotools.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Solving complex reasoning tasks may involve visual understanding, domain knowledge retrieval, numerical calculation, and multi-step reasoning. Existing methods augment large language models (LLMs) with external tools but are restricted to specialized domains, limited tool types, or require additional training data. In this paper, we introduce OctoTools, a training-free, user-friendly, and easily extensible open-source agentic framework designed to tackle complex reasoning across diverse domains. OctoTools introduces standardized tool cards to encapsulate tool functionality, a planner for both high-level and low-level planning, and an executor to carry out tool usage. We validate OctoTools' generality across 16 diverse tasks (including MathVista, MMLU-Pro, MedQA, and GAIA-Text), achieving substantial average accuracy gains of 9.3% over GPT-4o. Furthermore, OctoTools outperforms AutoGen, GPT-Functions and LangChain by up to 10.6% when given the same set of tools. Through comprehensive analysis and ablations, OctoTools demonstrates advantages in task planning, effective tool usage, and multi-step problem solving. 
</p> </div> </dd> <dt> <a name='item245'>[245]</a> <a href ="/abs/2502.11298" title="Abstract" id="2502.11298"> arXiv:2502.11298 </a> (cross-list from cs.NI) [<a href="/pdf/2502.11298" title="Download PDF" id="pdf-2502.11298" aria-labelledby="pdf-2502.11298">pdf</a>, <a href="https://arxiv.org/html/2502.11298v1" title="View HTML" id="html-2502.11298" aria-labelledby="html-2502.11298" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11298" title="Other formats" id="oth-2502.11298" aria-labelledby="oth-2502.11298">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Integrating Language Models for Enhanced Network State Monitoring in DRL-Based SFC Provisioning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moshiri,+P+F">Parisa Fard Moshiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Onsu,+M+A">Murat Arda Onsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lohan,+P">Poonam Lohan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kantarci,+B">Burak Kantarci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Janulewicz,+E">Emil Janulewicz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 5 figures, submitted to 30th IEEE International Symposium on Computers and Communications (ISCC) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Networking and Internet Architecture (cs.NI)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Efficient Service Function Chain (SFC) provisioning and Virtual Network Function (VNF) placement are critical for enhancing network performance in modern architectures such as Software-Defined Networking (SDN) and Network Function Virtualization (NFV). 
While Deep Reinforcement Learning (DRL) aids decision-making in dynamic network environments, its reliance on structured inputs and predefined rules limits adaptability in unforeseen scenarios. Additionally, incorrect actions by a DRL agent may require numerous training iterations to correct, potentially reinforcing suboptimal policies and degrading performance. This paper integrates DRL with Language Models (LMs), specifically Bidirectional Encoder Representations from Transformers (BERT) and DistilBERT, to enhance network management. By feeding final VNF allocations from DRL into the LM, the system can process and respond to queries related to SFCs, DCs, and VNFs, enabling real-time insights into resource utilization, bottleneck detection, and future demand planning. The LMs are fine-tuned to our domain-specific dataset using Low-Rank Adaptation (LoRA). Results show that BERT outperforms DistilBERT with a lower test loss (0.28 compared to 0.36) and higher confidence (0.83 compared to 0.74), though BERT requires approximately 46% more processing time. 
</p> </div> </dd> <dt> <a name='item246'>[246]</a> <a href ="/abs/2502.11304" title="Abstract" id="2502.11304"> arXiv:2502.11304 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11304" title="Download PDF" id="pdf-2502.11304" aria-labelledby="pdf-2502.11304">pdf</a>, <a href="https://arxiv.org/html/2502.11304v1" title="View HTML" id="html-2502.11304" aria-labelledby="html-2502.11304" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11304" title="Other formats" id="oth-2502.11304" aria-labelledby="oth-2502.11304">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Multimodal-LLMs Assisted by Instance Segmentation for Intelligent Traffic Monitoring </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Onsu,+M+A">Murat Arda Onsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lohan,+P">Poonam Lohan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kantarci,+B">Burak Kantarci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Syed,+A">Aisha Syed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Andrews,+M">Matthew Andrews</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kennedy,+S">Sean Kennedy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 7 figures, submitted to 30th IEEE International Symposium on Computers and Communications (ISCC) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> A robust and efficient traffic monitoring system is essential for smart cities and Intelligent Transportation Systems (ITS), using sensors and cameras to track vehicle movements, optimize traffic flow, reduce congestion, 
enhance road safety, and enable real-time adaptive traffic control. Traffic monitoring models must comprehensively understand dynamic urban conditions and provide an intuitive user interface for effective management. This research leverages the LLaVA visual grounding multimodal large language model (LLM) for traffic monitoring tasks on the real-time Quanser Interactive Lab simulation platform, covering scenarios like intersections, congestion, and collisions. Cameras placed at multiple urban locations collect real-time images from the simulation, which are fed into the LLaVA model with queries for analysis. An instance segmentation model integrated into the cameras highlights key elements such as vehicles and pedestrians, enhancing training and throughput. The system achieves 84.3% accuracy in recognizing vehicle locations and 76.4% in determining steering direction, outperforming traditional models. </p> </div> </dd> <dt> <a name='item247'>[247]</a> <a href ="/abs/2502.11308" title="Abstract" id="2502.11308"> arXiv:2502.11308 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11308" title="Download PDF" id="pdf-2502.11308" aria-labelledby="pdf-2502.11308">pdf</a>, <a href="https://arxiv.org/html/2502.11308v1" title="View HTML" id="html-2502.11308" aria-labelledby="html-2502.11308" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11308" title="Other formats" id="oth-2502.11308" aria-labelledby="oth-2502.11308">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ALGEN: Few-shot Inversion Attacks on Textual Embeddings using Alignment and Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yiyi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qiongkai Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> 18 pages, 13 tables, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> With the growing popularity of Large Language Models (LLMs) and vector databases, private textual data is increasingly processed and stored as numerical embeddings. However, recent studies have proven that such embeddings are vulnerable to inversion attacks, where original text is reconstructed to reveal sensitive information. Previous research has largely assumed access to millions of sentences to train attack models, e.g., through data leakage or nearly unrestricted API access. With our method, a single data point is sufficient for a partially successful inversion attack. With as little as 1k data samples, performance reaches an optimum across a range of black-box encoders, without training on leaked data. We present a Few-shot Textual Embedding Inversion Attack using ALignment and GENeration (ALGEN), by aligning victim embeddings to the attack space and using a generative model to reconstruct text. We find that ALGEN attacks can be effectively transferred across domains and languages, revealing key information. We further examine a variety of defense mechanisms against ALGEN, and find that none are effective, highlighting the vulnerabilities posed by inversion attacks. By significantly lowering the cost of inversion and proving that embedding spaces can be aligned through one-step optimization, we establish a new textual embedding inversion paradigm with broader applications for embedding alignment in NLP. 
</p> </div> </dd> <dt> <a name='item248'>[248]</a> <a href ="/abs/2502.11356" title="Abstract" id="2502.11356"> arXiv:2502.11356 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11356" title="Download PDF" id="pdf-2502.11356" aria-labelledby="pdf-2502.11356">pdf</a>, <a href="https://arxiv.org/html/2502.11356v1" title="View HTML" id="html-2502.11356" aria-labelledby="html-2502.11356" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11356" title="Other formats" id="oth-2502.11356" aria-labelledby="oth-2502.11356">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SAIF: A Sparse Autoencoder Framework for Interpreting and Steering Instruction Following of Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zirui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haiyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+Y">Yiran Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Payani,+A">Ali Payani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+M">Mengnan Du</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages, 11 figures, 6 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> The ability of large language models (LLMs) to follow instructions is crucial for their practical applications, yet the underlying mechanisms remain poorly understood. 
This paper presents a novel framework that leverages sparse autoencoders (SAE) to interpret how instruction following works in these models. We demonstrate how the features we identify can effectively steer model outputs to align with given instructions. Through analysis of SAE latent activations, we identify specific latents responsible for instruction following behavior. Our findings reveal that instruction following capabilities are encoded by a distinct set of instruction-relevant SAE latents. These latents both show semantic proximity to relevant instructions and demonstrate causal effects on model behavior. Our research highlights several crucial factors for achieving effective steering performance: precise feature identification, the role of final layer, and optimal instruction positioning. Additionally, we demonstrate that our methodology scales effectively across SAEs and LLMs of varying sizes. </p> </div> </dd> <dt> <a name='item249'>[249]</a> <a href ="/abs/2502.11360" title="Abstract" id="2502.11360"> arXiv:2502.11360 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11360" title="Download PDF" id="pdf-2502.11360" aria-labelledby="pdf-2502.11360">pdf</a>, <a href="https://arxiv.org/html/2502.11360v1" title="View HTML" id="html-2502.11360" aria-labelledby="html-2502.11360" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11360" title="Other formats" id="oth-2502.11360" aria-labelledby="oth-2502.11360">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GeoDANO: Geometric VLM with Domain Agnostic Vision Encoder </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+S">Seunghyuk Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Z">Zhenyue Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+Y">Youngbin 
Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seungbeom Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dongwoo Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We introduce GeoDANO, a geometric vision-language model (VLM) with a domain-agnostic vision encoder, for solving plane geometry problems. Although VLMs have been employed for solving geometry problems, their ability to recognize geometric features remains insufficiently analyzed. To address this gap, we propose a benchmark that evaluates the recognition of visual geometric features, including primitives such as dots and lines, and relations such as orthogonality. Our preliminary study shows that vision encoders often used in general-purpose VLMs, e.g., OpenCLIP, fail to detect these features and struggle to generalize across domains. We develop GeoCLIP, a CLIP based model trained on synthetic geometric diagram-caption pairs to overcome the limitation. Benchmark results show that GeoCLIP outperforms existing vision encoders in recognizing geometric features. We then propose our VLM, GeoDANO, which augments GeoCLIP with a domain adaptation strategy for unseen diagram styles. GeoDANO outperforms specialized methods for plane geometry problems and GPT-4o on MathVerse. 
</p> </div> </dd> <dt> <a name='item250'>[250]</a> <a href ="/abs/2502.11367" title="Abstract" id="2502.11367"> arXiv:2502.11367 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11367" title="Download PDF" id="pdf-2502.11367" aria-labelledby="pdf-2502.11367">pdf</a>, <a href="https://arxiv.org/html/2502.11367v1" title="View HTML" id="html-2502.11367" aria-labelledby="html-2502.11367" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11367" title="Other formats" id="oth-2502.11367" aria-labelledby="oth-2502.11367">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sparse Autoencoder Features for Classifications and Transferability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gallifant,+J">Jack Gallifant</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasse,+K">Kuleen Sasse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aerts,+H">Hugo Aerts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hartvigsen,+T">Thomas Hartvigsen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bitterman,+D+S">Danielle S. Bitterman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Sparse Autoencoders (SAEs) provide potentials for uncovering structured, human-interpretable representations in Large Language Models (LLMs), making them a crucial tool for transparent and controllable AI systems. We systematically analyze SAE for interpretable feature extraction from LLMs in safety-critical classification tasks. 
Our framework evaluates (1) model-layer selection and scaling properties, (2) SAE architectural configurations, including width and pooling strategies, and (3) the effect of binarizing continuous SAE activations. SAE-derived features achieve macro F1 > 0.8, outperforming hidden-state and BoW baselines while demonstrating cross-model transfer from Gemma 2 2B to 9B-IT models. These features generalize in a zero-shot manner to cross-lingual toxicity detection and visual classification tasks. Our analysis highlights the significant impact of pooling strategies and binarization thresholds, showing that binarization offers an efficient alternative to traditional feature selection while maintaining or improving performance. These findings establish new best practices for SAE-based interpretability and enable scalable, transparent deployment of LLMs in real-world applications. Full repo: <a href="https://github.com/shan23chen/MOSAIC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item251'>[251]</a> <a href ="/abs/2502.11379" title="Abstract" id="2502.11379"> arXiv:2502.11379 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11379" title="Download PDF" id="pdf-2502.11379" aria-labelledby="pdf-2502.11379">pdf</a>, <a href="https://arxiv.org/html/2502.11379v1" title="View HTML" id="html-2502.11379" aria-labelledby="html-2502.11379" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11379" title="Other formats" id="oth-2502.11379" aria-labelledby="oth-2502.11379">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CCJA: Context-Coherent Jailbreak Attack for Aligned Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+G">Guanghao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+P">Panjia Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+M">Mingyuan Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Cen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+M">Mingyuan Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jun Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Despite explicit alignment efforts for large language models (LLMs), they can still be exploited to trigger unintended behaviors, a phenomenon known as "jailbreaking." Current jailbreak attack methods mainly focus on discrete prompt manipulations targeting closed-source LLMs, relying on manually crafted prompt templates and persuasion rules. 
However, as the capabilities of open-source LLMs improve, ensuring their safety becomes increasingly crucial. In such an environment, the accessibility of model parameters and gradient information by potential attackers exacerbates the severity of jailbreak threats. To address this research gap, we propose a novel \underline{C}ontext-\underline{C}oherent \underline{J}ailbreak \underline{A}ttack (CCJA). We define jailbreak attacks as an optimization problem within the embedding space of masked language models. Through combinatorial optimization, we effectively balance the jailbreak attack success rate with semantic coherence. Extensive evaluations show that our method not only maintains semantic consistency but also surpasses state-of-the-art baselines in attack effectiveness. Additionally, by integrating semantically coherent jailbreak prompts generated by our method into widely used black-box methodologies, we observe a notable enhancement in their success rates when targeting closed-source commercial LLMs. This highlights the security threat posed by open-source LLMs to commercial counterparts. We will open-source our code if the paper is accepted. 
</p> </div> </dd> <dt> <a name='item252'>[252]</a> <a href ="/abs/2502.11435" title="Abstract" id="2502.11435"> arXiv:2502.11435 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11435" title="Download PDF" id="pdf-2502.11435" aria-labelledby="pdf-2502.11435">pdf</a>, <a href="https://arxiv.org/html/2502.11435v1" title="View HTML" id="html-2502.11435" aria-labelledby="html-2502.11435" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11435" title="Other formats" id="oth-2502.11435" aria-labelledby="oth-2502.11435">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SMART: Self-Aware Agent for Tool Overuse Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+C">Cheng Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Acikgoz,+E+C">Emre Can Acikgoz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongru Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sil,+A">Avirup Sil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tur,+G">Gokhan Tur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+H">Heng Ji</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 8 tables, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Current Large Language Model (LLM) agents demonstrate strong reasoning and tool use capabilities, but often lack self-awareness, failing to balance these approaches effectively. 
This imbalance leads to Tool Overuse, where models unnecessarily rely on external tools for tasks solvable with parametric knowledge, increasing computational overhead. Inspired by human metacognition, we introduce SMART (Strategic Model-Aware Reasoning with Tools), a paradigm that enhances an agent's self-awareness to optimize task handling and reduce tool overuse. To support this paradigm, we introduce SMART-ER, a dataset spanning three domains, where reasoning alternates between parametric knowledge and tool-dependent steps, with each step enriched by rationales explaining when tools are necessary. Through supervised training, we develop SMARTAgent, a family of models that dynamically balance parametric knowledge and tool use. Evaluations show that SMARTAgent reduces tool use by 24% while improving performance by over 37%, enabling 7B-scale models to match its 70B counterpart and GPT-4o. Additionally, SMARTAgent generalizes to out-of-distribution test data like GSM8K and MINTQA, maintaining accuracy with just one-fifth the tool calls. These highlight the potential of strategic tool use to enhance reasoning, mitigate overuse, and bridge the gap between model size and performance, advancing intelligent and resource-efficient agent designs. 
</p> </div> </dd> <dt> <a name='item253'>[253]</a> <a href ="/abs/2502.11442" title="Abstract" id="2502.11442"> arXiv:2502.11442 </a> (cross-list from cs.IR) [<a href="/pdf/2502.11442" title="Download PDF" id="pdf-2502.11442" aria-labelledby="pdf-2502.11442">pdf</a>, <a href="https://arxiv.org/html/2502.11442v1" title="View HTML" id="html-2502.11442" aria-labelledby="html-2502.11442" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11442" title="Other formats" id="oth-2502.11442" aria-labelledby="oth-2502.11442">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Turn Multi-Modal Question Clarification for Enhanced Conversational Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramezan,+K">Kimia Ramezan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bavandpour,+A+A">Alireza Amiri Bavandpour</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yifei Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Siro,+C">Clemencia Siro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aliannejadi,+M">Mohammad Aliannejadi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Conversational query clarification enables users to refine their search queries through interactive dialogue, improving search effectiveness. Traditional approaches rely on text-based clarifying questions, which often fail to capture complex user preferences, particularly those involving visual attributes. 
While recent work has explored single-turn multi-modal clarification with images alongside text, such methods do not fully support the progressive nature of user intent refinement over multiple turns. Motivated by this, we introduce the Multi-turn Multi-modal Clarifying Questions (MMCQ) task, which combines text and visual modalities to refine user queries in a multi-turn conversation. To facilitate this task, we create a large-scale dataset named ClariMM comprising over 13k multi-turn interactions and 33k question-answer pairs containing multi-modal clarifying questions. We propose Mario, a retrieval framework that employs a two-phase ranking strategy: initial retrieval with BM25, followed by a multi-modal generative re-ranking model that integrates textual and visual information from conversational history. Our experiments show that multi-turn multi-modal clarification outperforms uni-modal and single-turn approaches, improving MRR by 12.88%. The gains are most significant in longer interactions, demonstrating the value of progressive refinement for complex queries. 
</p> </div> </dd> <dt> <a name='item254'>[254]</a> <a href ="/abs/2502.11466" title="Abstract" id="2502.11466"> arXiv:2502.11466 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11466" title="Download PDF" id="pdf-2502.11466" aria-labelledby="pdf-2502.11466">pdf</a>, <a href="https://arxiv.org/html/2502.11466v1" title="View HTML" id="html-2502.11466" aria-labelledby="html-2502.11466" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11466" title="Other formats" id="oth-2502.11466" aria-labelledby="oth-2502.11466">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GiFT: Gibbs Fine-Tuning for Code Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haochen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+W">Wanjin Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Z">Zhiqi Shen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Software Engineering (cs.SE) </div> <p class='mathjax'> Training Large Language Models (LLMs) with synthetic data is a prevalent practice in code generation. A key approach is self-training, where LLMs are iteratively trained on self-generated correct code snippets. In this case, the self-generated codes are drawn from a conditional distribution, conditioned on a specific seed description. However, the seed description is not the only valid representation that aligns with its intended meaning. With all valid descriptions and codes forming a joint space, codes drawn from the conditional distribution would lead to an underrepresentation of the full description-code space. 
As such, we propose Gibbs Fine-Tuning (GiFT), a novel self-training method inspired by Gibbs sampling. GiFT allows self-generated data to be drawn from the marginal distribution of the joint space, thereby mitigating the biases inherent in conditional sampling. We provide a theoretical analysis demonstrating the potential benefits of fine-tuning LLMs with code derived from the marginal distribution. Furthermore, we propose a perplexity-based code selection method to mitigate the imbalanced long-tail distribution of the self-generated codes. Empirical evaluation of two LLMs across four datasets demonstrates that GiFT achieves superior performance, particularly on more challenging benchmarks. </p> </div> </dd> <dt> <a name='item255'>[255]</a> <a href ="/abs/2502.11482" title="Abstract" id="2502.11482"> arXiv:2502.11482 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11482" title="Download PDF" id="pdf-2502.11482" aria-labelledby="pdf-2502.11482">pdf</a>, <a href="https://arxiv.org/html/2502.11482v1" title="View HTML" id="html-2502.11482" aria-labelledby="html-2502.11482" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11482" title="Other formats" id="oth-2502.11482" aria-labelledby="oth-2502.11482">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DATA: Decomposed Attention-based Task Adaptation for Rehearsal-Free Continual Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+H">Huanxuan Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shizhu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Y">Yupu Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Continual learning (CL) is essential for Large Language Models (LLMs) to adapt to evolving real-world demands, yet they are susceptible to catastrophic forgetting (CF). While traditional CF solutions rely on expensive data rehearsal, recent rehearsal-free methods employ model-based and regularization-based strategies to address this issue. However, these approaches often neglect the model's plasticity, which is crucial to achieving optimal performance on newly learned tasks. Consequently, a key challenge in CL is striking a balance between preserving plasticity and mitigating CF. To tackle this challenge, we propose the $\textbf{D}$ecomposed $\textbf{A}$ttention-based $\textbf{T}$ask $\textbf{A}$daptation (DATA), which explicitly decouples and learns both task-specific and task-shared knowledge using high-rank and low-rank task adapters (e.g., LoRAs). For new tasks, DATA dynamically adjusts the weights of adapters of different ranks based on their relevance and distinction from previous tasks, allowing the model to acquire new task-specific skills while effectively retaining previously learned knowledge. Specifically, we implement a decomposed component weighting strategy comprising learnable components that collectively generate attention-based weights, allowing the model to integrate and utilize diverse knowledge from each DATA. Extensive experiments on three widely used benchmarks demonstrate that our proposed method achieves state-of-the-art performance. Notably, our approach significantly enhances model plasticity and mitigates CF by extending learnable components and employing stochastic restoration during training iterations. 
</p> </div> </dd> <dt> <a name='item256'>[256]</a> <a href ="/abs/2502.11492" title="Abstract" id="2502.11492"> arXiv:2502.11492 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11492" title="Download PDF" id="pdf-2502.11492" aria-labelledby="pdf-2502.11492">pdf</a>, <a href="https://arxiv.org/html/2502.11492v1" title="View HTML" id="html-2502.11492" aria-labelledby="html-2502.11492" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11492" title="Other formats" id="oth-2502.11492" aria-labelledby="oth-2502.11492">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kung-Hsiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+C">Can Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+H">Haoyi Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joty,+S">Shafiq Joty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+C">Caiming Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic, seemingly simple capabilities like object counting or length comparison, which are essential for relevant complex tasks like chart understanding and geometric reasoning. 
In this work, we first investigate the root causes of this deficiency through a suite of probing tasks focusing on basic visual arithmetic. Our analysis reveals that while pre-trained vision encoders typically capture sufficient information, the text decoder often fails to decode it correctly for arithmetic reasoning. To address this, we propose CogAlign, a novel post-training strategy inspired by Piaget's theory of cognitive development. CogAlign trains VLMs to recognize invariant properties under visual transformations. We demonstrate that this approach significantly improves the performance of three diverse VLMs on our proposed probing tasks. Furthermore, CogAlign enhances performance by an average of 4.6% on CHOCOLATE and 2.9% on MATH-VISION, outperforming or matching supervised fine-tuning methods while requiring only 60% less training data. These results highlight the effectiveness and generalizability of CogAlign in improving fundamental visual arithmetic capabilities and their transfer to downstream tasks. 
</p> </div> </dd> <dt> <a name='item257'>[257]</a> <a href ="/abs/2502.11554" title="Abstract" id="2502.11554"> arXiv:2502.11554 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11554" title="Download PDF" id="pdf-2502.11554" aria-labelledby="pdf-2502.11554">pdf</a>, <a href="https://arxiv.org/html/2502.11554v1" title="View HTML" id="html-2502.11554" aria-labelledby="html-2502.11554" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11554" title="Other formats" id="oth-2502.11554" aria-labelledby="oth-2502.11554">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Metaphor-Fluid Conversation Design for Voice User Interfaces </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Desai,+S">Smit Desai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chin,+J">Jessie Chin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dakuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cowan,+B">Benjamin Cowan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Twidale,+M">Michael Twidale</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computers and Society (cs.CY); Emerging Technologies (cs.ET) </div> <p class='mathjax'> Metaphors play a critical role in shaping user experiences with Voice User Interfaces (VUIs), yet existing designs often rely on static, human-centric metaphors that fail to adapt to diverse contexts and user needs. This paper introduces Metaphor-Fluid Design, a novel approach that dynamically adjusts metaphorical representations based on conversational use-contexts. 
We compare this approach to a Default VUI, which characterizes the present implementation of commercial VUIs commonly designed around the persona of an assistant, offering a uniform interaction style across contexts. In Study 1 (N=130), metaphors were mapped to four key use-contexts—commands, information seeking, sociality, and error recovery—along the dimensions of formality and hierarchy, revealing distinct preferences for task-specific metaphorical designs. Study 2 (N=91) evaluates a Metaphor-Fluid VUI against a Default VUI, showing that the Metaphor-Fluid VUI enhances perceived intention to adopt, enjoyment, and likability by aligning better with user expectations for different contexts. However, individual differences in metaphor preferences highlight the need for personalization. These findings challenge the one-size-fits-all paradigm of VUI design and demonstrate the potential of Metaphor-Fluid Design to create more adaptive and engaging human-AI interactions. </p> </div> </dd> <dt> <a name='item258'>[258]</a> <a href ="/abs/2502.11645" title="Abstract" id="2502.11645"> arXiv:2502.11645 </a> (cross-list from cs.GT) [<a href="/pdf/2502.11645" title="Download PDF" id="pdf-2502.11645" aria-labelledby="pdf-2502.11645">pdf</a>, <a href="/format/2502.11645" title="Other formats" id="oth-2502.11645" aria-labelledby="oth-2502.11645">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Deviation Ratings: A General, Clone-Invariant Rating Method </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Marris,+L">Luke Marris</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gemp,+I">Ian Gemp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Piliouras,+G">Georgios Piliouras</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lanctot,+M">Marc 
Lanctot</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Science and Game Theory (cs.GT)</span>; Computation and Language (cs.CL); Multiagent Systems (cs.MA); Other Statistics (stat.OT) </div> <p class='mathjax'> Many real-world multi-agent or multi-task evaluation scenarios can be naturally modelled as normal-form games due to inherent strategic (adversarial, cooperative, and mixed motive) interactions. These strategic interactions may be agentic (e.g. players trying to win), fundamental (e.g. cost vs quality), or complementary (e.g. niche finding and specialization). In such a formulation, it is the strategies (actions, policies, agents, models, tasks, prompts, etc.) that are rated. However, the rating problem is complicated by redundancy and complexity of N-player strategic interactions. Repeated or similar strategies can distort ratings for those that counter or complement them. Previous work proposed ``clone invariant'' ratings to handle such redundancies, but this was limited to two-player zero-sum (i.e. strictly competitive) interactions. This work introduces the first N-player general-sum clone invariant rating, called deviation ratings, based on coarse correlated equilibria. The rating is explored on several domains including LLMs evaluation. 
</p> </div> </dd> <dt> <a name='item259'>[259]</a> <a href ="/abs/2502.11678" title="Abstract" id="2502.11678"> arXiv:2502.11678 </a> (cross-list from cs.CY) [<a href="/pdf/2502.11678" title="Download PDF" id="pdf-2502.11678" aria-labelledby="pdf-2502.11678">pdf</a>, <a href="https://arxiv.org/html/2502.11678v1" title="View HTML" id="html-2502.11678" aria-labelledby="html-2502.11678" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11678" title="Other formats" id="oth-2502.11678" aria-labelledby="oth-2502.11678">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring LLM-based Student Simulation for Metacognitive Cultivation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoxuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jifan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cong,+X">Xin Cong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+Y">Yang Dang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+Y">Yisi Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huiqin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Metacognitive education plays a crucial role in cultivating students' self-regulation and reflective thinking, providing essential support for those with learning difficulties through academic advising. 
Simulating students with insufficient learning capabilities using large language models offers a promising approach to refining pedagogical methods without ethical concerns. However, existing simulations often fail to authentically represent students' learning struggles and face challenges in evaluation due to the lack of reliable metrics and ethical constraints in data collection. To address these issues, we propose a pipeline for automatically generating and filtering high-quality simulated student agents. Our approach leverages a two-round automated scoring system validated by human experts and employs a score propagation module to obtain more consistent scores across the student graph. Experimental results demonstrate that our pipeline efficiently identifies high-quality student agents, and we discuss the traits that influence the simulation's effectiveness. By simulating students with varying degrees of learning difficulties, our work paves the way for broader applications in personalized learning and educational assessment. 
</p> </div> </dd> <dt> <a name='item260'>[260]</a> <a href ="/abs/2502.11767" title="Abstract" id="2502.11767"> arXiv:2502.11767 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11767" title="Download PDF" id="pdf-2502.11767" aria-labelledby="pdf-2502.11767">pdf</a>, <a href="https://arxiv.org/html/2502.11767v1" title="View HTML" id="html-2502.11767" aria-labelledby="html-2502.11767" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11767" title="Other formats" id="oth-2502.11767" aria-labelledby="oth-2502.11767">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Selection to Generation: A Survey of LLM-based Active Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+Y">Yu Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+S">Subhojyoti Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhouhang Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Junda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xintong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aponte,+R">Ryan Aponte</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+H">Hanjia Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barrow,+J">Joe Barrow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hongjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dernoncourt,+F">Franck Dernoncourt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kveton,+B">Branislav Kveton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruiyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jiuxiang Gu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+N+K">Nesreen K. Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deilamsalehy,+H">Hanieh Deilamsalehy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sungchul Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Z">Zhengmian Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yue Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lipka,+N">Nedim Lipka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoon,+S">Seunghyun Yoon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+K">Ting-Hao Kenneth Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zichao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mathur,+P">Puneet Mathur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pal,+S">Soumyabrata Pal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+K">Koyel Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhehao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+N">Namyong Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+T+H">Thien Huu Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiebo Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rossi,+R+A">Ryan A. 
Rossi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Active Learning (AL) has been a powerful paradigm for improving model efficiency and performance by selecting the most informative data points for labeling and training. In recent active learning frameworks, Large Language Models (LLMs) have been employed not only for selection but also for generating entirely new data instances and providing more cost-effective annotations. Motivated by the increasing importance of high-quality data and efficient model training in the era of LLMs, we present a comprehensive survey on LLM-based Active Learning. We introduce an intuitive taxonomy that categorizes these techniques and discuss the transformative roles LLMs can play in the active learning loop. We further examine the impact of AL on LLM learning paradigms and its applications across various domains. Finally, we identify open challenges and propose future research directions. This survey aims to serve as an up-to-date resource for researchers and practitioners seeking to gain an intuitive understanding of LLM-based AL techniques and deploy them to new applications. 
</p> </div> </dd> <dt> <a name='item261'>[261]</a> <a href ="/abs/2502.11799" title="Abstract" id="2502.11799"> arXiv:2502.11799 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11799" title="Download PDF" id="pdf-2502.11799" aria-labelledby="pdf-2502.11799">pdf</a>, <a href="https://arxiv.org/html/2502.11799v1" title="View HTML" id="html-2502.11799" aria-labelledby="html-2502.11799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11799" title="Other formats" id="oth-2502.11799" aria-labelledby="oth-2502.11799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Table-Critic: A Multi-Agent Framework for Collaborative Criticism and Refinement in Table Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P">Peiying Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guoxin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingjing Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Despite the remarkable capabilities of large language models (LLMs) in various reasoning tasks, they still struggle with table reasoning tasks, particularly in maintaining consistency throughout multi-step reasoning processes. While existing approaches have explored various decomposition strategies, they often lack effective mechanisms to identify and correct errors in intermediate reasoning steps, leading to cascading error propagation. To address these issues, we propose Table-Critic, a novel multi-agent framework that facilitates collaborative criticism and iterative refinement of the reasoning process until convergence to correct solutions. 
Our framework consists of four specialized agents: a Judge for error identification, a Critic for comprehensive critiques, a Refiner for process improvement, and a Curator for pattern distillation. To effectively deal with diverse and unpredictable error types, we introduce a self-evolving template tree that systematically accumulates critique knowledge through experience-driven learning and guides future reflections. Extensive experiments have demonstrated that Table-Critic achieves substantial improvements over existing methods, achieving superior accuracy and error correction rates while maintaining computational efficiency and lower solution degradation rate. </p> </div> </dd> <dt> <a name='item262'>[262]</a> <a href ="/abs/2502.11859" title="Abstract" id="2502.11859"> arXiv:2502.11859 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11859" title="Download PDF" id="pdf-2502.11859" aria-labelledby="pdf-2502.11859">pdf</a>, <a href="https://arxiv.org/html/2502.11859v1" title="View HTML" id="html-2502.11859" aria-labelledby="html-2502.11859" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11859" title="Other formats" id="oth-2502.11859" aria-labelledby="oth-2502.11859">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Defining and Evaluating Visual Language Models' Basic Spatial Abilities: A Perspective from Psychometrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wenrui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+D">Dalin Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weihang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chen Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yong Li</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The Theory of Multiple Intelligences underscores the hierarchical nature of cognitive capabilities. To advance Spatial Artificial Intelligence, we pioneer a psychometric framework defining five Basic Spatial Abilities (BSAs) in Visual Language Models (VLMs): Spatial Perception, Spatial Relation, Spatial Orientation, Mental Rotation, and Spatial Visualization. Benchmarking 13 mainstream VLMs through nine validated psychometric experiments reveals significant gaps versus humans (average score 24.95 vs. 68.38), with three key findings: 1) VLMs mirror human hierarchies (strongest in 2D orientation, weakest in 3D rotation) with independent BSAs (Pearson's r<0.4); 2) Smaller models such as Qwen2-VL-7B surpass larger counterparts, with Qwen leading (30.82) and InternVL2 lagging (19.6); 3) Interventions like chain-of-thought (0.100 accuracy gain) and 5-shot training (0.259 improvement) show limits from architectural constraints. Identified barriers include weak geometry encoding and missing dynamic simulation. By linking psychometric BSAs to VLM capabilities, we provide a diagnostic toolkit for spatial intelligence evaluation, methodological foundations for embodied AI development, and a cognitive science-informed roadmap for achieving human-like spatial intelligence. 
</p> </div> </dd> <dt> <a name='item263'>[263]</a> <a href ="/abs/2502.11880" title="Abstract" id="2502.11880"> arXiv:2502.11880 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11880" title="Download PDF" id="pdf-2502.11880" aria-labelledby="pdf-2502.11880">pdf</a>, <a href="https://arxiv.org/html/2502.11880v1" title="View HTML" id="html-2502.11880" aria-labelledby="html-2502.11880" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11880" title="Other formats" id="oth-2502.11880" aria-labelledby="oth-2502.11880">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bitnet.cpp: Efficient Edge Inference for Ternary LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jinheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hansong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+T">Ting Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shijie Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+Y">Yan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Ting Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+J">Jianyu Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shuming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+F">Furu Wei</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 11 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Distributed, Parallel, and Cluster Computing (cs.DC) </div> <p class='mathjax'> The advent of 
1-bit large language models (LLMs), led by BitNet b1.58, has spurred interest in ternary LLMs. Despite this, research and practical applications focusing on efficient edge inference for ternary LLMs remain scarce. To bridge this gap, we introduce <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a>, an inference system optimized for BitNet b1.58 and ternary LLMs. Given that mixed-precision matrix multiplication (mpGEMM) constitutes the bulk of inference time in ternary LLMs, <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> incorporates a novel mpGEMM library to facilitate sub-2-bits-per-weight, efficient and lossless inference. The library features two core solutions: Ternary Lookup Table (TL), which addresses spatial inefficiencies of previous bit-wise methods, and Int2 with a Scale (I2_S), which ensures lossless edge inference, both enabling high-speed inference. Our experiments show that <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> achieves up to a 6.25x increase in speed over full-precision baselines and up to 2.32x over low-bit baselines, setting new benchmarks in the field. Additionally, we expand TL to element-wise lookup table (ELUT) for low-bit LLMs in the appendix, presenting both theoretical and empirical evidence of its considerable potential. <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> is publicly available at <a href="https://github.com/microsoft/BitNet/tree/paper" rel="external noopener nofollow" class="link-external link-https">this https URL</a> , offering a sophisticated solution for the efficient and practical deployment of edge LLMs. 
</p> </div> </dd> <dt> <a name='item264'>[264]</a> <a href ="/abs/2502.11881" title="Abstract" id="2502.11881"> arXiv:2502.11881 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11881" title="Download PDF" id="pdf-2502.11881" aria-labelledby="pdf-2502.11881">pdf</a>, <a href="https://arxiv.org/html/2502.11881v1" title="View HTML" id="html-2502.11881" aria-labelledby="html-2502.11881" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11881" title="Other formats" id="oth-2502.11881" aria-labelledby="oth-2502.11881">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hypothesis-Driven Theory-of-Mind Reasoning for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyunwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sclar,+M">Melanie Sclar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhi-Xuan,+T">Tan Zhi-Xuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+L">Lance Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sydney Levine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tenenbaum,+J+B">Joshua B. Tenenbaum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+Y">Yejin Choi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Existing LLM reasoning methods have shown impressive capabilities across various tasks, such as solving math and coding problems. However, applying these methods to scenarios without ground-truth answers or rule-based verification methods - such as tracking the mental states of an agent - remains challenging. 
Inspired by the sequential Monte Carlo algorithm, we introduce thought-tracing, an inference-time reasoning algorithm designed to trace the mental states of specific agents by generating hypotheses and weighting them based on observations without relying on ground-truth solutions to questions in datasets. Our algorithm is modeled after the Bayesian theory-of-mind framework, using LLMs to approximate probabilistic inference over agents' evolving mental states based on their perceptions and actions. We evaluate thought-tracing on diverse theory-of-mind benchmarks, demonstrating significant performance improvements compared to baseline LLMs. Our experiments also reveal interesting behaviors of the recent reasoning models - e.g., o1 and R1 - on theory-of-mind, highlighting the difference of social reasoning compared to other domains. </p> </div> </dd> <dt> <a name='item265'>[265]</a> <a href ="/abs/2502.11882" title="Abstract" id="2502.11882"> arXiv:2502.11882 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11882" title="Download PDF" id="pdf-2502.11882" aria-labelledby="pdf-2502.11882">pdf</a>, <a href="https://arxiv.org/html/2502.11882v1" title="View HTML" id="html-2502.11882" aria-labelledby="html-2502.11882" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11882" title="Other formats" id="oth-2502.11882" aria-labelledby="oth-2502.11882">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Dual Process Theory in Language Agent Framework for Real-time Simultaneous Human-AI Collaboration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xihuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chaoran Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Junru Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tingyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Lin Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xuezhi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+W">Wen Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinbing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Y">Ying Wen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Agents built on large language models (LLMs) have excelled in turn-by-turn human-AI collaboration but struggle with simultaneous tasks requiring real-time interaction. Latency issues and the challenge of inferring variable human strategies hinder their ability to make autonomous decisions without explicit instructions. Through experiments with current independent System 1 and System 2 methods, we validate the necessity of using Dual Process Theory (DPT) in real-time tasks. We propose DPT-Agent, a novel language agent framework that integrates System 1 and System 2 for efficient real-time simultaneous human-AI collaboration. DPT-Agent's System 1 uses a Finite-state Machine (FSM) and code-as-policy for fast, intuitive, and controllable decision-making. 
DPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous reflection to infer human intentions and perform reasoning-based autonomous decisions. We demonstrate the effectiveness of DPT-Agent through further experiments with rule-based agents and human collaborators, showing significant improvements over mainstream LLM-based frameworks. To the best of our knowledge, DPT-Agent is the first language agent framework that achieves successful real-time simultaneous human-AI collaboration autonomously. Code of DPT-Agent can be found in <a href="https://github.com/sjtu-marl/DPT-Agent" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item266'>[266]</a> <a href ="/abs/2502.11886" title="Abstract" id="2502.11886"> arXiv:2502.11886 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11886" title="Download PDF" id="pdf-2502.11886" aria-labelledby="pdf-2502.11886">pdf</a>, <a href="https://arxiv.org/html/2502.11886v1" title="View HTML" id="html-2502.11886" aria-labelledby="html-2502.11886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11886" title="Other formats" id="oth-2502.11886" aria-labelledby="oth-2502.11886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LIMR: Less is More for RL Scaling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xuefeng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+H">Haoyang Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Pengfei Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> In this paper, we 
ask: what truly determines the effectiveness of RL training data for enhancing language models' reasoning capabilities? While recent advances like o1, Deepseek R1, and Kimi1.5 demonstrate RL's potential, the lack of transparency about training data requirements has hindered systematic progress. Starting directly from base models without distillation, we challenge the assumption that scaling up RL training data inherently improves performance. We demonstrate that a strategically selected subset of just 1,389 samples can outperform the full 8,523-sample dataset. We introduce Learning Impact Measurement (LIM), an automated method to evaluate and prioritize training samples based on their alignment with model learning trajectories, enabling efficient resource utilization and scalable implementation. Our method achieves comparable or even superior performance using only 1,389 samples versus the full 8,523 samples dataset. Notably, while recent data-efficient approaches (e.g., LIMO and s1) show promise with 32B-scale models, we find it significantly underperforms at 7B-scale through supervised fine-tuning (SFT). In contrast, our RL-based LIMR achieves 16.7% higher accuracy on AIME24 and outperforms LIMO and s1 by 13.0% and 22.2% on MATH500. These results fundamentally reshape our understanding of RL scaling in LLMs, demonstrating that precise sample selection, rather than data scale, may be the key to unlocking enhanced reasoning capabilities. For reproducible research and future innovation, we are open-sourcing LIMR, including implementation of LIM, training and evaluation code, curated datasets, and trained models at <a href="https://github.com/GAIR-NLP/LIMR" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item267'>[267]</a> <a href ="/abs/2502.11919" title="Abstract" id="2502.11919"> arXiv:2502.11919 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11919" title="Download PDF" id="pdf-2502.11919" aria-labelledby="pdf-2502.11919">pdf</a>, <a href="https://arxiv.org/html/2502.11919v1" title="View HTML" id="html-2502.11919" aria-labelledby="html-2502.11919" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11919" title="Other formats" id="oth-2502.11919" aria-labelledby="oth-2502.11919">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Text to Trust: Empowering AI-assisted Decision Making with Adaptive LLM-powered Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuoyan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+H">Hangxiao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Z">Zhuoran Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Ziang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+M">Ming Yin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CHI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> AI-assisted decision making becomes increasingly prevalent, yet individuals often fail to utilize AI-based decision aids appropriately especially when the AI explanations are absent, potentially as they do not %understand reflect on AI's decision recommendations critically. 
Large language models (LLMs), with their exceptional conversational and analytical capabilities, present great opportunities to enhance AI-assisted decision making in the absence of AI explanations by providing natural-language-based analysis of AI's decision recommendation, e.g., how each feature of a decision making task might contribute to the AI recommendation. In this paper, via a randomized experiment, we first show that presenting LLM-powered analysis of each task feature, either sequentially or concurrently, does not significantly improve people's AI-assisted decision performance. To enable decision makers to better leverage LLM-powered analysis, we then propose an algorithmic framework to characterize the effects of LLM-powered analysis on human decisions and dynamically decide which analysis to present. Our evaluation with human subjects shows that this approach effectively improves decision makers' appropriate reliance on AI in AI-assisted decision making. </p> </div> </dd> <dt> <a name='item268'>[268]</a> <a href ="/abs/2502.12025" title="Abstract" id="2502.12025"> arXiv:2502.12025 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12025" title="Download PDF" id="pdf-2502.12025" aria-labelledby="pdf-2502.12025">pdf</a>, <a href="/format/2502.12025" title="Other formats" id="oth-2502.12025" aria-labelledby="oth-2502.12025">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeChain: Safety of Language Models with Long Chain-of-Thought Reasoning Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+F">Fengqing Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhangchen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuetai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niu,+L">Luyao Niu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+Z">Zhen Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+B+Y">Bill Yuchen Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Poovendran,+R">Radha Poovendran</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Emerging large reasoning models (LRMs), such as DeepSeek-R1 models, leverage long chain-of-thought (CoT) reasoning to generate structured intermediate steps, enhancing their reasoning capabilities. However, long CoT does not inherently guarantee safe outputs, potentially leading to harmful consequences such as the introduction of security vulnerabilities in code or the spread of misinformation. Current research on large language model (LLM) safety usually focuses on short-answer responses, overlooking the long CoT style outputs of LRMs. To bridge this gap, we conduct a systematic study of LRM safety. First, we investigate safety evaluators calibrated against human annotations. Using our newly developed metrics, we thoroughly assess the safety of 12 state-of-the-art LRMs on StrongReject and WildJailbreak datasets. Our results show that LRMs are not safe compared to their reasoning advance. Further, we perform a fine-grained analysis of the reasoning trace and final answer. We find that three decoding strategies-ZeroThink, LessThink, and MoreThink-can improve model safety without additional training. However, these strategies either use constrained reasoning traces or incur high inference costs. To better strengthen LRM safety, we introduce SafeChain, the first-of-its-kind safety training dataset in CoT style. 
We fine-tune two LRMs with SafeChain, showing that it not only enhances model safety but also preserves performance across 6 reasoning benchmarks. </p> </div> </dd> <dt> <a name='item269'>[269]</a> <a href ="/abs/2502.12081" title="Abstract" id="2502.12081"> arXiv:2502.12081 </a> (cross-list from cs.CV) [<a href="/pdf/2502.12081" title="Download PDF" id="pdf-2502.12081" aria-labelledby="pdf-2502.12081">pdf</a>, <a href="https://arxiv.org/html/2502.12081v1" title="View HTML" id="html-2502.12081" aria-labelledby="html-2502.12081" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12081" title="Other formats" id="oth-2502.12081" aria-labelledby="oth-2502.12081">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unhackable Temporal Rewarding for Scalable Video MLLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+E">En Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+K">Kangheng Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yana Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zining Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+H">Haoran Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zheng Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+W">Wenbing Tao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR2025. 
Project Page: <a href="https://ahnsun.github.io/UTR/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> In the pursuit of superior video-processing MLLMs, we have encountered a perplexing paradox: the "anti-scaling law", where more data and larger models lead to worse performance. This study unmasks the culprit: "temporal hacking", a phenomenon where models shortcut by fixating on select frames, missing the full video narrative. In this work, we systematically establish a comprehensive theory of temporal hacking, defining it from a reinforcement learning perspective, introducing the Temporal Perplexity (TPL) score to assess this misalignment, and proposing the Unhackable Temporal Rewarding (UTR) framework to mitigate the temporal hacking. Both theoretically and empirically, TPL proves to be a reliable indicator of temporal modeling quality, correlating strongly with frame activation patterns. Extensive experiments reveal that UTR not only counters temporal hacking but significantly elevates video comprehension capabilities. This work not only advances video-AI systems but also illuminates the critical importance of aligning proxy rewards with true objectives in MLLM development. 
</p> </div> </dd> <dt> <a name='item270'>[270]</a> <a href ="/abs/2502.12085" title="Abstract" id="2502.12085"> arXiv:2502.12085 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12085" title="Download PDF" id="pdf-2502.12085" aria-labelledby="pdf-2502.12085">pdf</a>, <a href="https://arxiv.org/html/2502.12085v1" title="View HTML" id="html-2502.12085" aria-labelledby="html-2502.12085" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12085" title="Other formats" id="oth-2502.12085" aria-labelledby="oth-2502.12085">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> APB: Accelerating Distributed Long-Context Inference by Passing Compressed Context Blocks across GPUs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yuxiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingye Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xu Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+C">Chaojun Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W">Weilin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ao,+S">Sun Ao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> While long-context inference is crucial for advancing large language 
model (LLM) applications, its prefill speed remains a significant bottleneck. Current approaches, including sequence parallelism strategies and compute reduction through approximate attention mechanisms, still fall short of delivering optimal inference efficiency. This hinders scaling the inputs to longer sequences and processing long-context queries in a timely manner. To address this, we introduce APB, an efficient long-context inference framework that leverages multi-host approximate attention to enhance prefill speed by reducing compute and enhancing parallelism simultaneously. APB introduces a communication mechanism for essential key-value pairs within a sequence parallelism framework, enabling a faster inference speed while maintaining task performance. We implement APB by incorporating a tailored FlashAttn kernel alongside optimized distribution strategies, supporting diverse models and parallelism configurations. APB achieves speedups of up to 9.2x, 4.2x, and 1.6x compared with FlashAttn, RingAttn, and StarAttn, respectively, without any observable task performance degradation. We provide the implementation and experiment code of APB in <a href="https://github.com/thunlp/APB" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item271'>[271]</a> <a href ="/abs/2502.12094" title="Abstract" id="2502.12094"> arXiv:2502.12094 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12094" title="Download PDF" id="pdf-2502.12094" aria-labelledby="pdf-2502.12094">pdf</a>, <a href="https://arxiv.org/html/2502.12094v1" title="View HTML" id="html-2502.12094" aria-labelledby="html-2502.12094" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12094" title="Other formats" id="oth-2502.12094" aria-labelledby="oth-2502.12094">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Study on Leveraging Search and Self-Feedback for Agent Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=K,+K">Karthikeyan K</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Michelle Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mansimov,+E">Elman Mansimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Margatina,+K">Katerina Margatina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pratik,+A">Anurag Pratik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bonadiman,+D">Daniele Bonadiman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sunkara,+M">Monica Sunkara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Benajiba,+Y">Yassine Benajiba</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Recent works have demonstrated that incorporating search during inference can significantly improve reasoning 
capabilities of language agents. Some approaches may make use of the ground truth or rely on model's own generated feedback. The search algorithm uses this feedback to then produce values that will update its criterion for exploring and exploiting various reasoning paths. In this study, we investigate how search and model's self-feedback can be leveraged for reasoning tasks. First, we explore differences in ground-truth feedback and self-feedback during search for math reasoning. Second, we observe limitations in applying search techniques to more complex tasks like tool-calling and design domain-specific approaches to address these gaps. Our experiments reveal challenges related to generalization when solely relying on self-feedback during search. For search to work effectively, either access to the ground-truth is needed or feedback mechanisms need to be carefully designed for the specific task. </p> </div> </dd> <dt> <a name='item272'>[272]</a> <a href ="/abs/2502.12118" title="Abstract" id="2502.12118"> arXiv:2502.12118 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12118" title="Download PDF" id="pdf-2502.12118" aria-labelledby="pdf-2502.12118">pdf</a>, <a href="https://arxiv.org/html/2502.12118v1" title="View HTML" id="html-2502.12118" aria-labelledby="html-2502.12118" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12118" title="Other formats" id="oth-2502.12118" aria-labelledby="oth-2502.12118">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling Test-Time Compute Without Verification or RL is Suboptimal </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Setlur,+A">Amrith Setlur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rajaraman,+N">Nived Rajaraman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sergey Levine</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+A">Aviral Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Despite substantial advances in scaling test-time compute, an ongoing debate in the community is how it should be scaled up to enable continued and efficient improvements with scaling. There are largely two approaches: first, distilling successful search or thinking traces; and second, using verification (e.g., 0/1 outcome rewards, reward models, or verifiers) to guide reinforcement learning (RL) and search algorithms. In this paper, we prove that finetuning LLMs with verifier-based (VB) methods based on RL or search is far superior to verifier-free (VF) approaches based on distilling or cloning search traces, given a fixed amount of compute/data budget. Further, we show that as we scale test-time compute (measured as the output token length) and training data, suboptimality of VF methods scales poorly compared to VB when the base pre-trained LLM presents a heterogeneous distribution over correct solution traces (e.g., different lengths, styles, etc.) and admits a non-sharp distribution over rewards on traces sampled from it. We formalize this condition using anti-concentration [Erd艖s, 1945]. This implies a stronger result that VB methods scale better asymptotically, with the performance gap between VB and VF methods widening as test-time budget grows. We corroborate our theory empirically on both didactic and math reasoning problems with 3/8/32B-sized pre-trained LLMs, where we find verification is crucial for scaling test-time compute. 
</p> </div> </dd> <dt> <a name='item273'>[273]</a> <a href ="/abs/2502.12119" title="Abstract" id="2502.12119"> arXiv:2502.12119 </a> (cross-list from cs.CV) [<a href="/pdf/2502.12119" title="Download PDF" id="pdf-2502.12119" aria-labelledby="pdf-2502.12119">pdf</a>, <a href="https://arxiv.org/html/2502.12119v1" title="View HTML" id="html-2502.12119" aria-labelledby="html-2502.12119" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12119" title="Other formats" id="oth-2502.12119" aria-labelledby="oth-2502.12119">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+J">Jinhe Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yifan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+D">Danqi Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xun Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hecker,+A">Artur Hecker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tresp,+V">Volker Tresp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yunpu Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Visual instruction tuning refines pre-trained Multimodal Large Language Models (MLLMs) to enhance their real-world task performance. However, the rapid expansion of visual instruction datasets introduces significant data redundancy, leading to excessive computational costs. 
Existing data selection methods predominantly rely on proxy models or loss-based metrics, both of which impose substantial computational overheads due to the necessity of model inference and backpropagation. To address this challenge, we propose PRISM, a novel training-free approach for efficient multimodal data selection. Unlike existing methods, PRISM eliminates the reliance on proxy models, warm-up pretraining, and gradient-based optimization. Instead, it leverages Pearson correlation analysis to quantify the intrinsic visual encoding properties of MLLMs, computing a task-specific correlation score to identify high-value instances. This not only enbles data-efficient selection,but maintains the original performance. Empirical evaluations across multiple MLLMs demonstrate that PRISM reduces the overall time required for visual instruction tuning and data selection to just 30% of conventional methods, while surpassing fully fine-tuned models across eight multimodal and three language understanding benchmarks, achieving a 101.7% relative improvement in final performance. 
</p> </div> </dd> <dt> <a name='item274'>[274]</a> <a href ="/abs/2502.12120" title="Abstract" id="2502.12120"> arXiv:2502.12120 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12120" title="Download PDF" id="pdf-2502.12120" aria-labelledby="pdf-2502.12120">pdf</a>, <a href="https://arxiv.org/html/2502.12120v1" title="View HTML" id="html-2502.12120" aria-labelledby="html-2502.12120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12120" title="Other formats" id="oth-2502.12120" aria-labelledby="oth-2502.12120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs on the Line: Data Determines Loss-to-Loss Scaling Laws </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mayilvahanan,+P">Prasanna Mayilvahanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wiedemer,+T">Thadd盲us Wiedemer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mallick,+S">Sayak Mallick</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bethge,+M">Matthias Bethge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brendel,+W">Wieland Brendel</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Scaling laws guide the development of large language models (LLMs) by offering estimates for the optimal balance of model size, tokens, and compute. More recently, loss-to-loss scaling laws that relate losses across pretraining datasets and downstream tasks have emerged as a powerful tool for understanding and improving LLM performance. In this work, we investigate which factors most strongly influence loss-to-loss scaling. Our experiments reveal that the pretraining data and tokenizer determine the scaling trend. 
In contrast, model size, optimization hyperparameters, and even significant architectural differences, such as between transformer-based models like Llama and state-space models like Mamba, have limited impact. Consequently, practitioners should carefully curate suitable pretraining datasets for optimal downstream performance, while architectures and other settings can be freely optimized for training efficiency. </p> </div> </dd> <dt> <a name='item275'>[275]</a> <a href ="/abs/2502.12149" title="Abstract" id="2502.12149"> arXiv:2502.12149 </a> (cross-list from cs.MA) [<a href="/pdf/2502.12149" title="Download PDF" id="pdf-2502.12149" aria-labelledby="pdf-2502.12149">pdf</a>, <a href="https://arxiv.org/html/2502.12149v1" title="View HTML" id="html-2502.12149" aria-labelledby="html-2502.12149" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12149" title="Other formats" id="oth-2502.12149" aria-labelledby="oth-2502.12149">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HARBOR: Exploring Persona Dynamics in Multi-Agent Competition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+K">Kenan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+L">Li Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fei Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Multiagent Systems (cs.MA)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> We investigate factors contributing to LLM agents' success in competitive multi-agent environments, using auctions as a testbed where agents bid to maximize profit. The agents are equipped with bidding domain knowledge, distinct personas that reflect item preferences, and a memory of auction history. 
Our work extends the classic auction scenario by creating a realistic environment where multiple agents bid on houses, weighing aspects such as size, location, and budget to secure the most desirable homes at the lowest prices. Particularly, we investigate three key questions: (a) How does a persona influence an agent's behavior in a competitive setting? (b) Can an agent effectively profile its competitors' behavior during auctions? (c) How can persona profiling be leveraged to create an advantage using strategies such as theory of mind? Through a series of experiments, we analyze the behaviors of LLM agents and shed light on new findings. Our testbed, called HARBOR, offers a valuable platform for deepening our understanding of multi-agent workflows in competitive environments. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 218 of 218 entries)</h3> <dt> <a name='item276'>[276]</a> <a href ="/abs/1708.09151" title="Abstract" id="1708.09151"> arXiv:1708.09151 </a> (replaced) [<a href="/pdf/1708.09151" title="Download PDF" id="pdf-1708.09151" aria-labelledby="pdf-1708.09151">pdf</a>, <a href="https://arxiv.org/html/1708.09151v3" title="View HTML" id="html-1708.09151" aria-labelledby="html-1708.09151" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/1708.09151" title="Other formats" id="oth-1708.09151" aria-labelledby="oth-1708.09151">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Paradigm Completion for Derivational Morphology </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cotterell,+R">Ryan Cotterell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vylomova,+E">Ekaterina Vylomova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khayrallah,+H">Huda Khayrallah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kirov,+C">Christo Kirov</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yarowsky,+D">David Yarowsky</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2017 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The generation of complex derived word forms has been an overlooked problem in NLP; we fill this gap by applying neural sequence-to-sequence models to the task. We overview the theoretical motivation for a paradigmatic treatment of derivational morphology, and introduce the task of derivational paradigm completion as a parallel to inflectional paradigm completion. State-of-the-art neural models, adapted from the inflection task, are able to learn a range of derivation patterns, and outperform a non-neural baseline by 16.4%. However, due to semantic, historical, and lexical considerations involved in derivational morphology, future work will be needed to achieve performance parity with inflection-generating systems. 
</p> </div> </dd> <dt> <a name='item277'>[277]</a> <a href ="/abs/2104.08620" title="Abstract" id="2104.08620"> arXiv:2104.08620 </a> (replaced) [<a href="/pdf/2104.08620" title="Download PDF" id="pdf-2104.08620" aria-labelledby="pdf-2104.08620">pdf</a>, <a href="https://arxiv.org/html/2104.08620v4" title="View HTML" id="html-2104.08620" aria-labelledby="html-2104.08620" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2104.08620" title="Other formats" id="oth-2104.08620" aria-labelledby="oth-2104.08620">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decrypting Cryptic Crosswords: Semantically Complex Wordplay Puzzles as a Target for NLP </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rozner,+J">Josh Rozner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Potts,+C">Christopher Potts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahowald,+K">Kyle Mahowald</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Cryptic crosswords, the dominant crossword variety in the UK, are a promising target for advancing NLP systems that seek to process semantically complex, highly compositional language. Cryptic clues read like fluent natural language but are adversarially composed of two parts: a definition and a wordplay cipher requiring character-level manipulations. Expert humans use creative intelligence to solve cryptics, flexibly combining linguistic, world, and domain knowledge. In this paper, we make two main contributions. First, we present a dataset of cryptic clues as a challenging new benchmark for NLP systems that seek to process compositional language in more creative, human-like ways. 
After showing that three non-neural approaches and T5, a state-of-the-art neural language model, do not achieve good performance, we make our second main contribution: a novel curriculum approach, in which the model is first fine-tuned on related tasks such as unscrambling <a href="http://words.We" rel="external noopener nofollow" class="link-external link-http">this http URL</a> also introduce a challenging data split, examine the meta-linguistic capabilities of subword-tokenized models, and investigate model systematicity by perturbing the wordplay part of clues, showing that T5 exhibits behavior partially consistent with human solving strategies. Although our curricular approach considerably improves on the T5 baseline, our best-performing model still fails to generalize to the extent that humans can. Thus, cryptic crosswords remain an unsolved challenge for NLP systems and a potential source of future innovation. </p> </div> </dd> <dt> <a name='item278'>[278]</a> <a href ="/abs/2309.06089" title="Abstract" id="2309.06089"> arXiv:2309.06089 </a> (replaced) [<a href="/pdf/2309.06089" title="Download PDF" id="pdf-2309.06089" aria-labelledby="pdf-2309.06089">pdf</a>, <a href="https://arxiv.org/html/2309.06089v3" title="View HTML" id="html-2309.06089" aria-labelledby="html-2309.06089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2309.06089" title="Other formats" id="oth-2309.06089" aria-labelledby="oth-2309.06089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms: Exploring Tuning Strategies </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koloski,+B">Boshko Koloski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=%C5%A0krlj,+B">Bla啪 艩krlj</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Robnik-%C5%A0ikonja,+M">Marko 
Robnik-Šikonja</a>
</p> </div> </dd> <dt> <a name='item279'>[279]</a> <a href ="/abs/2310.00833" title="Abstract" id="2310.00833"> arXiv:2310.00833 </a> (replaced) [<a href="/pdf/2310.00833" title="Download PDF" id="pdf-2310.00833" aria-labelledby="pdf-2310.00833">pdf</a>, <a href="https://arxiv.org/html/2310.00833v2" title="View HTML" id="html-2310.00833" aria-labelledby="html-2310.00833" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.00833" title="Other formats" id="oth-2310.00833" aria-labelledby="oth-2310.00833">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Necessary and Sufficient Watermark for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Takezawa,+Y">Yuki Takezawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sato,+R">Ryoma Sato</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+H">Han Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niwa,+K">Kenta Niwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yamada,+M">Makoto Yamada</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> TMLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> In recent years, large language models (LLMs) have achieved remarkable performances in various NLP tasks. They can generate texts that are indistinguishable from those written by humans. Such remarkable performance of LLMs increases their risk of being used for malicious purposes, such as generating fake news articles. Therefore, it is necessary to develop methods for distinguishing texts written by LLMs from those written by humans. Watermarking is one of the most powerful methods for achieving this. 
Although existing watermarking methods have successfully detected texts generated by LLMs, they significantly degrade the quality of the generated texts. In this study, we propose the Necessary and Sufficient Watermark (NS-Watermark) for inserting watermarks into generated texts without degrading the text quality. More specifically, we derive minimum constraints required to be imposed on the generated texts to distinguish whether LLMs or humans write the texts. Then, we formulate the NS-Watermark as a constrained optimization problem and propose an efficient algorithm to solve it. Through the experiments, we demonstrate that the NS-Watermark can generate more natural texts than existing watermarking methods and distinguish more accurately between texts written by LLMs and those written by humans. Especially in machine translation tasks, the NS-Watermark can outperform the existing watermarking method by up to 30 BLEU scores. </p> </div> </dd> <dt> <a name='item280'>[280]</a> <a href ="/abs/2311.09730" title="Abstract" id="2311.09730"> arXiv:2311.09730 </a> (replaced) [<a href="/pdf/2311.09730" title="Download PDF" id="pdf-2311.09730" aria-labelledby="pdf-2311.09730">pdf</a>, <a href="https://arxiv.org/html/2311.09730v2" title="View HTML" id="html-2311.09730" aria-labelledby="html-2311.09730" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.09730" title="Other formats" id="oth-2311.09730" aria-labelledby="oth-2311.09730">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sociodemographic Prompting is Not Yet an Effective Approach for Simulating Subjective Judgments with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Huaman Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pei,+J">Jiaxin Pei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+M">Minje Choi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Jurgens,+D">David Jurgens</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG) </div> <p class='mathjax'> Human judgments are inherently subjective and are actively affected by personal traits such as gender and ethnicity. While Large Language Models (LLMs) are widely used to simulate human responses across diverse contexts, their ability to account for demographic differences in subjective tasks remains uncertain. In this study, leveraging the POPQUORN dataset, we evaluate nine popular LLMs on their ability to understand demographic differences in two subjective judgment tasks: politeness and offensiveness. We find that in zero-shot settings, most models' predictions for both tasks align more closely with labels from White participants than those from Asian or Black participants, while only a minor gender bias favoring women appears in the politeness task. Furthermore, sociodemographic prompting does not consistently improve and, in some cases, worsens LLMs' ability to perceive language from specific sub-populations. These findings highlight potential demographic biases in LLMs when performing subjective judgment tasks and underscore the limitations of sociodemographic prompting as a strategy to achieve pluralistic alignment. Code and data are available at: <a href="https://github.com/Jiaxin-Pei/LLM-as-Subjective-Judge" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item281'>[281]</a> <a href ="/abs/2401.17809" title="Abstract" id="2401.17809"> arXiv:2401.17809 </a> (replaced) [<a href="/pdf/2401.17809" title="Download PDF" id="pdf-2401.17809" aria-labelledby="pdf-2401.17809">pdf</a>, <a href="https://arxiv.org/html/2401.17809v4" title="View HTML" id="html-2401.17809" aria-labelledby="html-2401.17809" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2401.17809" title="Other formats" id="oth-2401.17809" aria-labelledby="oth-2401.17809">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SWEA: Updating Factual Knowledge in Large Language Models via Subject Word Embedding Altering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaopeng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shasha Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+S">Shezheng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huijun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+B">Bin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jie Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaodong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weimin Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> AAAI25. 
Our code is available at <a href="https://github.com/xpq-tech/SWEA" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The general capabilities of large language models (LLMs) make them the infrastructure for various AI applications, but updating their inner knowledge requires significant resources. Recent model editing is a promising technique for efficiently updating a small amount of knowledge of LLMs and has attracted much attention. In particular, local editing methods, which directly update model parameters, are proven suitable for updating small amounts of knowledge. Local editing methods update weights by computing least squares closed-form solutions and identify edited knowledge by vector-level matching in inference, which achieve promising results. However, these methods still require a lot of time and resources to complete the computation. Moreover, vector-level matching lacks reliability, and such updates disrupt the original organization of the model's parameters. To address these issues, we propose a detachable and expandable Subject Word Embedding Altering (SWEA) framework, which finds the editing embeddings through token-level matching and adds them to the subject word embeddings in Transformer input. To get these editing embeddings, we propose optimizing then suppressing fusion method, which first optimizes learnable embedding vectors for the editing target and then suppresses the Knowledge Embedding Dimensions (KEDs) to obtain final editing embeddings. We thus propose SWEA$\oplus$OS method for editing factual knowledge in LLMs. We demonstrate the overall state-of-the-art (SOTA) performance of SWEA$\oplus$OS on the CounterFact and zsRE datasets. 
To further validate the reasoning ability of SWEA$\oplus$OS in editing knowledge, we evaluate it on the more complex RippleEdits benchmark. The results demonstrate that SWEA$\oplus$OS possesses SOTA reasoning ability. </p> </div> </dd> <dt> <a name='item282'>[282]</a> <a href ="/abs/2402.01737" title="Abstract" id="2402.01737"> arXiv:2402.01737 </a> (replaced) [<a href="/pdf/2402.01737" title="Download PDF" id="pdf-2402.01737" aria-labelledby="pdf-2402.01737">pdf</a>, <a href="https://arxiv.org/html/2402.01737v3" title="View HTML" id="html-2402.01737" aria-labelledby="html-2402.01737" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.01737" title="Other formats" id="oth-2402.01737" aria-labelledby="oth-2402.01737">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Assistive Large Language Model Agents for Socially-Aware Negotiation Dialogues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages, 3 figures, 14 tables; The paper has been published in the Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We develop assistive agents based on Large Language Models (LLMs) that aid interlocutors in business negotiations. 
Specifically, we simulate business negotiations by letting two LLM-based agents engage in role play. A third LLM acts as a remediator agent to rewrite utterances violating norms for improving negotiation outcomes. We introduce a simple tuning-free and label-free In-Context Learning (ICL) method to identify high-quality ICL exemplars for the remediator, where we propose a novel select criteria, called value impact, to measure the quality of the negotiation outcomes. We provide rich empirical evidence to demonstrate its effectiveness in negotiations across three different negotiation topics. We have released our source code and the generated dataset at: <a href="https://github.com/tk1363704/SADAS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item283'>[283]</a> <a href ="/abs/2402.02243" title="Abstract" id="2402.02243"> arXiv:2402.02243 </a> (replaced) [<a href="/pdf/2402.02243" title="Download PDF" id="pdf-2402.02243" aria-labelledby="pdf-2402.02243">pdf</a>, <a href="/format/2402.02243" title="Other formats" id="oth-2402.02243" aria-labelledby="oth-2402.02243">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Writ Large: LLMs, ChatGPT, Grounding, Meaning and Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Harnad,+S">Stevan Harnad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 54 pages, 29 references </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Frontiers in Artificial Intelligence 7: 1490698 (2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Apart from what (little) OpenAI may be concealing from us, we all know (roughly) how 
ChatGPT works (its huge text database, its statistics, its vector representations, and their huge number of parameters, its next-word training, and so on). But none of us can say (hand on heart) that we are not surprised by what ChatGPT has proved to be able to do with these resources. This has even driven some of us to conclude that ChatGPT actually understands. It is not true that it understands. But it is also not true that we understand how it can do what it can do. I will suggest some hunches about benign biases: convergent constraints that emerge at LLM scale that may be helping ChatGPT do so much better than we would have expected. These biases are inherent in the nature of language itself, at LLM scale, and they are closely linked to what it is that ChatGPT lacks, which is direct sensorimotor grounding to connect its words to their referents and its propositions to their meanings. These convergent biases are related to (1) the parasitism of indirect verbal grounding on direct sensorimotor grounding, (2) the circularity of verbal definition, (3) the mirroring of language production and comprehension, (4) iconicity in propositions at LLM scale, (5) computational counterparts of human categorical perception in category learning by neural nets, and perhaps also (6) a conjecture by Chomsky about the laws of thought. The exposition will be in the form of a dialogue with ChatGPT-4. 
</p> </div> </dd> <dt> <a name='item284'>[284]</a> <a href ="/abs/2402.11068" title="Abstract" id="2402.11068"> arXiv:2402.11068 </a> (replaced) [<a href="/pdf/2402.11068" title="Download PDF" id="pdf-2402.11068" aria-labelledby="pdf-2402.11068">pdf</a>, <a href="https://arxiv.org/html/2402.11068v2" title="View HTML" id="html-2402.11068" aria-labelledby="html-2402.11068" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.11068" title="Other formats" id="oth-2402.11068" aria-labelledby="oth-2402.11068">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models for Causal Discovery: Current Landscape and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+G">Guangya Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yunsheng Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuqi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Mengxuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Sheng Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Causal discovery (CD) and Large Language Models (LLMs) have emerged as transformative fields in artificial intelligence that have evolved largely independently. While CD specializes in uncovering cause-effect relationships from data, and LLMs excel at natural language processing and generation, their integration presents unique opportunities for advancing causal understanding. This survey examines how LLMs are transforming CD across three key dimensions: direct causal extraction from text, integration of domain knowledge into statistical methods, and refinement of causal structures. 
We systematically analyze approaches that leverage LLMs for CD tasks, highlighting their innovative use of metadata and natural language for causal inference. Our analysis reveals both LLMs' potential to enhance traditional CD methods and their current limitations as imperfect expert systems. We identify key research gaps, outline evaluation frameworks and benchmarks for LLM-based causal discovery, and advocate future research efforts for leveraging LLMs in causality research. As the first comprehensive examination of the synergy between LLMs and CD, this work lays the groundwork for future advances in the field. </p> </div> </dd> <dt> <a name='item285'>[285]</a> <a href ="/abs/2402.12649" title="Abstract" id="2402.12649"> arXiv:2402.12649 </a> (replaced) [<a href="/pdf/2402.12649" title="Download PDF" id="pdf-2402.12649" aria-labelledby="pdf-2402.12649">pdf</a>, <a href="https://arxiv.org/html/2402.12649v2" title="View HTML" id="html-2402.12649" aria-labelledby="html-2402.12649" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.12649" title="Other formats" id="oth-2402.12649" aria-labelledby="oth-2402.12649">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bias in Language Models: Beyond Trick Tests and Toward RUTEd Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lum,+K">Kristian Lum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anthis,+J+R">Jacy Reese Anthis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Robinson,+K">Kevin Robinson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nagpal,+C">Chirag Nagpal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=D'Amour,+A">Alexander D'Amour</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Applications 
(stat.AP) </div> <p class='mathjax'> Standard benchmarks of bias and fairness in large language models (LLMs) measure the association between social attributes implied in user prompts and short LLM responses. In the commonly studied domain of gender-occupation bias, we test whether these benchmarks are robust to lengthening the LLM responses as a measure of Realistic Use and Tangible Effects (i.e., RUTEd evaluations). From the current literature, we adapt three standard bias metrics (neutrality, skew, and stereotype), and we develop analogous RUTEd evaluations from three contexts of real-world use: children's bedtime stories, user personas, and English language learning exercises. We find that standard bias metrics have no significant correlation with the more realistic bias metrics. For example, selecting the least biased model based on the standard "trick tests" coincides with selecting the least biased model as measured in more realistic use no more than random chance. We suggest that there is not yet evidence to justify standard benchmarks as reliable proxies of real-world biases, and we encourage further development of context-specific RUTEd evaluations. 
</p> </div> </dd> <dt> <a name='item286'>[286]</a> <a href ="/abs/2403.08211" title="Abstract" id="2403.08211"> arXiv:2403.08211 </a> (replaced) [<a href="/pdf/2403.08211" title="Download PDF" id="pdf-2403.08211" aria-labelledby="pdf-2403.08211">pdf</a>, <a href="https://arxiv.org/html/2403.08211v3" title="View HTML" id="html-2403.08211" aria-labelledby="html-2403.08211" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.08211" title="Other formats" id="oth-2403.08211" aria-labelledby="oth-2403.08211">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models are Contrastive Reasoners </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+L">Liang Yao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Prompting methods play a crucial role in enhancing the capabilities of pre-trained large language models (LLMs). We explore how contrastive prompting (CP) significantly improves the ability of large language models to perform complex reasoning. We demonstrate that LLMs are decent contrastive reasoners by simply adding "Let's give a correct and a wrong answer." before LLMs provide answers. Experiments on various large language models show that zero-shot contrastive prompting improves the performance of standard zero-shot prompting on a range of arithmetic, commonsense, and symbolic reasoning tasks without any hand-crafted few-shot examples, such as increasing the accuracy on GSM8K from 35.9% to 88.8% and AQUA-RAT from 41.3% to 62.2% with the state-of-the-art GPT-4 model. 
Our method not only surpasses zero-shot CoT and few-shot CoT in most arithmetic and commonsense reasoning tasks but also can seamlessly integrate with existing prompting methods, resulting in improved or comparable results when compared to state-of-the-art methods. Our code is available at <a href="https://github.com/yao8839836/cp" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item287'>[287]</a> <a href ="/abs/2403.09606" title="Abstract" id="2403.09606"> arXiv:2403.09606 </a> (replaced) [<a href="/pdf/2403.09606" title="Download PDF" id="pdf-2403.09606" aria-labelledby="pdf-2403.09606">pdf</a>, <a href="https://arxiv.org/html/2403.09606v2" title="View HTML" id="html-2403.09606" aria-labelledby="html-2403.09606" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.09606" title="Other formats" id="oth-2403.09606" aria-labelledby="oth-2403.09606">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models and Causal Inference in Collaboration: A Comprehensive Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Paiheng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Junda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiaxin Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yifan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuhang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fuxiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+T">Tianrui Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoliang Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+W">Wei Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Furong Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Findings of the Association for Computational Linguistics: NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Causal inference has shown potential in enhancing the predictive accuracy, fairness, robustness, and explainability of Natural Language Processing (NLP) models by capturing causal relationships among variables. The emergence of generative Large Language Models (LLMs) has significantly impacted various NLP domains, particularly through their advanced reasoning capabilities. This survey focuses on evaluating and improving LLMs from a causal view in the following areas: understanding and improving the LLMs' reasoning capacity, addressing fairness and safety issues in LLMs, complementing LLMs with explanations, and handling multimodality. Meanwhile, LLMs' strong reasoning capacities can in turn contribute to the field of causal inference by aiding causal relationship discovery and causal effect estimations. This review explores the interplay between causal inference frameworks and LLMs from both perspectives, emphasizing their collective potential to further the development of more advanced and equitable artificial intelligence systems. 
</p> </div> </dd> <dt> <a name='item288'>[288]</a> <a href ="/abs/2403.17706" title="Abstract" id="2403.17706"> arXiv:2403.17706 </a> (replaced) [<a href="/pdf/2403.17706" title="Download PDF" id="pdf-2403.17706" aria-labelledby="pdf-2403.17706">pdf</a>, <a href="https://arxiv.org/html/2403.17706v2" title="View HTML" id="html-2403.17706" aria-labelledby="html-2403.17706" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.17706" title="Other formats" id="oth-2403.17706" aria-labelledby="oth-2403.17706">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Large Language Model Guided Topic Refinement Mechanism for Short Text Modeling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+S">Shuyu Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Rui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+P">Peng Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Q">Qi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haiping Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Extended version of paper accepted at DASFAA 2025 (16 pages, 6 figures) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Modeling topics effectively in short texts, such as tweets and news snippets, is crucial to capturing rapidly evolving social trends. Existing topic models often struggle to accurately capture the underlying semantic patterns of short texts, primarily due to the sparse nature of such data. This nature of texts leads to an unavoidable lack of co-occurrence information, which hinders the coherence and granularity of mined topics. 
This paper introduces a novel model-agnostic mechanism, termed Topic Refinement, which leverages the advanced text comprehension capabilities of Large Language Models (LLMs) for short-text topic modeling. Unlike traditional methods, this post-processing mechanism enhances the quality of topics extracted by various topic modeling methods through prompt engineering. We guide LLMs in identifying semantically intruder words within the extracted topics and suggesting coherent alternatives to replace these words. This process mimics human-like identification, evaluation, and refinement of the extracted topics. Extensive experiments on four diverse datasets demonstrate that Topic Refinement boosts the topic quality and improves the performance in topic-related text classification tasks. </p> </div> </dd> <dt> <a name='item289'>[289]</a> <a href ="/abs/2403.19318" title="Abstract" id="2403.19318"> arXiv:2403.19318 </a> (replaced) [<a href="/pdf/2403.19318" title="Download PDF" id="pdf-2403.19318" aria-labelledby="pdf-2403.19318">pdf</a>, <a href="https://arxiv.org/html/2403.19318v3" title="View HTML" id="html-2403.19318" aria-labelledby="html-2403.19318" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.19318" title="Other formats" id="oth-2403.19318" aria-labelledby="oth-2403.19318">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TableLLM: Enabling Tabular Data Manipulation by LLMs in Real Office Usage Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaokang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+S">Sijia Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Bohan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zeyao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jing Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guanlin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Z">Zijun Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kangli Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jinchang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang-Li,+D">Daniel Zhang-Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jifan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Juanzi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jie Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://tablellm.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We introduce TableLLM, a robust large language model (LLM) with 8 billion parameters, purpose-built for proficiently handling tabular data manipulation tasks, whether they are embedded within documents or spreadsheets, catering to real-world office scenarios. We propose a distant supervision method for training, which comprises a reasoning process extension strategy, aiding in training LLMs to understand reasoning patterns more effectively as well as a cross-way validation strategy, ensuring the quality of the automatically generated data. To evaluate the performance of TableLLM, we have crafted benchmarks tailored to address both document and spreadsheet formats as well as constructed a well-organized evaluation pipeline capable of handling both scenarios. 
Thorough evaluations underscore the advantages of TableLLM when compared to various existing general-purpose and tabular data-focused LLMs. We have publicly released the model checkpoint, source code, benchmarks, and a web application for user interaction. Our codes and data are publicly available at <a href="https://github.com/TableLLM/TableLLM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item290'>[290]</a> <a href ="/abs/2404.02115" title="Abstract" id="2404.02115"> arXiv:2404.02115 </a> (replaced) [<a href="/pdf/2404.02115" title="Download PDF" id="pdf-2404.02115" aria-labelledby="pdf-2404.02115">pdf</a>, <a href="https://arxiv.org/html/2404.02115v3" title="View HTML" id="html-2404.02115" aria-labelledby="html-2404.02115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.02115" title="Other formats" id="oth-2404.02115" aria-labelledby="oth-2404.02115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GINopic: Topic Modeling with Graph Isomorphism Network </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adhya,+S">Suman Adhya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sanyal,+D+K">Debarshi Kumar Sanyal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted as a long paper for NAACL 2024 main conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Topic modeling is a widely used approach for analyzing and exploring large document collections. Recent research efforts have incorporated pre-trained contextualized language models, such as BERT embeddings, into topic modeling. 
However, they often neglect the intrinsic informational value conveyed by mutual dependencies between words. In this study, we introduce GINopic, a topic modeling framework based on graph isomorphism networks to capture the correlation between words. By conducting intrinsic (quantitative as well as qualitative) and extrinsic evaluations on diverse benchmark datasets, we demonstrate the effectiveness of GINopic compared to existing topic models and highlight its potential for advancing topic modeling. </p> </div> </dd> <dt> <a name='item291'>[291]</a> <a href ="/abs/2404.05966" title="Abstract" id="2404.05966"> arXiv:2404.05966 </a> (replaced) [<a href="/pdf/2404.05966" title="Download PDF" id="pdf-2404.05966" aria-labelledby="pdf-2404.05966">pdf</a>, <a href="https://arxiv.org/html/2404.05966v2" title="View HTML" id="html-2404.05966" aria-labelledby="html-2404.05966" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.05966" title="Other formats" id="oth-2404.05966" aria-labelledby="oth-2404.05966">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> THOUGHTSCULPT: Reasoning with Intermediate Revision and Search </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chi,+Y">Yizhou Chi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+K">Kevin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klein,+D">Dan Klein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Findings. 
Code and data available at <a href="https://github.com/cyzus/thoughtsculpt" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We present THOUGHTSCULPT, a general reasoning and search method for tasks with outputs that can be decomposed into components. THOUGHTSCULPT explores a search tree of potential solutions using Monte Carlo Tree Search (MCTS), building solutions one action at a time and evaluating according to any domain-specific heuristic, which in practice is often simply an LLM evaluator. Critically, our action space includes revision actions: THOUGHTSCULPT may choose to revise part of its previous output rather than continuing to build the rest of its output. Empirically, THOUGHTSCULPT outperforms state-of-the-art reasoning methods across three challenging tasks: Story Outline Improvement (up to +30% interestingness), Mini-Crosswords Solving (up to +16% word success rate), and Constrained Generation (up to +10% concept coverage). 
</p> </div> </dd> <dt> <a name='item292'>[292]</a> <a href ="/abs/2404.09077" title="Abstract" id="2404.09077"> arXiv:2404.09077 </a> (replaced) [<a href="/pdf/2404.09077" title="Download PDF" id="pdf-2404.09077" aria-labelledby="pdf-2404.09077">pdf</a>, <a href="https://arxiv.org/html/2404.09077v2" title="View HTML" id="html-2404.09077" aria-labelledby="html-2404.09077" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.09077" title="Other formats" id="oth-2404.09077" aria-labelledby="oth-2404.09077">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CuriousLLM: Elevating Multi-Document Question Answering with LLM-Enhanced Knowledge Graph Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zukang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zixuan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xuan Zhu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have achieved significant success in open-domain question answering. However, they continue to face challenges such as hallucinations and knowledge cutoffs. These issues can be mitigated through in-context learning by providing LLMs with relevant context before generating answers. Recent literature proposes Knowledge Graph Prompting (KGP) which integrates knowledge graphs with an LLM-based traversal agent to substantially enhance document retrieval quality. However, KGP requires costly fine-tuning with large datasets and remains prone to hallucination. 
In this paper, we propose CuriousLLM, an enhancement that integrates a curiosity-driven reasoning mechanism into an LLM agent. This mechanism enables the agent to generate relevant follow-up questions, thereby guiding the information retrieval process more efficiently. Central to our approach is the development of the new Follow-upQA dataset, which includes questions and supporting evidence as input, with follow-up questions serving as ground truths. These follow-up questions either inquire about what is still missing to fully answer the user's query or use special tokens to signify that the retrieved evidence is sufficient. Our experiments show that CuriousLLM significantly boosts LLM performance in multi-document question answering (MD-QA), circumventing the substantial computational costs and latency from the original KGP framework. </p> </div> </dd> <dt> <a name='item293'>[293]</a> <a href ="/abs/2405.01474" title="Abstract" id="2405.01474"> arXiv:2405.01474 </a> (replaced) [<a href="/pdf/2405.01474" title="Download PDF" id="pdf-2405.01474" aria-labelledby="pdf-2405.01474">pdf</a>, <a href="https://arxiv.org/html/2405.01474v3" title="View HTML" id="html-2405.01474" aria-labelledby="html-2405.01474" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.01474" title="Other formats" id="oth-2405.01474" aria-labelledby="oth-2405.01474">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding Figurative Meaning through Explainable Visual Entailment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saakyan,+A">Arkadiy Saakyan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulkarni,+S">Shreyas Kulkarni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chakrabarty,+T">Tuhin Chakrabarty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muresan,+S">Smaranda Muresan</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Vision-Language Models (VLMs) have demonstrated strong capabilities in tasks requiring a fine-grained understanding of literal meaning in images and text, such as visual question-answering or visual entailment. However, there has been little exploration of the capabilities of these models when presented with images and captions containing figurative meaning, such as metaphors or humor. To close this gap, we propose a new task framing the figurative meaning understanding problem as an explainable visual entailment task, where the model has to predict whether the image (premise) entails a caption (hypothesis) and justify the predicted label with a textual explanation. The figurative phenomena can be present in the image, in the caption, or both. Using a human-AI collaboration approach, we build the accompanying expert-verified dataset V-FLUTE, containing 6,027 {image, caption, label, explanation} instances spanning five diverse figurative phenomena: metaphors, similes, idioms, sarcasm, and humor. Through automatic evaluation, we find that VLMs struggle to generalize from literal to figurative meaning, particularly when it is present in images. Further, we identify common types of errors in VLM reasoning (hallucination and incomplete or unsound reasoning) across classes of models via human evaluation. 
</p> </div> </dd> <dt> <a name='item294'>[294]</a> <a href ="/abs/2405.02079" title="Abstract" id="2405.02079"> arXiv:2405.02079 </a> (replaced) [<a href="/pdf/2405.02079" title="Download PDF" id="pdf-2405.02079" aria-labelledby="pdf-2405.02079">pdf</a>, <a href="https://arxiv.org/html/2405.02079v2" title="View HTML" id="html-2405.02079" aria-labelledby="html-2405.02079" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.02079" title="Other formats" id="oth-2405.02079" aria-labelledby="oth-2405.02079">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Argumentative Large Language Models for Explainable and Contestable Decision-Making </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Freedman,+G">Gabriel Freedman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dejl,+A">Adam Dejl</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gorur,+D">Deniz Gorur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+X">Xiang Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rago,+A">Antonio Rago</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toni,+F">Francesca Toni</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 18 figures, Accepted to AAAI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The profusion of knowledge encoded in large language models (LLMs) and their ability to apply this knowledge zero-shot in a range of settings makes them promising candidates for use in decision-making. However, they are currently limited by their inability to provide outputs which can be faithfully explained and effectively contested to correct mistakes. 
In this paper, we attempt to reconcile these strengths and weaknesses by introducing \emph{argumentative LLMs (ArgLLMs)}, a method for augmenting LLMs with argumentative reasoning. Concretely, ArgLLMs construct argumentation frameworks, which then serve as the basis for formal reasoning in support of decision-making. The interpretable nature of these argumentation frameworks and formal reasoning means that any decision made by ArgLLMs may be explained and contested. We evaluate ArgLLMs' performance experimentally in comparison with state-of-the-art techniques, in the context of the decision-making task of claim verification. We also define novel properties to characterise contestability and assess ArgLLMs formally in terms of these properties. </p> </div> </dd> <dt> <a name='item295'>[295]</a> <a href ="/abs/2405.05345" title="Abstract" id="2405.05345"> arXiv:2405.05345 </a> (replaced) [<a href="/pdf/2405.05345" title="Download PDF" id="pdf-2405.05345" aria-labelledby="pdf-2405.05345">pdf</a>, <a href="https://arxiv.org/html/2405.05345v2" title="View HTML" id="html-2405.05345" aria-labelledby="html-2405.05345" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.05345" title="Other formats" id="oth-2405.05345" aria-labelledby="oth-2405.05345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QuaLLM: An LLM-based Framework to Extract Quantitative Insights from Online Forums </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rao,+V+N">Varun Nagaraj Rao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+E">Eesha Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dalal,+S">Samantha Dalal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Calacci,+D">Dan Calacci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Monroy-Hern%C3%A1ndez,+A">Andrés Monroy-Hernández</a></div> 
<div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL Findings (2025), cite appropriately. Preliminary version presented at CHI LLM as Research Tools Workshop (2024) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Online discussion forums provide crucial data to understand the concerns of a wide range of real-world communities. However, the typical qualitative and quantitative methodologies used to analyze those data, such as thematic analysis and topic modeling, are infeasible to scale or require significant human effort to translate outputs to human readable forms. This study introduces QuaLLM, a novel LLM-based framework to analyze and extract quantitative insights from text data on online forums. The framework consists of a novel prompting and human evaluation methodology. We applied this framework to analyze over one million comments from two of Reddit's rideshare worker communities, marking the largest study of its type. We uncover significant worker concerns regarding AI and algorithmic platform decisions, responding to regulatory calls about worker insights. In short, our work sets a new precedent for AI-assisted quantitative data analysis to surface concerns from online forums. 
</p> </div> </dd> <dt> <a name='item296'>[296]</a> <a href ="/abs/2405.14075" title="Abstract" id="2405.14075"> arXiv:2405.14075 </a> (replaced) [<a href="/pdf/2405.14075" title="Download PDF" id="pdf-2405.14075" aria-labelledby="pdf-2405.14075">pdf</a>, <a href="https://arxiv.org/html/2405.14075v2" title="View HTML" id="html-2405.14075" aria-labelledby="html-2405.14075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.14075" title="Other formats" id="oth-2405.14075" aria-labelledby="oth-2405.14075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> $T^2$ of Thoughts: Temperature Tree Elicits Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+C">Chengkun Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yucheng Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haoliang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lei Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have emerged as powerful tools in artificial intelligence, especially in complex decision-making scenarios, but their static problem-solving strategies often limit their adaptability to dynamic environments. We explore the enhancement of reasoning capabilities in LLMs through Temperature Tree ($T^2$) prompting via a heuristic algorithm, termed as $T^2$ of Thoughts ($T^2oT$). 
The primary focus is on enhancing decision-making processes by dynamically adjusting search parameters, especially temperature, to improve accuracy without increasing computational demands. We empirically validate that our hybrid $T^2oT$ approach yields enhancements in single-solution accuracy, multi-solution generation and text generation quality. Our findings suggest that while dynamic search depth adjustments based on temperature can yield mixed results, a fixed search depth, when coupled with adaptive capabilities of $T^2oT$, provides a more reliable and versatile problem-solving strategy. This work highlights the potential for future explorations in optimizing algorithmic interactions with foundational language models, particularly illustrated by our development for the Game of 24 and Creative Writing tasks. </p> </div> </dd> <dt> <a name='item297'>[297]</a> <a href ="/abs/2405.16720" title="Abstract" id="2405.16720"> arXiv:2405.16720 </a> (replaced) [<a href="/pdf/2405.16720" title="Download PDF" id="pdf-2405.16720" aria-labelledby="pdf-2405.16720">pdf</a>, <a href="https://arxiv.org/html/2405.16720v3" title="View HTML" id="html-2405.16720" aria-labelledby="html-2405.16720" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.16720" title="Other formats" id="oth-2405.16720" aria-labelledby="oth-2405.16720">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Scale Knowledge Washing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+R">Ruihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zexue He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models show impressive abilities in memorizing world knowledge, which leads to concerns regarding memorization of private information, toxic or sensitive knowledge, and copyrighted content. We introduce the problem of Large Scale Knowledge Washing, focusing on unlearning an extensive amount of factual knowledge. Previous unlearning methods usually define the reverse loss and update the model via backpropagation, which may affect the model's fluency and reasoning ability or even destroy the model due to extensive training with the reverse loss. Existing works introduce additional data from downstream tasks to prevent the model from losing capabilities, which requires downstream task awareness. Controlling the tradeoff of unlearning and maintaining existing capabilities is also challenging. To this end, we propose LAW (Large Scale Washing) to update the MLP layers in decoder-only large language models to perform knowledge washing, as inspired by model editing methods and based on the hypothesis that knowledge and reasoning are disentanglable. We derive a new objective with the knowledge to be unlearned to update the weights of certain MLP layers. Experimental results demonstrate the effectiveness of LAW in forgetting target knowledge while maintaining reasoning ability. The code will be open-sourced at <a href="https://github.com/wangyu-ustc/LargeScaleWashing" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item298'>[298]</a> <a href ="/abs/2405.19799" title="Abstract" id="2405.19799"> arXiv:2405.19799 </a> (replaced) [<a href="/pdf/2405.19799" title="Download PDF" id="pdf-2405.19799" aria-labelledby="pdf-2405.19799">pdf</a>, <a href="https://arxiv.org/html/2405.19799v3" title="View HTML" id="html-2405.19799" aria-labelledby="html-2405.19799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.19799" title="Other formats" id="oth-2405.19799" aria-labelledby="oth-2405.19799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unsupervised Mutual Learning of Discourse Parsing and Topic Segmentation in Dialogue </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jiahui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+F">Feng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=D'Haro,+L+F">Luis Fernando D'Haro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In dialogue systems, discourse plays a crucial role in managing conversational focus and coordinating interactions. It consists of two key structures: rhetorical structure and topic structure. The former captures the logical flow of conversations, while the latter detects transitions between topics. Together, they improve the ability of a dialogue system to track conversation dynamics and generate contextually relevant high-quality responses. These structures are typically identified through discourse parsing and topic segmentation, respectively. 
However, existing supervised methods rely on costly manual annotations, while unsupervised methods often focus on a single task, overlooking the deep linguistic interplay between rhetorical and topic structures. To address these issues, we first introduce a unified representation that integrates rhetorical and topic structures, ensuring semantic consistency between them. Under the unified representation, we further propose two linguistically grounded hypotheses based on discourse theories: (1) Local Discourse Coupling, where rhetorical cues dynamically enhance topic-aware information flow, and (2) Global Topology Constraint, where topic structure patterns probabilistically constrain rhetorical relation distributions. Building on the unified representation and two hypotheses, we propose an unsupervised mutual learning framework (UMLF) that jointly models rhetorical and topic structures, allowing them to mutually reinforce each other without requiring additional annotations. We evaluate our approach on two rhetorical datasets and three topic segmentation datasets. Experimental results demonstrate that our method surpasses all strong baselines built on pre-trained language models. Furthermore, when applied to LLMs, our framework achieves notable improvements, demonstrating its effectiveness in improving discourse structure modeling. 
</p> </div> </dd> <dt> <a name='item299'>[299]</a> <a href ="/abs/2405.20582" title="Abstract" id="2405.20582"> arXiv:2405.20582 </a> (replaced) [<a href="/pdf/2405.20582" title="Download PDF" id="pdf-2405.20582" aria-labelledby="pdf-2405.20582">pdf</a>, <a href="/format/2405.20582" title="Other formats" id="oth-2405.20582" aria-labelledby="oth-2405.20582">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Point of View of a Sentiment: Towards Clinician Bias Detection in Psychiatric Notes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Valentine,+A+A">Alissa A. Valentine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lepow,+L+A">Lauren A. Lepow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+L">Lili Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Charney,+A+W">Alexander W. Charney</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Landi,+I">Isotta Landi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Oral presentation at NAACL 2024 Queer in AI Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Negative patient descriptions and stigmatizing language can contribute to generating healthcare disparities in two ways: (1) read by patients, they can harm their trust and engagement with the medical center; (2) read by physicians, they may negatively influence their perspective of a future patient. In psychiatry, the patient-clinician therapeutic alliance is a major determinant of clinical outcomes. Therefore, language usage in psychiatric clinical notes may not only create healthcare disparities, but also perpetuate them. 
Recent advances in NLP systems have facilitated the efforts to detect discriminatory language in healthcare. However, such attempts have only focused on the perspectives of the medical center and its physicians. Considering both physicians and non-physicians' point of view is a more translatable approach to identifying potentially harmful language in clinical notes. By leveraging pre-trained and large language models (PLMs and LLMs), this work aims to characterize potentially harmful language usage in psychiatric notes by identifying the sentiment expressed in sentences describing patients based on the reader's point of view. Extracting 39 sentences from the Mount Sinai Health System containing psychiatric lexicon, we fine-tuned three PLMs (RoBERTa, GatorTron, and GatorTron + Task Adaptation) and implemented zero-shot and few-shot ICL approaches for three LLMs (GPT-3.5, Llama-3.1, and Mistral) to classify the sentiment of the sentences according to the physician or non-physician point of view. Results showed that GPT-3.5 aligned best to physician point of view and Mistral aligned best to non-physician point of view. These results underline the importance of recognizing the reader's point of view, not only for improving the note writing process, but also for the quantification, identification, and reduction of bias in computational systems for downstream analyses. 
</p> </div> </dd> <dt> <a name='item300'>[300]</a> <a href ="/abs/2406.06326" title="Abstract" id="2406.06326"> arXiv:2406.06326 </a> (replaced) [<a href="/pdf/2406.06326" title="Download PDF" id="pdf-2406.06326" aria-labelledby="pdf-2406.06326">pdf</a>, <a href="https://arxiv.org/html/2406.06326v4" title="View HTML" id="html-2406.06326" aria-labelledby="html-2406.06326" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.06326" title="Other formats" id="oth-2406.06326" aria-labelledby="oth-2406.06326">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Tuning: Instructing LLMs to Effectively Acquire New Knowledge through Self-Teaching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaoying Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+B">Baolin Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jingyan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yipeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Haitao Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+H">Helen Meng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) often struggle to provide up-to-date information due to their one-time training and the constantly evolving nature of the world. To keep LLMs current, existing approaches typically involve continued pre-training on new documents. However, they frequently face difficulties in extracting stored knowledge. 
Motivated by the remarkable success of the Feynman Technique in efficient human learning, we introduce Self-Tuning, a learning framework aimed at improving an LLM's ability to effectively acquire new knowledge from unseen raw documents through self-teaching. Specifically, we develop a Self-Teaching strategy that augments the documents with a set of knowledge-intensive tasks created in a self-supervised manner, focusing on three crucial aspects: memorization, comprehension, and self-reflection. Additionally, we introduce three Wiki-Newpages-2023-QA datasets to facilitate an in-depth analysis of an LLM's knowledge acquisition ability concerning memorization, extraction, and reasoning. Extensive experimental results on various models, e.g., Llama2-7B reveal that Self-Tuning consistently exhibits superior performance across all knowledge acquisition tasks and excels in preserving previous knowledge. </p> </div> </dd> <dt> <a name='item301'>[301]</a> <a href ="/abs/2406.09325" title="Abstract" id="2406.09325"> arXiv:2406.09325 </a> (replaced) [<a href="/pdf/2406.09325" title="Download PDF" id="pdf-2406.09325" aria-labelledby="pdf-2406.09325">pdf</a>, <a href="https://arxiv.org/html/2406.09325v3" title="View HTML" id="html-2406.09325" aria-labelledby="html-2406.09325" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.09325" title="Other formats" id="oth-2406.09325" aria-labelledby="oth-2406.09325">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> REVS: Unlearning Sensitive Information in Language Models via Rank Editing in the Vocabulary Space </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ashuach,+T">Tomer Ashuach</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tutek,+M">Martin Tutek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 18 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Language models (LMs) risk inadvertently memorizing and divulging sensitive or personally identifiable information (PII) seen in training data, causing privacy concerns. Current approaches to address this issue involve costly dataset scrubbing, or model filtering through unlearning and model editing, which can be bypassed through extraction attacks. We propose REVS, a novel non-gradient-based method for unlearning sensitive information from LMs. REVS identifies and modifies a small subset of neurons relevant for constituent tokens that form sensitive information. To adequately evaluate our method on truly sensitive information, we curate three datasets: email and URL datasets naturally memorized by the models, and a synthetic social security number dataset that we tune the models to memorize. Compared to other methods, REVS demonstrates superior performance in unlearning sensitive information and robustness to extraction attacks, while retaining underlying model integrity. 
</p> </div> </dd> <dt> <a name='item302'>[302]</a> <a href ="/abs/2406.10400" title="Abstract" id="2406.10400"> arXiv:2406.10400 </a> (replaced) [<a href="/pdf/2406.10400" title="Download PDF" id="pdf-2406.10400" aria-labelledby="pdf-2406.10400">pdf</a>, <a href="https://arxiv.org/html/2406.10400v2" title="View HTML" id="html-2406.10400" aria-labelledby="html-2406.10400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.10400" title="Other formats" id="oth-2406.10400" aria-labelledby="oth-2406.10400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Reflection Makes Large Language Models Safer, Less Biased, and Ideologically Neutral </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fengyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=AlDahoul,+N">Nouar AlDahoul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Eady,+G">Gregory Eady</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zaki,+Y">Yasir Zaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rahwan,+T">Talal Rahwan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Previous studies proposed that the reasoning capabilities of large language models (LLMs) can be improved through self-reflection, i.e., letting LLMs reflect on their own output to identify and correct mistakes in the initial responses. However, earlier experiments offer mixed results when it comes to the benefits of self-reflection. Furthermore, prior studies on self-reflection are predominantly concerned with the reasoning capabilities of models, ignoring the potential for self-reflection in safety, bias, and ideological leaning. 
Here, by conducting a series of experiments testing LLM's self-reflection capability in various tasks using a variety of prompts and different LLMs, we make several contributions to the literature. First, we reconcile conflicting findings regarding the benefit of self-reflection, by demonstrating that the outcome of self-reflection is sensitive to prompt wording -- both the original prompt that is used to elicit an initial answer and the subsequent prompt used to self-reflect. Specifically, although self-reflection may improve the reasoning capability of LLMs when the initial response is simple, the technique cannot improve upon the state-of-the-art chain-of-thought (CoT) prompting. Second, we show that self-reflection can lead to safer (75.8\% reduction in toxic responses while preserving 97.8\% non-toxic ones), less biased (77\% reduction in gender biased responses, while preserving 94.3\% unbiased ones), and more ideologically neutral responses (100\% reduction in partisan leaning response, while preserving 87.7\% non-partisan ones). The paper concludes by discussing the implications of our findings on the deployment of large language models. We release our experiments at <a href="https://github.com/Michael98Liu/self-reflection" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item303'>[303]</a> <a href ="/abs/2406.11139" title="Abstract" id="2406.11139"> arXiv:2406.11139 </a> (replaced) [<a href="/pdf/2406.11139" title="Download PDF" id="pdf-2406.11139" aria-labelledby="pdf-2406.11139">pdf</a>, <a href="https://arxiv.org/html/2406.11139v3" title="View HTML" id="html-2406.11139" aria-labelledby="html-2406.11139" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11139" title="Other formats" id="oth-2406.11139" aria-labelledby="oth-2406.11139">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Breaking Boundaries: Investigating the Effects of Model Editing on Cross-linguistic Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Halder,+A">Avik Halder</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+R">Rajarshi Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soboroff,+I">Ian Soboroff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NAACL 2025 (Industry track) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The integration of pretrained language models (PLMs) like BERT and GPT has revolutionized NLP, particularly for English, but it has also created linguistic imbalances. 
This paper strategically identifies the need for linguistic equity by examining several knowledge editing techniques in multilingual contexts. We evaluate the performance of models such as Mistral, TowerInstruct, OpenHathi, Tamil-Llama, and Kan-Llama across languages including English, German, French, Italian, Spanish, Hindi, Tamil, and Kannada. Our research identifies significant discrepancies in normal and merged models concerning cross-lingual consistency. We employ strategies like 'each language for itself' (ELFI) and 'each language for others' (ELFO) to stress-test these models. Our findings demonstrate the potential for LLMs to overcome linguistic barriers, laying the groundwork for future research in achieving linguistic inclusivity in AI technologies. </p> </div> </dd> <dt> <a name='item304'>[304]</a> <a href ="/abs/2406.11288" title="Abstract" id="2406.11288"> arXiv:2406.11288 </a> (replaced) [<a href="/pdf/2406.11288" title="Download PDF" id="pdf-2406.11288" aria-labelledby="pdf-2406.11288">pdf</a>, <a href="https://arxiv.org/html/2406.11288v3" title="View HTML" id="html-2406.11288" aria-labelledby="html-2406.11288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11288" title="Other formats" id="oth-2406.11288" aria-labelledby="oth-2406.11288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MFC-Bench: Benchmarking Multimodal Fact-Checking with Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shengkang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongzhan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Z">Ziyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Z">Zhen Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guang Chen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large vision-language models (LVLMs) have significantly improved multimodal reasoning tasks, such as visual question answering and image captioning. These models embed multimodal facts within their parameters, rather than relying on external knowledge bases to store factual information explicitly. However, the content discerned by LVLMs may deviate from factuality due to inherent bias or incorrect inference. To address this issue, we introduce MFC-Bench, a rigorous and comprehensive benchmark designed to evaluate the factual accuracy of LVLMs across three stages of verdict prediction for MFC: Manipulation, Out-of-Context, and Veracity Classification. Through our evaluation on MFC-Bench, we benchmarked a dozen diverse and representative LVLMs, uncovering that current models still fall short in multimodal fact-checking and demonstrate insensitivity to various forms of manipulated content. We hope that MFC-Bench could raise attention to the trustworthy AI potentially assisted by LVLMs in the future. The MFC-Bench and accompanying resources are publicly accessible at <a href="https://github.com/wskbest/MFC-Bench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>, contributing to ongoing research in the multimodal fact-checking field. 
</p> </div> </dd> <dt> <a name='item305'>[305]</a> <a href ="/abs/2406.11632" title="Abstract" id="2406.11632"> arXiv:2406.11632 </a> (replaced) [<a href="/pdf/2406.11632" title="Download PDF" id="pdf-2406.11632" aria-labelledby="pdf-2406.11632">pdf</a>, <a href="https://arxiv.org/html/2406.11632v4" title="View HTML" id="html-2406.11632" aria-labelledby="html-2406.11632" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11632" title="Other formats" id="oth-2406.11632" aria-labelledby="oth-2406.11632">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling the Power of Source: Source-based Minimum Bayes Risk Decoding for Neural Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+B">Boxuan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamigaito,+H">Hidetaka Kamigaito</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Funakoshi,+K">Kotaro Funakoshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Okumura,+M">Manabu Okumura</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Maximum a posteriori decoding, a commonly used method for neural machine translation (NMT), aims to maximize the estimated posterior probability. However, high estimated probability does not always lead to high translation quality. Minimum Bayes Risk (MBR) decoding (\citealp{kumar2004minimum}) offers an alternative by seeking hypotheses with the highest expected utility. 
<br>Inspired by Quality Estimation (QE) reranking which uses the QE model as a ranker (\citealp{fernandes-etal-2022-quality}), we propose source-based MBR (sMBR) decoding, a novel approach that utilizes quasi-sources (generated via paraphrasing or back-translation) as ``support hypotheses'' and a reference-free quality estimation metric as the utility function, marking the first work to solely use sources in MBR decoding. Experiments show that sMBR outperforms QE reranking and the standard MBR decoding. Our findings suggest that sMBR is a promising approach for NMT decoding. </p> </div> </dd> <dt> <a name='item306'>[306]</a> <a href ="/abs/2406.11785" title="Abstract" id="2406.11785"> arXiv:2406.11785 </a> (replaced) [<a href="/pdf/2406.11785" title="Download PDF" id="pdf-2406.11785" aria-labelledby="pdf-2406.11785">pdf</a>, <a href="https://arxiv.org/html/2406.11785v3" title="View HTML" id="html-2406.11785" aria-labelledby="html-2406.11785" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11785" title="Other formats" id="oth-2406.11785" aria-labelledby="oth-2406.11785">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CELL your Model: Contrastive Explanations for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luss,+R">Ronny Luss</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miehling,+E">Erik Miehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhurandhar,+A">Amit Dhurandhar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The advent of black-box deep neural network classification models has sparked the need to explain their decisions. 
However, in the case of generative AI, such as large language models (LLMs), there is no class prediction to explain. Rather, one can ask why an LLM output a particular response to a given prompt. In this paper, we answer this question by proposing a contrastive explanation method requiring simply black-box/query access. Our explanations suggest that an LLM outputs a reply to a given prompt because if the prompt was slightly modified, the LLM would have given a different response that is either less preferable or contradicts the original response. The key insight is that contrastive explanations simply require a scoring function that has meaning to the user and not necessarily a specific real valued quantity (viz. class label). To this end, we offer a novel budgeted algorithm, our main algorithmic contribution, which intelligently creates contrasts based on such a scoring function while adhering to a query budget, necessary for longer contexts. We show the efficacy of our method on important natural language tasks such as open-text generation and chatbot conversations. 
</p> </div> </dd> <dt> <a name='item307'>[307]</a> <a href ="/abs/2406.12221" title="Abstract" id="2406.12221"> arXiv:2406.12221 </a> (replaced) [<a href="/pdf/2406.12221" title="Download PDF" id="pdf-2406.12221" aria-labelledby="pdf-2406.12221">pdf</a>, <a href="https://arxiv.org/html/2406.12221v2" title="View HTML" id="html-2406.12221" aria-labelledby="html-2406.12221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.12221" title="Other formats" id="oth-2406.12221" aria-labelledby="oth-2406.12221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On-Policy Self-Alignment with Fine-grained Knowledge Feedback for Hallucination Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xueru Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xinyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xinyan Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Ben He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xianpei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hallucination occurs when large language models exhibit behavior that deviates from the boundaries of their knowledge during response generation. To address this critical issue, previous learning-based methods attempt to finetune models but are limited by off-policy sampling and coarse-grained feedback. 
In this paper, we present \textit{\b{R}einforcement \b{L}earning \b{f}or \b{H}allucination} (RLFH), an on-policy self-alignment approach that enables LLMs to actively explore their knowledge boundaries and self-correct generation behavior through fine-grained feedback signals. RLFH introduces a self-assessment framework where the policy serves as its own judge. Through this framework, responses are automatically decomposed into atomic facts and their truthfulness and informativeness are assessed against external knowledge sources. The resulting fine-grained feedback at the statement level is then converted into token-level dense reward signals. This enables online reinforcement learning to achieve precise and timely optimization without human intervention. Comprehensive evaluations on HotpotQA, SQuADv2, and Biography benchmarks validate RLFH's effectiveness in hallucination mitigation. </p> </div> </dd> <dt> <a name='item308'>[308]</a> <a href ="/abs/2406.13144" title="Abstract" id="2406.13144"> arXiv:2406.13144 </a> (replaced) [<a href="/pdf/2406.13144" title="Download PDF" id="pdf-2406.13144" aria-labelledby="pdf-2406.13144">pdf</a>, <a href="https://arxiv.org/html/2406.13144v5" title="View HTML" id="html-2406.13144" aria-labelledby="html-2406.13144" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.13144" title="Other formats" id="oth-2406.13144" aria-labelledby="oth-2406.13144">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DialSim: A Real-Time Simulator for Evaluating Long-Term Multi-Party Dialogue Understanding of Conversation Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jiho Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chay,+W">Woosog Chay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+H">Hyeonji Hwang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kyung,+D">Daeun Kyung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chung,+H">Hyunseung Chung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+E">Eunbyeol Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jo,+Y">Yohan Jo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+E">Edward Choi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have significantly enhanced the capabilities of conversation systems, making them applicable to various fields (e.g., education). Despite their progress, the evaluation of the systems often overlooks the complexities of real-world conversations, such as real-time interactions, multi-party dialogues, and extended contextual dependencies. To bridge this gap, we introduce DialSim, a real-time dialogue simulator. In this simulator, a conversation system is assigned the role of a character from popular TV shows, requiring it to respond to spontaneous questions using past dialogue information and to distinguish between known and unknown information. Key features of DialSim include assessing the system's ability to respond within a reasonable time limit, handling long-term multi-party dialogues, and evaluating performance under randomized questioning with LongDialQA, a novel, high-quality question-answering dataset. Our experiments using DialSim reveal the strengths and weaknesses of the latest conversation systems, offering valuable insights for future advancements in conversational AI. DialSim is available at <a href="https://dialsim.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item309'>[309]</a> <a href ="/abs/2406.14326" title="Abstract" id="2406.14326"> arXiv:2406.14326 </a> (replaced) [<a href="/pdf/2406.14326" title="Download PDF" id="pdf-2406.14326" aria-labelledby="pdf-2406.14326">pdf</a>, <a href="/format/2406.14326" title="Other formats" id="oth-2406.14326" aria-labelledby="oth-2406.14326">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> medIKAL: Integrating Knowledge Graphs as Assistants of LLMs for Enhanced Clinical Diagnosis on EMRs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+M">Mingyi Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+J">Junwen Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jianxin Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Electronic Medical Records (EMRs), while integral to modern healthcare, present challenges for clinical reasoning and diagnosis due to their complexity and information redundancy. To address this, we proposed medIKAL (Integrating Knowledge Graphs as Assistants of LLMs), a framework that combines Large Language Models (LLMs) with knowledge graphs (KGs) to enhance diagnostic capabilities. medIKAL assigns weighted importance to entities in medical records based on their type, enabling precise localization of candidate diseases within KGs. It innovatively employs a residual network-like approach, allowing initial diagnosis by the LLM to be merged into KG search results. Through a path-based reranking algorithm and a fill-in-the-blank style prompt template, it further refined the diagnostic process. 
We validated medIKAL's effectiveness through extensive experiments on a newly introduced open-sourced Chinese EMR dataset, demonstrating its potential to improve clinical diagnosis in real-world settings. </p> </div> </dd> <dt> <a name='item310'>[310]</a> <a href ="/abs/2406.15490" title="Abstract" id="2406.15490"> arXiv:2406.15490 </a> (replaced) [<a href="/pdf/2406.15490" title="Download PDF" id="pdf-2406.15490" aria-labelledby="pdf-2406.15490">pdf</a>, <a href="https://arxiv.org/html/2406.15490v2" title="View HTML" id="html-2406.15490" aria-labelledby="html-2406.15490" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15490" title="Other formats" id="oth-2406.15490" aria-labelledby="oth-2406.15490">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Causal Discovery Inspired Unsupervised Domain Adaptation for Emotion-Cause Pair Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yujin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shuo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+T">Tao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bain,+C">Chris Bain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bassed,+R">Richard Bassed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 6 figures, 5 tables. 
The paper has been published in the Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> This paper tackles the task of emotion-cause pair extraction in the unsupervised domain adaptation setting. The problem is challenging as the distributions of the events causing emotions in target domains are dramatically different than those in source domains, although the distributions of emotional expressions between domains are overlapped. Inspired by causal discovery, we propose a novel deep latent model in the variational autoencoder (VAE) framework, which not only captures the underlying latent structures of data but also utilizes the easily transferable knowledge of emotions as the bridge to link the distributions of events in different domains. To facilitate knowledge transfer across domains, we also propose a novel variational posterior regularization technique to disentangle the latent representations of emotions from those of events in order to mitigate the damage caused by the spurious correlations related to the events in source domains. Through extensive experiments, we demonstrate that our model outperforms the strongest baseline by approximately 11.05\% on a Chinese benchmark and 2.45\% on an English benchmark in terms of weighted-average F1 score. We have released our source code and the generated dataset publicly at: <a href="https://github.com/tk1363704/CAREL-VAE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item311'>[311]</a> <a href ="/abs/2406.15938" title="Abstract" id="2406.15938"> arXiv:2406.15938 </a> (replaced) [<a href="/pdf/2406.15938" title="Download PDF" id="pdf-2406.15938" aria-labelledby="pdf-2406.15938">pdf</a>, <a href="https://arxiv.org/html/2406.15938v4" title="View HTML" id="html-2406.15938" aria-labelledby="html-2406.15938" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15938" title="Other formats" id="oth-2406.15938" aria-labelledby="oth-2406.15938">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RuleR: Improving LLM Controllability by Rule-based Data Recycling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Han Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chenguang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+D">Dang Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dianqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+T">Tianyi Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025 main, Camera-ready </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) still lack delicate controllability over their responses, which is critical to enhancing their performance and the user experience. However, curating supervised fine-tuning (SFT) datasets to improve LLM controllability usually relies on human experts or proprietary LLMs, which requires additional costs. 
To bridge this gap, we propose Rule-based Data Recycling (RuleR), a data augmentation method incorporating multiple constraints into the original data samples according to predefined rules, which creates new training tasks to consolidate the controllability of LLMs. Instead of creating new data from scratch, RuleR "recycles" existing data by simply applying rule-based edits to their responses and appending the rule-instructions in their original instructions. Experimental results demonstrate RuleR's effectiveness in improving LLM controllability while maintaining general instruction-following capabilities. </p> </div> </dd> <dt> <a name='item312'>[312]</a> <a href ="/abs/2406.15948" title="Abstract" id="2406.15948"> arXiv:2406.15948 </a> (replaced) [<a href="/pdf/2406.15948" title="Download PDF" id="pdf-2406.15948" aria-labelledby="pdf-2406.15948">pdf</a>, <a href="https://arxiv.org/html/2406.15948v3" title="View HTML" id="html-2406.15948" aria-labelledby="html-2406.15948" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15948" title="Other formats" id="oth-2406.15948" aria-labelledby="oth-2406.15948">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Teaching LLMs to Abstain across Languages via Multilingual Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shangbin Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+W">Weijia Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yike Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+W">Wenxuan Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahia,+O">Orevaoghene Ahia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S+S">Shuyue Stella Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Balachandran,+V">Vidhisha Balachandran</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Sitaram,+S">Sunayana Sitaram</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsvetkov,+Y">Yulia Tsvetkov</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual LLMs often have knowledge disparities across languages, with larger gaps in under-resourced languages. Teaching LLMs to abstain in the face of knowledge gaps is thus a promising strategy to mitigate hallucinations in multilingual settings. However, previous studies on LLM abstention primarily focus on English; we find that directly applying existing solutions beyond English results in up to 20.5% performance gaps between high and low-resource languages, potentially due to LLMs' drop in calibration and reasoning beyond a few resource-rich languages. To this end, we propose strategies to enhance LLM abstention by learning from multilingual feedback, where LLMs self-reflect on proposed answers in one language by generating multiple feedback items in related languages: we show that this helps identify the knowledge gaps across diverse languages, cultures, and communities. Extensive experiments demonstrate that our multilingual feedback approach outperforms various strong baselines, achieving up to 9.2% improvement for low-resource languages across three black-box and open models on three datasets, featuring open-book, closed-book, and commonsense QA. Further analysis reveals that multilingual feedback is both an effective and a more equitable abstain strategy to serve diverse language speakers, and cultural factors have great impact on language selection and LLM abstention behavior, highlighting future directions for multilingual and multi-cultural reliable language modeling. 
</p> </div> </dd> <dt> <a name='item313'>[313]</a> <a href ="/abs/2406.16288" title="Abstract" id="2406.16288"> arXiv:2406.16288 </a> (replaced) [<a href="/pdf/2406.16288" title="Download PDF" id="pdf-2406.16288" aria-labelledby="pdf-2406.16288">pdf</a>, <a href="https://arxiv.org/html/2406.16288v2" title="View HTML" id="html-2406.16288" aria-labelledby="html-2406.16288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16288" title="Other formats" id="oth-2406.16288" aria-labelledby="oth-2406.16288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PlagBench: Exploring the Duality of Large Language Models in Plagiarism Generation and Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jooyoung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agrawal,+T">Toshini Agrawal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Uchendu,+A">Adaku Uchendu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Le,+T">Thai Le</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jinghui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongwon Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is a camera-ready version of NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent studies have raised concerns about the potential threats large language models (LLMs) pose to academic integrity and copyright protection. Yet, their investigation is predominantly focused on literal copies of original texts. Also, how LLMs can facilitate the detection of LLM-generated plagiarism remains largely unexplored. 
To address these gaps, we introduce \textbf{\sf PlagBench}, a dataset of 46.5K synthetic text pairs that represent three major types of plagiarism: verbatim copying, paraphrasing, and summarization. These samples are generated by three advanced LLMs. We rigorously validate the quality of PlagBench through a combination of fine-grained automatic evaluation and human annotation. We then utilize this dataset for two purposes: (1) to examine LLMs' ability to transform original content into accurate paraphrases and summaries, and (2) to evaluate the plagiarism detection performance of five modern LLMs alongside three specialized plagiarism checkers. Our results show that GPT-3.5 Turbo can produce high-quality paraphrases and summaries without significantly increasing text complexity compared to GPT-4 Turbo. However, in terms of detection, GPT-4 outperforms other LLMs and commercial detection tools by 20%, highlighting the evolving capabilities of LLMs not only in content generation but also in plagiarism detection. Data and source code are available at <a href="https://github.com/Brit7777/plagbench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item314'>[314]</a> <a href ="/abs/2406.17519" title="Abstract" id="2406.17519"> arXiv:2406.17519 </a> (replaced) [<a href="/pdf/2406.17519" title="Download PDF" id="pdf-2406.17519" aria-labelledby="pdf-2406.17519">pdf</a>, <a href="https://arxiv.org/html/2406.17519v2" title="View HTML" id="html-2406.17519" aria-labelledby="html-2406.17519" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.17519" title="Other formats" id="oth-2406.17519" aria-labelledby="oth-2406.17519">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Entropy-Based Decoding for Retrieval-Augmented Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Z">Zexuan Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+Z">Zijing Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Bin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingjing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Augmenting Large Language Models (LLMs) with retrieved external knowledge has proven effective for improving the factual accuracy of generated responses. Despite their success, retrieval-augmented LLMs still face the distractibility issue, where the generated responses are negatively influenced by noise from both external and internal knowledge sources. 
In this paper, we introduce a novel, training-free decoding method guided by entropy considerations to mitigate this issue. Our approach utilizes entropy-based document-parallel ensemble decoding to prioritize low-entropy distributions from retrieved documents, thereby enhancing the extraction of relevant information of context. Additionally, it incorporates a contrastive decoding mechanism that contrasts the obtained low-entropy ensemble distribution with the high-entropy distribution derived from the model's internal knowledge across layers, which ensures a greater emphasis on reliable external information. Extensive experiments on open-domain question answering datasets demonstrate the superiority of our method. </p> </div> </dd> <dt> <a name='item315'>[315]</a> <a href ="/abs/2407.00924" title="Abstract" id="2407.00924"> arXiv:2407.00924 </a> (replaced) [<a href="/pdf/2407.00924" title="Download PDF" id="pdf-2407.00924" aria-labelledby="pdf-2407.00924">pdf</a>, <a href="https://arxiv.org/html/2407.00924v2" title="View HTML" id="html-2407.00924" aria-labelledby="html-2407.00924" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.00924" title="Other formats" id="oth-2407.00924" aria-labelledby="oth-2407.00924">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EXCGEC: A Benchmark for Edit-Wise Explainable Chinese Grammatical Error Correction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+S">Shang Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xuxin Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+L">Libo Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Ying Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+P">Peng Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+G">Guo Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+W">Wenhao Jiang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to AAAI 2025. 19 pages with an appendix, 10 tables, and 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Existing studies explore the explainability of Grammatical Error Correction (GEC) in a limited scenario, where they ignore the interaction between corrections and explanations and have not established a corresponding comprehensive benchmark. To bridge the gap, this paper first introduces the task of EXplainable GEC (EXGEC), which focuses on the integral role of correction and explanation tasks. To facilitate the task, we propose EXCGEC, a tailored benchmark for Chinese EXGEC consisting of 8,216 explanation-augmented samples featuring the design of hybrid edit-wise explanations. We then benchmark several series of LLMs in multi-task learning settings, including post-explaining and pre-explaining. To promote the development of the task, we also build a comprehensive evaluation suite by leveraging existing automatic metrics and conducting human evaluation experiments to demonstrate the human consistency of the automatic metrics for free-text explanations. 
Our experiments reveal the effectiveness of evaluating free-text explanations using traditional metrics like METEOR and ROUGE, and the inferior performance of multi-task models compared to the pipeline solution, indicating the challenge of establishing positive effects when learning both tasks. </p> </div> </dd> <dt> <a name='item316'>[316]</a> <a href ="/abs/2407.02039" title="Abstract" id="2407.02039"> arXiv:2407.02039 </a> (replaced) [<a href="/pdf/2407.02039" title="Download PDF" id="pdf-2407.02039" aria-labelledby="pdf-2407.02039">pdf</a>, <a href="https://arxiv.org/html/2407.02039v2" title="View HTML" id="html-2407.02039" aria-labelledby="html-2407.02039" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.02039" title="Other formats" id="oth-2407.02039" aria-labelledby="oth-2407.02039">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Prompt Stability Scoring for Text Annotation with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Barrie,+C">Christopher Barrie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Palaiologou,+E">Elli Palaiologou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=T%C3%B6rnberg,+P">Petter Törnberg</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 39 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Researchers are increasingly using language models (LMs) for text annotation. These approaches rely only on a prompt telling the model to return a given output according to a set of instructions. The reproducibility of LM outputs may nonetheless be vulnerable to small changes in the prompt design. This calls into question the replicability of classification routines. 
To tackle this problem, researchers have typically tested a variety of semantically similar prompts to determine what we call ``prompt stability." These approaches remain ad-hoc and task specific. In this article, we propose a general framework for diagnosing prompt stability by adapting traditional approaches to intra- and inter-coder reliability scoring. We call the resulting metric the Prompt Stability Score (PSS) and provide a Python package \texttt{promptstability} for its estimation. Using six different datasets and twelve outcomes, we classify $\sim$3.1m rows of data and $\sim$300m input tokens to: a) diagnose when prompt stability is low; and b) demonstrate the functionality of the package. We conclude by providing best practice recommendations for applied researchers. </p> </div> </dd> <dt> <a name='item317'>[317]</a> <a href ="/abs/2407.08952" title="Abstract" id="2407.08952"> arXiv:2407.08952 </a> (replaced) [<a href="/pdf/2407.08952" title="Download PDF" id="pdf-2407.08952" aria-labelledby="pdf-2407.08952">pdf</a>, <a href="https://arxiv.org/html/2407.08952v3" title="View HTML" id="html-2407.08952" aria-labelledby="html-2407.08952" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.08952" title="Other formats" id="oth-2407.08952" aria-labelledby="oth-2407.08952">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Detect, Investigate, Judge and Determine: A Knowledge-guided Framework for Few-shot Fake News Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Ye Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jiajun Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xukai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Haoyu Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yanghai Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaofang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Few-Shot Fake News Detection (FS-FND) aims to distinguish inaccurate news from real ones in extremely low-resource scenarios. This task has garnered increased attention due to the widespread dissemination and harmful impact of fake news on social media. Large Language Models (LLMs) have demonstrated competitive performance with the help of their rich prior knowledge and excellent in-context learning abilities. However, existing methods face significant limitations, such as the Understanding Ambiguity and Information Scarcity, which significantly undermine the potential of LLMs. To address these shortcomings, we propose a Dual-perspective Knowledge-guided Fake News Detection (DKFND) model, designed to enhance LLMs from both inside and outside perspectives. Specifically, DKFND first identifies the knowledge concepts of each news article through a Detection Module. Subsequently, DKFND creatively designs an Investigation Module to retrieve inside and outside valuable information concerning the current news, followed by another Judge Module to evaluate their relevance and confidence. Finally, a Determination Module further derives two respective predictions and obtains the final result. Extensive experiments on two public datasets show the efficacy of our proposed method, particularly in low-resource settings. 
</p> </div> </dd> <dt> <a name='item318'>[318]</a> <a href ="/abs/2407.10275" title="Abstract" id="2407.10275"> arXiv:2407.10275 </a> (replaced) [<a href="/pdf/2407.10275" title="Download PDF" id="pdf-2407.10275" aria-labelledby="pdf-2407.10275">pdf</a>, <a href="https://arxiv.org/html/2407.10275v2" title="View HTML" id="html-2407.10275" aria-labelledby="html-2407.10275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.10275" title="Other formats" id="oth-2407.10275" aria-labelledby="oth-2407.10275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cross-Lingual Multi-Hop Knowledge Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khandelwal,+A">Aditi Khandelwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+H">Harman Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+H">Hengrui Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianlong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+K">Kaixiong Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models are often expected to constantly adapt to new sources of knowledge and knowledge editing techniques aim to efficiently patch the outdated model knowledge, with minimal modification. Most prior works focus on monolingual knowledge editing in English, even though new information can emerge in any language from any part of the world. We propose the Cross-Lingual Multi-Hop Knowledge Editing paradigm, for measuring and analyzing the performance of various SoTA knowledge editing techniques in a cross-lingual setup. 
Specifically, we create a parallel cross-lingual benchmark, CROLIN-MQUAKE for measuring the knowledge editing capabilities. Our extensive analysis over various knowledge editing techniques uncover significant gaps in performance between the cross-lingual and English-centric setting. Following this, we propose a significantly improved system for cross-lingual multi-hop knowledge editing, CLEVER-CKE. CLEVER-CKE is based on a retrieve, verify and generate knowledge editing framework, where a retriever is formulated to recall edited facts and support an LLM to adhere to knowledge edits. We develop language-aware and hard-negative based contrastive objectives for improving the cross-lingual and fine-grained fact retrieval and verification process used in this framework. Extensive experiments on three LLMs, eight languages, and two datasets show CLEVER-CKE's significant gains of up to 30% over prior methods. </p> </div> </dd> <dt> <a name='item319'>[319]</a> <a href ="/abs/2407.11686" title="Abstract" id="2407.11686"> arXiv:2407.11686 </a> (replaced) [<a href="/pdf/2407.11686" title="Download PDF" id="pdf-2407.11686" aria-labelledby="pdf-2407.11686">pdf</a>, <a href="https://arxiv.org/html/2407.11686v4" title="View HTML" id="html-2407.11686" aria-labelledby="html-2407.11686" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.11686" title="Other formats" id="oth-2407.11686" aria-labelledby="oth-2407.11686">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CCoE: A Compact and Efficient LLM Framework with Multi-Expert Collaboration for Resource-Limited Settings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shaomang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+J">Jianfeng Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+M">Min Peng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hanzhong Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have achieved exceptional performance across diverse domains through training on massive datasets. However, scaling LLMs to support multiple downstream domain applications remains a significant challenge, especially under resource constraints. Existing approaches often struggle to balance performance across multiple domains with resource efficiency, limiting their broader applicability. To address this, we introduce the CCoE architecture, a modular framework that seamlessly integrates domain-specific experts into a unified LLM. By leveraging independently trained expert subnetworks on a shared backbone partition, CCoE achieves state-of-the-art performance while significantly reducing the resource requirements for multi-expert deployments. Furthermore, rule-based gating and expert planning in CCoE enable flexible task allocation, promoting expert collaboration to handle complex reasoning tasks. CCoE not only reduces inference costs but also provides a flexible and scalable solution for integrating domain expertise across diverse applications. Experiments on five domains demonstrate that CCoE achieves comparable performance to current domain-specific LLMs. Moreover, compared to existing multi-domain model ensemble methods, CCoE reduces memory usage by 61.3%, while improving inference efficiency by 0.76x over parameter-efficient multi-expert integration approaches. 
</p> </div> </dd> <dt> <a name='item320'>[320]</a> <a href ="/abs/2407.14482" title="Abstract" id="2407.14482"> arXiv:2407.14482 </a> (replaced) [<a href="/pdf/2407.14482" title="Download PDF" id="pdf-2407.14482" aria-labelledby="pdf-2407.14482">pdf</a>, <a href="https://arxiv.org/html/2407.14482v3" title="View HTML" id="html-2407.14482" aria-labelledby="html-2407.14482" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.14482" title="Other formats" id="oth-2407.14482" aria-labelledby="oth-2407.14482">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Peng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ping,+W">Wei Ping</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xianchao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chejian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zihan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shoeybi,+M">Mohammad Shoeybi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Catanzaro,+B">Bryan Catanzaro</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> In this work, we introduce ChatQA 2, an Llama 3.0-based model with a 128K context window, designed to bridge the gap between open-source LLMs and leading proprietary models (e.g., GPT-4-Turbo-2024-04-09) in long context understanding and retrieval-augmented 
generation (RAG) capabilities. These two capabilities are complementary to each other and essential for LLMs to process large volumes of information that cannot fit into a single prompt. We present a detailed continued training recipe to extend the context window of Llama3-70B-base from 8K to 128K tokens, along with a three-stage instruction tuning process to enhance the model's instruction-following, RAG performance, and long-context understanding capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model outperforms most existing state-of-the-art models, including GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only a 4K context window, showing the strong long context capability across varying sequence lengths. We further provide extensive comparisons between direct long-context and RAG solutions using the same state-of-the-art long-context LLMs. Interestingly, we find that the performance of strong long-context LLMs using RAG improves when retrieving a larger number of chunks. With a large set of top-k chunks, RAG consistently outperforms direct long-context solution using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B and Qwen2-72B-Instruct) on both 32K and 128K benchmarks. 
We open-source the model weights, training data, and the evaluation setup for the community: <a href="https://chatqa2-project.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item321'>[321]</a> <a href ="/abs/2407.16724" title="Abstract" id="2407.16724"> arXiv:2407.16724 </a> (replaced) [<a href="/pdf/2407.16724" title="Download PDF" id="pdf-2407.16724" aria-labelledby="pdf-2407.16724">pdf</a>, <a href="https://arxiv.org/html/2407.16724v3" title="View HTML" id="html-2407.16724" aria-labelledby="html-2407.16724" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.16724" title="Other formats" id="oth-2407.16724" aria-labelledby="oth-2407.16724">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Structure-aware Domain Knowledge Injection for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Ze Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Z">Zhihang Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+R">Rongxin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Fan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yaowu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yue Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jieping Ye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. 
Code is available at <a href="https://github.com/alibaba/struxgpt" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces a pioneering methodology, termed StructTuning, to efficiently transform foundation Large Language Models (LLMs) into domain specialists. It significantly reduces the training corpus needs to a mere 5% while achieving an impressive 100% of traditional knowledge injection performance. Motivated by structured human education, we propose a novel two-stage strategy for knowledge injection and alignment: Structure-aware Continual Pre-Training (SCPT) and Structure-aware Supervised Fine-Tuning (SSFT). In the SCPT phase, we automatically extract the domain knowledge taxonomy and reorganize the training corpora, enabling LLMs to effectively link textual segments to targeted knowledge points within the taxonomy. In the SSFT phase, we explicitly prompt models to elucidate the underlying knowledge structure in their outputs, leveraging the structured domain insight to address practical problems. Our ultimate method was extensively evaluated across model architectures and scales on LongBench and MMedBench datasets, demonstrating superior performance against other knowledge injection methods. We also explored our method's scalability across different training corpus sizes, laying the foundation to enhance domain-specific LLMs with better data utilization. 
</p> </div> </dd> <dt> <a name='item322'>[322]</a> <a href ="/abs/2408.03618" title="Abstract" id="2408.03618"> arXiv:2408.03618 </a> (replaced) [<a href="/pdf/2408.03618" title="Download PDF" id="pdf-2408.03618" aria-labelledby="pdf-2408.03618">pdf</a>, <a href="/format/2408.03618" title="Other formats" id="oth-2408.03618" aria-labelledby="oth-2408.03618">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Logical Fallacy-Informed Framework for Argument Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mouchel,+L">Luca Mouchel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Paul,+D">Debjit Paul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+S">Shaobo Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=West,+R">Robert West</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bosselut,+A">Antoine Bosselut</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Faltings,+B">Boi Faltings</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Despite the remarkable performance of Large Language Models (LLMs) in natural language processing tasks, they still struggle with generating logically sound arguments, resulting in potential risks such as spreading misinformation. To address this issue, we introduce FIPO, a fallacy-informed framework that leverages preference optimization methods to steer LLMs toward logically sound arguments. FIPO includes a classification loss, to capture the fine-grained information on fallacy types. Our results on argumentation datasets show that our method reduces the fallacy errors by up to 17.5%. 
Furthermore, our human evaluation results indicate that the quality of the generated arguments by our method significantly outperforms the fine-tuned baselines, as well as other preference optimization methods, such as DPO. These findings highlight the importance of ensuring models are aware of logical fallacies for effective argument generation. Our code is available at <a href="http://github.com/lucamouchel/Logical-Fallacies" rel="external noopener nofollow" class="link-external link-http">this http URL</a>. </p> </div> </dd> <dt> <a name='item323'>[323]</a> <a href ="/abs/2408.04237" title="Abstract" id="2408.04237"> arXiv:2408.04237 </a> (replaced) [<a href="/pdf/2408.04237" title="Download PDF" id="pdf-2408.04237" aria-labelledby="pdf-2408.04237">pdf</a>, <a href="https://arxiv.org/html/2408.04237v2" title="View HTML" id="html-2408.04237" aria-labelledby="html-2408.04237" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.04237" title="Other formats" id="oth-2408.04237" aria-labelledby="oth-2408.04237">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Rewrite: Generalized LLM-Generated Text Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ran Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+W">Wei Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W">Weiliang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Junfeng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+C">Chengzhi Mao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) present significant risks when used to generate non-factual content and spread disinformation at scale. 
Detecting such LLM-generated content is crucial, yet current detectors often struggle to generalize in open-world contexts. We introduce Learning2Rewrite, a novel framework for detecting AI-generated text with exceptional generalization to unseen domains. Our method leverages the insight that LLMs inherently modify AI-generated content less than human-written text when tasked with rewriting. By training LLMs to minimize alterations on AI-generated inputs, we amplify this disparity, yielding a more distinguishable and generalizable edit distance across diverse text distributions. Extensive experiments on data from 21 independent domains and four major LLMs (GPT-3.5, GPT-4, Gemini, and Llama-3) demonstrate that our detector outperforms state-of-the-art detection methods by up to 23.04% in AUROC for in-distribution tests, 37.26% for out-of-distribution tests, and 48.66% under adversarial attacks. Our unique training objective ensures better generalizability compared to directly training for classification, when leveraging the same amount of parameters. Our findings suggest that reinforcing LLMs' inherent rewriting tendencies offers a robust and scalable solution for detecting AI-generated text. 
</p> </div> </dd> <dt> <a name='item324'>[324]</a> <a href ="/abs/2408.08590" title="Abstract" id="2408.08590"> arXiv:2408.08590 </a> (replaced) [<a href="/pdf/2408.08590" title="Download PDF" id="pdf-2408.08590" aria-labelledby="pdf-2408.08590">pdf</a>, <a href="https://arxiv.org/html/2408.08590v2" title="View HTML" id="html-2408.08590" aria-labelledby="html-2408.08590" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.08590" title="Other formats" id="oth-2408.08590" aria-labelledby="oth-2408.08590">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+G">Geonhee Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">André Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent studies on logical reasoning in Language Models (LMs) have sparked a debate on whether they can learn systematic reasoning principles during pre-training or merely exploit superficial patterns in the training data. This paper presents a mechanistic interpretation of syllogistic reasoning in LMs to advance the understanding of internal dynamics. Specifically, we present a methodology for circuit discovery aimed at interpreting content-independent reasoning mechanisms. Through two distinct intervention methods, we uncover a sufficient and necessary circuit involving middle-term suppression that elucidates how LMs transfer information to derive valid conclusions from premises. 
Furthermore, we investigate how belief biases manifest in syllogistic reasoning, finding evidence of partial contamination from additional attention heads responsible for encoding commonsense and contextualized knowledge. Finally, we explore the generalization of the discovered mechanisms across various syllogistic schemes, model sizes and architectures, finding that the identified circuit is sufficient and necessary for the schemes on which the models achieve high downstream accuracy (> 60%), and that the activation patterns apply to models of different families. Overall, our findings suggest that LMs indeed learn transferable content-independent reasoning mechanisms, but that, at the same time, such mechanisms do not involve generalizable and abstract logical primitives, being susceptible to contamination by the same world knowledge acquired during pre-training. </p> </div> </dd> <dt> <a name='item325'>[325]</a> <a href ="/abs/2408.11850" title="Abstract" id="2408.11850"> arXiv:2408.11850 </a> (replaced) [<a href="/pdf/2408.11850" title="Download PDF" id="pdf-2408.11850" aria-labelledby="pdf-2408.11850">pdf</a>, <a href="https://arxiv.org/html/2408.11850v3" title="View HTML" id="html-2408.11850" aria-labelledby="html-2408.11850" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.11850" title="Other formats" id="oth-2408.11850" aria-labelledby="oth-2408.11850">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PEARL: Parallel Speculative Decoding with Adaptive Draft Length </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+Q">Qitan Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kai Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jianchen Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Winston Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+X">Xiao Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Speculative decoding (SD), where an extra draft model is employed to provide multiple draft tokens first, and then the original target model verifies these tokens in parallel, has shown great power for LLM inference acceleration. However, existing SD methods suffer from the mutual waiting problem, i.e., the target model gets stuck when the draft model is guessing tokens, and vice versa. This problem is directly incurred by the asynchronous execution of the draft model and the target model and is exacerbated due to the fixed draft length in speculative decoding. To address these challenges, we propose a conceptually simple, flexible, and general framework to boost speculative decoding, namely Parallel spEculative decoding with Adaptive dRaft Length (PEARL). Specifically, PEARL proposes pre-verify to verify the first draft token in advance during the drafting phase, and post-verify to generate more draft tokens during the verification phase. PEARL parallels the drafting phase and the verification phase via applying the two strategies, and achieves adaptive draft length for different scenarios, which effectively alleviates the mutual waiting problem. Experiments on various text generation benchmarks demonstrate the effectiveness of our PEARL, leading to a superior speed up performance up to 4.43$\times$ and 1.50$\times$, compared to auto-regressive decoding and vanilla speculative decoding, respectively. 
Our code is available at <a href="https://github.com/smart-lty/ParallelSpeculativeDecoding" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item326'>[326]</a> <a href ="/abs/2408.14853" title="Abstract" id="2408.14853"> arXiv:2408.14853 </a> (replaced) [<a href="/pdf/2408.14853" title="Download PDF" id="pdf-2408.14853" aria-labelledby="pdf-2408.14853">pdf</a>, <a href="https://arxiv.org/html/2408.14853v2" title="View HTML" id="html-2408.14853" aria-labelledby="html-2408.14853" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.14853" title="Other formats" id="oth-2408.14853" aria-labelledby="oth-2408.14853">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Atoxia: Red-teaming Large Language Models with Target Toxic Answers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yuhao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+P">Pengyu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiang Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to Findings of NAACL-2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> Despite the substantial advancements in artificial intelligence, large language models (LLMs) remain challenged by generation safety. With adversarial jailbreaking prompts, one can effortlessly induce LLMs to output harmful content, causing unexpected negative social impacts. 
This vulnerability highlights the necessity for robust LLM red-teaming strategies to identify and mitigate such risks before large-scale application. To detect specific types of risks, we propose a novel red-teaming method that $\textbf{A}$ttacks LLMs with $\textbf{T}$arget $\textbf{Toxi}$c $\textbf{A}$nswers ($\textbf{Atoxia}$). Given a particular harmful answer, Atoxia generates a corresponding user query and a misleading answer opening to examine the internal defects of a given LLM. The proposed attacker is trained within a reinforcement learning scheme with the LLM outputting probability of the target answer as the reward. We verify the effectiveness of our method on various red-teaming benchmarks, such as AdvBench and HH-Harmless. The empirical results demonstrate that Atoxia can successfully detect safety risks in not only open-source models but also state-of-the-art black-box models such as GPT-4o. </p> </div> </dd> <dt> <a name='item327'>[327]</a> <a href ="/abs/2408.15549" title="Abstract" id="2408.15549"> arXiv:2408.15549 </a> (replaced) [<a href="/pdf/2408.15549" title="Download PDF" id="pdf-2408.15549" aria-labelledby="pdf-2408.15549">pdf</a>, <a href="https://arxiv.org/html/2408.15549v2" title="View HTML" id="html-2408.15549" aria-labelledby="html-2408.15549" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.15549" title="Other formats" id="oth-2408.15549" aria-labelledby="oth-2408.15549">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WildFeedback: Aligning LLMs With In-situ User Interactions And Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+T">Taiwei Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhuoer Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Longqi Yang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Ying-Chun Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zexue He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+M">Mengting Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Pei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jauhar,+S">Sujay Jauhar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Sihao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+S">Shan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongfei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jieyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xiaofeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xia Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Neville,+J">Jennifer Neville</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As large language models (LLMs) continue to advance, aligning these models with human preferences has emerged as a critical challenge. Traditional alignment methods, relying on human or LLM annotated datasets, are limited by their resource-intensive nature, inherent subjectivity, misalignment with real-world user preferences, and the risk of feedback loops that amplify model biases. To overcome these limitations, we introduce WildFeedback, a novel framework that leverages in-situ user feedback during conversations with LLMs to create preference datasets automatically. 
Given a corpus of multi-turn user-LLM conversation, WildFeedback identifies and classifies user feedback to LLM responses between conversation turns. The user feedback is then used to create examples of preferred and dispreferred responses according to users' preference. Our experiments demonstrate that LLMs fine-tuned on WildFeedback dataset exhibit significantly improved alignment with user preferences, as evidenced by both traditional benchmarks and our proposed checklist-guided evaluation. By incorporating in-situ feedback from actual users, WildFeedback addresses the scalability, subjectivity, and bias challenges that plague existing approaches, marking a significant step toward developing LLMs that are more responsive to the diverse and evolving needs of their users. </p> </div> </dd> <dt> <a name='item328'>[328]</a> <a href ="/abs/2408.16756" title="Abstract" id="2408.16756"> arXiv:2408.16756 </a> (replaced) [<a href="/pdf/2408.16756" title="Download PDF" id="pdf-2408.16756" aria-labelledby="pdf-2408.16756">pdf</a>, <a href="https://arxiv.org/html/2408.16756v3" title="View HTML" id="html-2408.16756" aria-labelledby="html-2408.16756" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.16756" title="Other formats" id="oth-2408.16756" aria-labelledby="oth-2408.16756">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Well Do LLMs Handle Cantonese? 
Benchmarking Cantonese Capabilities of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jiyue Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+P">Pengan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+Q">Qinghang Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingpeng Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuan Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid evolution of large language models (LLMs) has transformed the competitive landscape in natural language processing (NLP), particularly for English and other data-rich languages. However, underrepresented languages like Cantonese, spoken by over 85 million people, face significant development gaps, which is particularly concerning given the economic significance of the Guangdong-Hong Kong-Macau Greater Bay Area, and in substantial Cantonese-speaking populations in places like Singapore and North America. Despite its wide use, Cantonese has scant representation in NLP research, especially compared to other languages from similarly developed regions. 
To bridge these gaps, we outline current Cantonese NLP methods and introduce new benchmarks designed to evaluate LLM performance in factual generation, mathematical logic, complex reasoning, and general knowledge in Cantonese, which aim to advance open-source Cantonese LLM technology. We also propose future research directions and recommended models to enhance Cantonese LLM development. </p> </div> </dd> <dt> <a name='item329'>[329]</a> <a href ="/abs/2409.00557" title="Abstract" id="2409.00557"> arXiv:2409.00557 </a> (replaced) [<a href="/pdf/2409.00557" title="Download PDF" id="pdf-2409.00557" aria-labelledby="pdf-2409.00557">pdf</a>, <a href="https://arxiv.org/html/2409.00557v3" title="View HTML" id="html-2409.00557" aria-labelledby="html-2409.00557" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00557" title="Other formats" id="oth-2409.00557" aria-labelledby="oth-2409.00557">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Ask: When LLM Agents Meet Unclear Instruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Juluan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zixuan Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+Y">Yuk-Kit Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chaozheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+C">Cheryl Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Youliang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiao,+W">Wenxiang Jiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+M+R">Michael R. 
Lyu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Software Engineering (cs.SE) </div> <p class='mathjax'> Equipped with the capability to call functions, modern large language models (LLMs) can leverage external tools for addressing a range of tasks unattainable through language skills alone. However, the effective execution of these tools relies heavily not just on the advanced capabilities of LLMs but also on precise user instructions, which often cannot be ensured in the real world. To evaluate the performance of LLMs tool-use under imperfect instructions, we meticulously examine the real-world instructions queried from users, analyze the error patterns, and build a challenging tool-use benchmark called Noisy ToolBench (NoisyToolBench). We find that due to the next-token prediction training objective, LLMs tend to arbitrarily generate the missed argument, which may lead to hallucinations and risks. To address this issue, we propose a novel framework, Ask-when-Needed (AwN), which prompts LLMs to ask questions to users whenever they encounter obstacles due to unclear instructions. Moreover, to reduce the manual labor involved in user-LLM interaction and assess LLMs performance in tool utilization from both accuracy and efficiency perspectives, we design an automated evaluation tool named ToolEvaluator. Our experiments demonstrate that the AwN significantly outperforms existing frameworks for tool learning in the NoisyToolBench. We will release all related code and datasets to support future research. 
</p> </div> </dd> <dt> <a name='item330'>[330]</a> <a href ="/abs/2409.03257" title="Abstract" id="2409.03257"> arXiv:2409.03257 </a> (replaced) [<a href="/pdf/2409.03257" title="Download PDF" id="pdf-2409.03257" aria-labelledby="pdf-2409.03257">pdf</a>, <a href="https://arxiv.org/html/2409.03257v2" title="View HTML" id="html-2409.03257" aria-labelledby="html-2409.03257" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03257" title="Other formats" id="oth-2409.03257" aria-labelledby="oth-2409.03257">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding LLM Development Through Longitudinal Study: Insights from the Open Ko-LLM Leaderboard </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+C">Chanjun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyeonwoo Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Industry </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper conducts a longitudinal study over eleven months to address the limitations of prior research on the Open Ko-LLM Leaderboard, which have relied on empirical studies with restricted observation periods of only five months. By extending the analysis duration, we aim to provide a more comprehensive understanding of the progression in developing Korean large language models (LLMs). Our study is guided by three primary research questions: (1) What are the specific challenges in improving LLM performance across diverse tasks on the Open Ko-LLM Leaderboard over time? (2) How does model size impact task performance correlations across various benchmarks? 
(3) How have the patterns in leaderboard rankings shifted over time on the Open Ko-LLM Leaderboard? By analyzing 1,769 models over this period, our research offers a comprehensive examination of the ongoing advancements in LLMs and the evolving nature of evaluation frameworks. </p> </div> </dd> <dt> <a name='item331'>[331]</a> <a href ="/abs/2409.09866" title="Abstract" id="2409.09866"> arXiv:2409.09866 </a> (replaced) [<a href="/pdf/2409.09866" title="Download PDF" id="pdf-2409.09866" aria-labelledby="pdf-2409.09866">pdf</a>, <a href="https://arxiv.org/html/2409.09866v2" title="View HTML" id="html-2409.09866" aria-labelledby="html-2409.09866" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.09866" title="Other formats" id="oth-2409.09866" aria-labelledby="oth-2409.09866">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> S2Cap: A Benchmark and a Baseline for Singing Style Captioning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ok,+H">Hyunjong Ok</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaeho Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Singing voices contain much richer information than common voices, such as diverse vocal and acoustic characteristics. However, existing open-source audio-text datasets for singing voices capture only a limited set of attributes and lack acoustic features, leading to limited utility towards downstream tasks, such as style captioning. 
To fill this gap, we formally consider the task of singing style captioning and introduce S2Cap, a singing voice dataset with comprehensive descriptions of diverse vocal, acoustic and demographic attributes. Based on this dataset, we develop a simple yet effective baseline algorithm for the singing style captioning. The algorithm utilizes two novel technical components: CRESCENDO for mitigating misalignment between pretrained unimodal models, and demixing supervision to regularize the model to focus on the singing voice. Despite its simplicity, the proposed method outperforms state-of-the-art baselines. </p> </div> </dd> <dt> <a name='item332'>[332]</a> <a href ="/abs/2409.12929" title="Abstract" id="2409.12929"> arXiv:2409.12929 </a> (replaced) [<a href="/pdf/2409.12929" title="Download PDF" id="pdf-2409.12929" aria-labelledby="pdf-2409.12929">pdf</a>, <a href="https://arxiv.org/html/2409.12929v2" title="View HTML" id="html-2409.12929" aria-labelledby="html-2409.12929" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.12929" title="Other formats" id="oth-2409.12929" aria-labelledby="oth-2409.12929">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LogicPro: Improving Complex Logical Reasoning via Program-Guided Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Y">Yonggang Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+S">Shuai Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mengdi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yixin Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+L">Liangcai Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Z">Zhi Tang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this paper, we propose a new data synthesis method called \textbf{LogicPro}, which leverages LeetCode-style algorithm \underline{Pro}blems and their corresponding \underline{Pro}gram solutions to synthesize Complex \underline{Logic}al Reasoning data in text format. First, we synthesize complex reasoning problems through source algorithm problems and test cases. Then, standard answers and intermediate variable outputs are obtained for each problem based on standard python solutions and test cases. Finally, with the guidance of code intermediate variables, we synthesize the text reasoning process for each reasoning problem. Through this method, we can synthesize data that is difficult, scalable, effective, and comes with golden standard answers and high-quality reasoning processes. As a result, with our 540K synthesized dataset constructed solely from 2,360 algorithm problems, our approach achieves significant improvements in multiple models for the datasets \textit{BBH$^{27}$}, \textit{LogicBench}, \textit{DROP}, \textit{AR-LSAT}, and \textit{GSM8K}, etc., outperforming a wide range of existing reasoning datasets. <br>Code and data are publicly available at <a href="https://github.com/jiangjin1999/LogicPro" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item333'>[333]</a> <a href ="/abs/2409.13338" title="Abstract" id="2409.13338"> arXiv:2409.13338 </a> (replaced) [<a href="/pdf/2409.13338" title="Download PDF" id="pdf-2409.13338" aria-labelledby="pdf-2409.13338">pdf</a>, <a href="https://arxiv.org/html/2409.13338v2" title="View HTML" id="html-2409.13338" aria-labelledby="html-2409.13338" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13338" title="Other formats" id="oth-2409.13338" aria-labelledby="oth-2409.13338">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Time Awareness in Large Language Models: Benchmarking Fact Recall Across Time </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Herel,+D">David Herel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartek,+V">Vojtech Bartek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jirak,+J">Jiri Jirak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mikolov,+T">Tomas Mikolov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Who is the US President? The answer changes depending on when the question is asked. While large language models (LLMs) are evaluated on various reasoning tasks, they often miss a crucial dimension: time. In real-world scenarios, the correctness of answers is frequently tied to temporal context. To address this gap, we present a novel framework and dataset spanning over 8,000 events from 2018 to 2024, annotated with day-level granularity and sourced globally across domains such as politics, science, and business. 
Our TimeShift evaluation method systematically probes LLMs for temporal reasoning, revealing that base models often outperform instruction-tuned and synthetic-trained counterparts on time-sensitive recall. Additionally, we find that even large-scale models exhibit brittleness in handling paraphrased facts, highlighting unresolved challenges in temporal consistency. By identifying these limitations, our work provides a significant step toward advancing time-aware language models capable of adapting to the dynamic nature of real-world knowledge. </p> </div> </dd> <dt> <a name='item334'>[334]</a> <a href ="/abs/2409.13694" title="Abstract" id="2409.13694"> arXiv:2409.13694 </a> (replaced) [<a href="/pdf/2409.13694" title="Download PDF" id="pdf-2409.13694" aria-labelledby="pdf-2409.13694">pdf</a>, <a href="https://arxiv.org/html/2409.13694v3" title="View HTML" id="html-2409.13694" aria-labelledby="html-2409.13694" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13694" title="Other formats" id="oth-2409.13694" aria-labelledby="oth-2409.13694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Source Knowledge Pruning for Retrieval-Augmented Generation: A Benchmark and Empirical Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+S">Shuo Yu</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Mingyue Cheng</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiqian Yang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Ouyang,+J">Jie Ouyang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yucong Luo</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+C">Chenyi Lei</a> (2), <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qi Liu</a> (1), <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a> (1) ((1) State Key Laboratory of Cognitive Intelligence, University of Science and Technology of China, Hefei, China (2) Kuaishou Technology, Beijing, China)</div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 9 figures; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) is increasingly recognized as an effective approach to mitigating the hallucination of large language models (LLMs) through the integration of external knowledge. Despite numerous efforts, most studies focus on a single type of external knowledge source. In contrast, most real-world applications involve diverse knowledge from various sources, a scenario that has been relatively underexplored. The main dilemma is the lack of a suitable dataset incorporating multiple knowledge sources and pre-exploration of the associated issues. To address these challenges, we standardize a benchmark dataset that combines structured and unstructured knowledge across diverse and complementary domains. Building upon the dataset, we identify the limitations of existing methods under such conditions. Therefore, we develop PruningRAG, a plug-and-play RAG framework that uses multi-granularity pruning strategies to more effectively incorporate relevant context and mitigate the negative impact of misleading information. Extensive experimental results demonstrate superior performance of PruningRAG and our insightful findings are also reported. Our dataset and code are publicly available\footnote{<a href="https://github.com/USTCAGI/PruningRAG" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item335'>[335]</a> <a href ="/abs/2409.13949" title="Abstract" id="2409.13949"> arXiv:2409.13949 </a> (replaced) [<a href="/pdf/2409.13949" title="Download PDF" id="pdf-2409.13949" aria-labelledby="pdf-2409.13949">pdf</a>, <a href="/format/2409.13949" title="Other formats" id="oth-2409.13949" aria-labelledby="oth-2409.13949">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mufu: Multilingual Fused Learning for Low-Resource Translation with LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+Z+W">Zheng Wei Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+N">Nitish Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+H">Honglin Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cohn,+T">Trevor Cohn</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 29 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual large language models (LLMs) are great translators, but this is largely limited to high-resource languages. For many LLMs, translating in and out of low-resource languages remains a challenging task. To maximize data efficiency in this low-resource setting, we introduce Mufu, which includes a selection of automatically generated multilingual candidates and an instruction to correct inaccurate translations in the prompt. Mufu prompts turn a translation task into a postediting one, and seek to harness the LLM's reasoning capability with auxiliary translation candidates, from which the model is required to assess the input quality, align the semantics cross-lingually, copy from relevant inputs and override instances that are incorrect. 
Our experiments on En-XX translations over the Flores-200 dataset show LLMs finetuned against Mufu-style prompts are robust to poor quality auxiliary translation candidates, achieving performance superior to NLLB 1.3B distilled model in 64% of low- and very-low-resource language pairs. We then distill these models to reduce inference cost, while maintaining on average 3.1 chrF improvement over finetune-only baseline in low-resource translations. </p> </div> </dd> <dt> <a name='item336'>[336]</a> <a href ="/abs/2409.14509" title="Abstract" id="2409.14509"> arXiv:2409.14509 </a> (replaced) [<a href="/pdf/2409.14509" title="Download PDF" id="pdf-2409.14509" aria-labelledby="pdf-2409.14509">pdf</a>, <a href="https://arxiv.org/html/2409.14509v4" title="View HTML" id="html-2409.14509" aria-labelledby="html-2409.14509" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.14509" title="Other formats" id="oth-2409.14509" aria-labelledby="oth-2409.14509">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can AI writing be salvaged? 
Mitigating Idiosyncrasies and Improving Human-AI Alignment in the Writing Process through Edits </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chakrabarty,+T">Tuhin Chakrabarty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NLP+HCI, Behavioral Science </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> LLM-based applications are helping people write, and LLM-generated text is making its way into social media, journalism, and our classrooms. However, the differences between LLM-generated and human-written text remain unclear. To explore this, we hired professional writers to edit paragraphs in several creative domains. We first found these writers agree on undesirable idiosyncrasies in LLM-generated text, formalizing it into a seven-category taxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP corpus: 1,057 LLM-generated paragraphs edited by professional writers according to our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our study (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms of writing quality, revealing common limitations across model families. Third, we explored automatic editing methods to improve LLM-generated text. A large-scale preference annotation confirms that although experts largely prefer text edited by other experts, automatic editing methods show promise in improving alignment between LLM-generated and human-written text. 
</p> </div> </dd> <dt> <a name='item337'>[337]</a> <a href ="/abs/2409.18339" title="Abstract" id="2409.18339"> arXiv:2409.18339 </a> (replaced) [<a href="/pdf/2409.18339" title="Download PDF" id="pdf-2409.18339" aria-labelledby="pdf-2409.18339">pdf</a>, <a href="https://arxiv.org/html/2409.18339v2" title="View HTML" id="html-2409.18339" aria-labelledby="html-2409.18339" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.18339" title="Other formats" id="oth-2409.18339" aria-labelledby="oth-2409.18339">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+X">Xin Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Y">Yuan Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sethu,+V">Vidhyasaharan Sethu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+T">Ting Dang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have demonstrated great success in many Natural Language Processing (NLP) tasks. In addition to their cognitive intelligence, exploring their capabilities in emotional intelligence is also crucial, as it enables more natural and empathetic conversational AI. Recent studies have shown LLMs' capability in recognizing emotions, but they often focus on single emotion labels and overlook the complex and ambiguous nature of human emotions. 
This study is the first to address this gap by exploring the potential of LLMs in recognizing ambiguous emotions, leveraging their strong generalization capabilities and in-context learning. We design zero-shot and few-shot prompting and incorporate past dialogue as context information for ambiguous emotion recognition. Experiments conducted using three datasets indicate significant potential for LLMs in recognizing ambiguous emotions, and highlight the substantial benefits of including context information. Furthermore, our findings indicate that LLMs demonstrate a high degree of effectiveness in recognizing less ambiguous emotions and exhibit potential for identifying more ambiguous emotions, paralleling human perceptual capabilities. </p> </div> </dd> <dt> <a name='item338'>[338]</a> <a href ="/abs/2410.02465" title="Abstract" id="2410.02465"> arXiv:2410.02465 </a> (replaced) [<a href="/pdf/2410.02465" title="Download PDF" id="pdf-2410.02465" aria-labelledby="pdf-2410.02465">pdf</a>, <a href="https://arxiv.org/html/2410.02465v2" title="View HTML" id="html-2410.02465" aria-labelledby="html-2410.02465" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02465" title="Other formats" id="oth-2410.02465" aria-labelledby="oth-2410.02465">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revealing the Inherent Instructability of Pre-Trained Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=An,+S">Seokhyun An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Minji Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyounghun Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 31 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI) </div> <p class='mathjax'> Instruction tuning -- supervised fine-tuning using instruction-response pairs -- is a key step in making pre-trained large language models (LLMs) instructable. Meanwhile, LLMs perform multitask learning during their pre-training, acquiring extensive knowledge and capabilities. We hypothesize that the pre-training stage can enable them to develop the ability to comprehend and address instructions. To verify this, we propose Response Tuning (RT), which removes the instruction and its corresponding mapping to the response from instruction tuning. Instead, it focuses solely on establishing the response distribution. Our experiments demonstrate that RT models, trained only on responses, can effectively respond to a wide range of instructions and exhibit helpfulness approaching that of their instruction-tuned counterparts. In addition, we observe that the models can recognize and reject unsafe queries after learning the refusal conditions from training responses. Furthermore, we demonstrate that these observations also hold in an in-context learning setting. These findings support our hypothesis, highlighting the extensive inherent capabilities of pre-trained LLMs. 
</p> </div> </dd> <dt> <a name='item339'>[339]</a> <a href ="/abs/2410.02743" title="Abstract" id="2410.02743"> arXiv:2410.02743 </a> (replaced) [<a href="/pdf/2410.02743" title="Download PDF" id="pdf-2410.02743" aria-labelledby="pdf-2410.02743">pdf</a>, <a href="https://arxiv.org/html/2410.02743v2" title="View HTML" id="html-2410.02743" aria-labelledby="html-2410.02743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02743" title="Other formats" id="oth-2410.02743" aria-labelledby="oth-2410.02743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MA-RLHF: Reinforcement Learning from Human Feedback with Macro Actions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chai,+Y">Yekun Chai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haoran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+H">Huang Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuohuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hua Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reinforcement learning from human feedback (RLHF) has demonstrated effectiveness in aligning large language models (LLMs) with human preferences. However, token-level RLHF suffers from the credit assignment problem over long sequences, where delayed rewards make it challenging for the model to discern which actions contributed to preferred outcomes. 
This hinders learning efficiency and slows convergence. In this paper, we propose MA-RLHF, a simple yet effective RLHF framework that incorporates macro actions -- sequences of tokens or higher-level language constructs -- into the learning process. By operating at a higher level of abstraction, our approach reduces the temporal distance between actions and rewards, facilitating faster and more accurate credit assignment. This results in more stable policy gradient estimates and enhances learning efficiency within each episode, all without increasing computational complexity during training or inference. We validate our approach through extensive experiments across various model sizes and tasks, including text summarization, dialogue generation, question answering, and program synthesis. Our method achieves substantial performance improvements over standard RLHF, with performance gains of up to 30% in text summarization and code generation, 18% in dialogue, and 8% in question answering tasks. Notably, our approach reaches parity with vanilla RLHF 1.7 ~ 2 times faster in terms of training time and continues to outperform it with further training. We make our code and data publicly available at <a href="https://github.com/ernie-research/MA-RLHF" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item340'>[340]</a> <a href ="/abs/2410.04524" title="Abstract" id="2410.04524"> arXiv:2410.04524 </a> (replaced) [<a href="/pdf/2410.04524" title="Download PDF" id="pdf-2410.04524" aria-labelledby="pdf-2410.04524">pdf</a>, <a href="https://arxiv.org/html/2410.04524v2" title="View HTML" id="html-2410.04524" aria-labelledby="html-2410.04524" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.04524" title="Other formats" id="oth-2410.04524" aria-labelledby="oth-2410.04524">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Secure Tuning: Mitigating Security Risks from Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yanrui Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sendong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jiawei Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+M">Ming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+D">Danyang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+S">Shuren Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+F">Fenglei Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Instruction fine-tuning has emerged as a critical technique for customizing Large Language Models (LLMs) to specific applications. However, recent studies have highlighted significant security vulnerabilities in fine-tuned LLMs. 
Existing defense efforts focus more on pre-training and post-training methods, yet in-training methods remain underexplored. To fill this gap, we introduce a novel secure-tuning strategy called SWAT. By analyzing how module-level parameters (e.g. Q/K/V/O) affect the security feature space drift, we identify a robust subset of modules, termed Mods_Rob. Our SWAT strategy begins by warming up Mods_Rob to capture low-level features with minimal security risks, followed by training all parameters to achieve optimal task performance. Essentially, this strategy shifts the early learning burden more from global parameters to Mods_Rob, reducing update magnitudes of the non-robust subset. Across various datasets, scenarios, and LLMs, our strategy has demonstrated significant success in mitigating security risks while preserving task performance. Importantly, it can be seamlessly integrated with pre-training and post-training methods, leading to greater improvements. </p> </div> </dd> <dt> <a name='item341'>[341]</a> <a href ="/abs/2410.05248" title="Abstract" id="2410.05248"> arXiv:2410.05248 </a> (replaced) [<a href="/pdf/2410.05248" title="Download PDF" id="pdf-2410.05248" aria-labelledby="pdf-2410.05248">pdf</a>, <a href="https://arxiv.org/html/2410.05248v2" title="View HTML" id="html-2410.05248" aria-labelledby="html-2410.05248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.05248" title="Other formats" id="oth-2410.05248" aria-labelledby="oth-2410.05248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SFTMix: Elevating Language Model Instruction Tuning with Mixup Recipe </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yuxin Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shujian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Wenxuan Zhou</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ghassemi,+M">Marzyeh Ghassemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sanqiang Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> To acquire instruction-following capabilities, large language models (LLMs) undergo instruction tuning, where they are trained on instruction-response pairs using next-token prediction (NTP). Efforts to improve instruction tuning often focus on higher-quality supervised fine-tuning (SFT) datasets, typically requiring data filtering with proprietary LLMs or human annotation. In this paper, we take a different approach by proposing SFTMix, a novel Mixup-based recipe that elevates LLM instruction tuning beyond the conventional NTP paradigm, without relying on well-curated datasets. Observing that LLMs exhibit uneven confidence across the semantic representation space, we argue that examples with different confidence levels should play distinct roles in instruction tuning--confident data is prone to overfitting, while unconfident data is harder to generalize. Based on this insight, SFTMix leverages training dynamics to identify examples with varying confidence levels, interpolates them to bridge the confidence gap, and applies a Mixup-based regularization to support learning on these additional, interpolated examples. By propagating supervision signals across confidence regions and encouraging linear behavior between them, SFTMix mitigates overfitting in confident examples while enhancing generalization in unconfident ones. We demonstrate the effectiveness of SFTMix in both instruction-following and healthcare-specific SFT tasks, with consistent improvements across LLM families and SFT datasets of varying sizes and qualities. 
Extensive analyses across six directions highlight SFTMix's compatibility with data selection, adaptability to compute-constrained scenarios, and scalability to broader applications. </p> </div> </dd> <dt> <a name='item342'>[342]</a> <a href ="/abs/2410.07173" title="Abstract" id="2410.07173"> arXiv:2410.07173 </a> (replaced) [<a href="/pdf/2410.07173" title="Download PDF" id="pdf-2410.07173" aria-labelledby="pdf-2410.07173">pdf</a>, <a href="https://arxiv.org/html/2410.07173v2" title="View HTML" id="html-2410.07173" aria-labelledby="html-2410.07173" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.07173" title="Other formats" id="oth-2410.07173" aria-labelledby="oth-2410.07173">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Better Language Models Exhibit Higher Visual Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ruthardt,+J">Jona Ruthardt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Burghouts,+G+J">Gertjan J. Burghouts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belongie,+S">Serge Belongie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Asano,+Y+M">Yuki M. Asano</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> How well do text-only Large Language Models (LLMs) naturally align with the visual world? We provide the first direct analysis by utilizing frozen text representations in a discriminative vision-language model framework and measuring zero-shot generalization on unseen classes. We find decoder-based LLMs exhibit high intrinsic visual alignment. In particular, more capable LLMs reliably demonstrate stronger generalization. 
Moreover, utilizing frozen LLMs leads to strong gains in cross-lingual settings, where our approach surpasses CLIP's accuracy of 1.4% with 38.7% for Chinese. Our proposed method improves both robustness and generalization and also significantly reduces the need for paired data and compute, making vision-language models more accessible and adaptable. </p> </div> </dd> <dt> <a name='item343'>[343]</a> <a href ="/abs/2410.09412" title="Abstract" id="2410.09412"> arXiv:2410.09412 </a> (replaced) [<a href="/pdf/2410.09412" title="Download PDF" id="pdf-2410.09412" aria-labelledby="pdf-2410.09412">pdf</a>, <a href="https://arxiv.org/html/2410.09412v2" title="View HTML" id="html-2410.09412" aria-labelledby="html-2410.09412" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.09412" title="Other formats" id="oth-2410.09412" aria-labelledby="oth-2410.09412">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FB-Bench: A Fine-Grained Multi-Task Benchmark for Evaluating LLMs' Responsiveness to Human Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Youquan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+M">Miao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+G">Guosheng Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+B">Bin Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Weipeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zenan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wentao Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> 
<p class='mathjax'> Human feedback is crucial in the interactions between humans and Large Language Models (LLMs). However, existing research primarily focuses on benchmarking LLMs in single-turn dialogues. Even in benchmarks designed for multi-turn dialogues, the user inputs are often independent, neglecting the nuanced and complex nature of human feedback within real-world usage scenarios. To fill this research gap, we introduce FB-Bench, a fine-grained, multi-task benchmark designed to evaluate LLMs' responsiveness to human feedback under real-world usage scenarios in Chinese. Drawing from the two main interaction scenarios, FB-Bench comprises 591 meticulously curated samples, encompassing eight task types, five deficiency types of response, and nine feedback types. We extensively evaluate a broad array of popular LLMs, revealing significant variations in their performance across different interaction scenarios. Further analysis indicates that task, human feedback, and deficiencies of previous responses can also significantly impact LLMs' responsiveness. Our findings underscore both the strengths and limitations of current models, providing valuable insights and directions for future research. Code and datasets are available at <a href="https://github.com/PKU-Baichuan-MLSystemLab/FB-Bench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item344'>[344]</a> <a href ="/abs/2410.10724" title="Abstract" id="2410.10724"> arXiv:2410.10724 </a> (replaced) [<a href="/pdf/2410.10724" title="Download PDF" id="pdf-2410.10724" aria-labelledby="pdf-2410.10724">pdf</a>, <a href="https://arxiv.org/html/2410.10724v2" title="View HTML" id="html-2410.10724" aria-labelledby="html-2410.10724" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10724" title="Other formats" id="oth-2410.10724" aria-labelledby="oth-2410.10724">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models Are Active Critics in NLG Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+S">Shuying Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Junjie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Ming Jiang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The conventional paradigm of using large language models (LLMs) for natural language generation (NLG) evaluation relies on pre-defined task definitions and evaluation criteria, positioning LLMs as "passive critics" that strictly follow developer-provided guidelines. However, human evaluators often apply implicit criteria, and their expectations in practice can vary widely based on specific end-user needs. Consequently, these rigid evaluation methods struggle to adapt to diverse scenarios without extensive prompt customization. To address this, we introduce Active-Critic, a novel LLM-based evaluator that transforms LLMs into "active critics'' capable of adapting to diverse NLG tasks using limited example data. 
Active-Critic consists of two stages: (1) self-inferring the target NLG task and relevant evaluation criteria, and (2) dynamically optimizing prompts to produce human-aligned scores along with detailed justifications. Our experiments show that Active-Critic can generate nuanced, context-aware evaluation criteria, enabling it to achieve superior alignment with human judgments across multiple tasks. </p> </div> </dd> <dt> <a name='item345'>[345]</a> <a href ="/abs/2410.10863" title="Abstract" id="2410.10863"> arXiv:2410.10863 </a> (replaced) [<a href="/pdf/2410.10863" title="Download PDF" id="pdf-2410.10863" aria-labelledby="pdf-2410.10863">pdf</a>, <a href="https://arxiv.org/html/2410.10863v2" title="View HTML" id="html-2410.10863" aria-labelledby="html-2410.10863" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10863" title="Other formats" id="oth-2410.10863" aria-labelledby="oth-2410.10863">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Personality Traits of LLMs through Latent Features Steering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Shenzhe Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Liang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+L">Lijie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mengdi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Di Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have 
significantly advanced dialogue systems and role-playing agents through their ability to generate human-like text. While prior studies have shown that LLMs can exhibit distinct and consistent personalities, the mechanisms through which these models encode and express specific personality traits remain poorly understood. To address this, we investigate how various factors, such as cultural norms and environmental stressors, encoded within LLMs, shape their personality traits, guided by the theoretical framework of social determinism. Inspired by related work on LLM interpretability, we propose a training-free approach to modify the model's behavior by extracting and steering latent features corresponding to factors within the model, thereby eliminating the need for retraining. Furthermore, we analyze the implications of these factors for model safety, focusing on their impact through the lens of personality. </p> </div> </dd> <dt> <a name='item346'>[346]</a> <a href ="/abs/2410.12323" title="Abstract" id="2410.12323"> arXiv:2410.12323 </a> (replaced) [<a href="/pdf/2410.12323" title="Download PDF" id="pdf-2410.12323" aria-labelledby="pdf-2410.12323">pdf</a>, <a href="https://arxiv.org/html/2410.12323v2" title="View HTML" id="html-2410.12323" aria-labelledby="html-2410.12323" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12323" title="Other formats" id="oth-2410.12323" aria-labelledby="oth-2410.12323">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reversal of Thought: Enhancing Large Language Models with Preference-Guided Reverse Reasoning Warm-up </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiahao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+D">Dehui Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hao Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Di,+Z">Zixiang Di</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable performance in reasoning tasks but face limitations in mathematical and complex logical reasoning. Existing methods to improve LLMs' logical capabilities either involve traceable or verifiable logical sequences that generate more reliable responses by constructing logical structures yet increase computational costs, or introduces rigid logic template rules, reducing flexibility. In this paper, we propose Reversal of Thought (RoT), a plug-and-play and cost-effective reasoning framework designed to enhance the logical reasoning abilities of LLMs during the warm-up phase prior to batch inference. RoT utilizes a Preference-Guided Reverse Reasoning warm-up strategy, which integrates logical symbols for pseudocode planning through meta-cognitive mechanisms and pairwise preference self-evaluation to generate task-specific prompts solely through demonstrations, aligning with LLMs' cognitive preferences shaped by RLHF. Through reverse reasoning, we utilize a Cognitive Preference Manager to assess knowledge boundaries and further expand LLMs' reasoning capabilities by aggregating solution logic for known tasks and stylistic templates for unknown tasks. Experiments across various tasks demonstrate that RoT surpasses existing baselines in both reasoning accuracy and efficiency. 
</p> </div> </dd> <dt> <a name='item347'>[347]</a> <a href ="/abs/2410.12445" title="Abstract" id="2410.12445"> arXiv:2410.12445 </a> (replaced) [<a href="/pdf/2410.12445" title="Download PDF" id="pdf-2410.12445" aria-labelledby="pdf-2410.12445">pdf</a>, <a href="https://arxiv.org/html/2410.12445v2" title="View HTML" id="html-2410.12445" aria-labelledby="html-2410.12445" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12445" title="Other formats" id="oth-2410.12445" aria-labelledby="oth-2410.12445">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Open Ko-LLM Leaderboard2: Bridging Foundational and Practical Evaluation for Korean LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyeonwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dahyun Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jihoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sukyung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Yungi Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+C">Chanjun Park</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Industry </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The Open Ko-LLM Leaderboard has been instrumental in benchmarking Korean Large Language Models (LLMs), yet it has certain limitations. Notably, the disconnect between quantitative improvements on the overly academic leaderboard benchmarks and the qualitative impact of the models should be addressed. 
Furthermore, the benchmark suite is largely composed of translated versions of their English counterparts, which may not fully capture the intricacies of the Korean language. To address these issues, we propose Open Ko-LLM Leaderboard2, an improved version of the earlier Open Ko-LLM Leaderboard. The original benchmarks are entirely replaced with new tasks that are more closely aligned with real-world capabilities. Additionally, four new native Korean benchmarks are introduced to better reflect the distinct characteristics of the Korean language. Through these refinements, Open Ko-LLM Leaderboard2 seeks to provide a more meaningful evaluation for advancing Korean LLMs. </p> </div> </dd> <dt> <a name='item348'>[348]</a> <a href ="/abs/2410.12480" title="Abstract" id="2410.12480"> arXiv:2410.12480 </a> (replaced) [<a href="/pdf/2410.12480" title="Download PDF" id="pdf-2410.12480" aria-labelledby="pdf-2410.12480">pdf</a>, <a href="https://arxiv.org/html/2410.12480v2" title="View HTML" id="html-2410.12480" aria-labelledby="html-2410.12480" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12480" title="Other formats" id="oth-2410.12480" aria-labelledby="oth-2410.12480">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> KcMF: A Knowledge-compliant Framework for Schema and Entity Matching with Fine-tuning-free LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yongqin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Huan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Ke Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shou,+L">Lidan Shou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review; new results and analysis added, typos corrected </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Databases (cs.DB); Machine Learning (cs.LG) </div> <p class='mathjax'> Schema matching (SM) and entity matching (EM) tasks are crucial for data integration. While large language models (LLMs) have shown promising results in these tasks, they suffer from hallucinations and confusion about task instructions. This study presents the Knowledge-Compliant Matching Framework (KcMF), an LLM-based approach that addresses these issues without the need for domain-specific fine-tuning. KcMF employs a once-and-for-all pseudo-code-based task decomposition strategy to adopt natural language statements that guide LLM reasoning and reduce confusion across various task types. We also propose two mechanisms, Dataset as Knowledge (DaK) and Example as Knowledge (EaK), to build domain knowledge sets when unstructured domain knowledge is lacking. Moreover, we introduce a result-ensemble strategy to leverage multiple knowledge sources and suppress badly formatted outputs. Extensive evaluations confirm that KcMF clearly enhances five LLM backbones in both SM and EM tasks while outperforming the non-LLM competitors by an average F1-score of 17.93%. 
</p> </div> </dd> <dt> <a name='item349'>[349]</a> <a href ="/abs/2410.12691" title="Abstract" id="2410.12691"> arXiv:2410.12691 </a> (replaced) [<a href="/pdf/2410.12691" title="Download PDF" id="pdf-2410.12691" aria-labelledby="pdf-2410.12691">pdf</a>, <a href="https://arxiv.org/html/2410.12691v5" title="View HTML" id="html-2410.12691" aria-labelledby="html-2410.12691" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12691" title="Other formats" id="oth-2410.12691" aria-labelledby="oth-2410.12691">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Building Better: Avoiding Pitfalls in Developing Language Resources when Data is Scarce </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ousidhoum,+N">Nedjma Ousidhoum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beloucif,+M">Meriem Beloucif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammad,+S+M">Saif M. Mohammad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> Language is a symbolic capital that affects people's lives in many ways (Bourdieu, 1977, 1991). It is a powerful tool that accounts for identities, cultures, traditions, and societies in general. Hence, data in a given language should be viewed as more than a collection of tokens. Good data collection and labeling practices are key to building more human-centered and socially aware technologies. While there has been a rising interest in mid- to low-resource languages within the NLP community, work in this space has to overcome unique challenges such as data scarcity and access to suitable annotators. 
In this paper, we collect feedback from those directly involved in and impacted by NLP artefacts for mid- to low-resource languages. We conduct a quantitative and qualitative analysis of the responses and highlight the main issues related to (1) data quality such as linguistic and cultural data suitability; and (2) the ethics of common annotation practices such as the misuse of online community services. Based on these findings, we make several recommendations for the creation of high-quality language artefacts that reflect the cultural milieu of its speakers, while simultaneously respecting the dignity and labor of data workers. </p> </div> </dd> <dt> <a name='item350'>[350]</a> <a href ="/abs/2410.12916" title="Abstract" id="2410.12916"> arXiv:2410.12916 </a> (replaced) [<a href="/pdf/2410.12916" title="Download PDF" id="pdf-2410.12916" aria-labelledby="pdf-2410.12916">pdf</a>, <a href="https://arxiv.org/html/2410.12916v2" title="View HTML" id="html-2410.12916" aria-labelledby="html-2410.12916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12916" title="Other formats" id="oth-2410.12916" aria-labelledby="oth-2410.12916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MSc-SQL: Multi-Sample Critiquing Small Language Models For Text-To-SQL Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gorti,+S+K">Satya Krishna Gorti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gofman,+I">Ilan Gofman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhaoyan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiapeng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vouitsis,+N">Noël Vouitsis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Guangwei Yu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Cresswell,+J+C">Jesse C. Cresswell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseinzadeh,+R">Rasa Hosseinzadeh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published at NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text-to-SQL generation enables non-experts to interact with databases via natural language. Recent advances rely on large closed-source models like GPT-4 that present challenges in accessibility, privacy, and latency. To address these issues, we focus on developing small, efficient, and open-source text-to-SQL models. We demonstrate the benefits of sampling multiple candidate SQL generations and propose our method, MSc-SQL, to critique them using associated metadata. Our sample critiquing model evaluates multiple outputs simultaneously, achieving state-of-the-art performance compared to other open-source models while remaining competitive with larger models at a much lower cost. Full code can be found at <a href="https://github.com/layer6ai-labs/msc-sql" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item351'>[351]</a> <a href ="/abs/2410.13184" title="Abstract" id="2410.13184"> arXiv:2410.13184 </a> (replaced) [<a href="/pdf/2410.13184" title="Download PDF" id="pdf-2410.13184" aria-labelledby="pdf-2410.13184">pdf</a>, <a href="https://arxiv.org/html/2410.13184v3" title="View HTML" id="html-2410.13184" aria-labelledby="html-2410.13184" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13184" title="Other formats" id="oth-2410.13184" aria-labelledby="oth-2410.13184">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Router-Tuning: A Simple and Effective Approach for Enabling Dynamic-Depth in Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shwai He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+T">Tao Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+G">Guoheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+B">Bowei Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiaoyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+A">Ang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Traditional transformer models often allocate a fixed amount of computational resources to every input token, leading to inefficient and unnecessary computation. To address this, the Mixture of Depths (MoD) was introduced to dynamically adjust the computational depth by skipping less important layers. 
Despite its promise, current MoD approaches remain under-explored and face two main challenges: (1) high training costs due to the need to train the entire model along with the routers that determine which layers to skip, and (2) the risk of performance degradation when important layers are bypassed. In response to the first issue, we propose Router-Tuning, a method that fine-tunes only the router on a small dataset, drastically reducing the computational overhead associated with full model training. For the second challenge, we propose MindSkip, which deploys Attention with Dynamic Depths. This method preserves the model's performance while significantly enhancing computational and memory efficiency. Extensive experiments demonstrate that our approach delivers competitive results while dramatically improving the computation efficiency, e.g., 21\% speedup and only a 0.2\% performance drop. The code is released at <a href="https://github.com/CASE-Lab-UMD/Router-Tuning" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item352'>[352]</a> <a href ="/abs/2410.13276" title="Abstract" id="2410.13276"> arXiv:2410.13276 </a> (replaced) [<a href="/pdf/2410.13276" title="Download PDF" id="pdf-2410.13276" aria-labelledby="pdf-2410.13276">pdf</a>, <a href="https://arxiv.org/html/2410.13276v4" title="View HTML" id="html-2410.13276" aria-labelledby="html-2410.13276" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13276" title="Other formats" id="oth-2410.13276" aria-labelledby="oth-2410.13276">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SeerAttention: Learning Intrinsic Sparse Attention in Your LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yizhao Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Zhichen Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+D">Dayou Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shijie Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Peiyuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+J">Jiaxing Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+J">Junjie Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=So,+H+K">Hayden Kwok-Hay So</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Ting Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Mao Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Attention is the cornerstone of modern Large Language Models (LLMs). 
Yet its quadratic complexity hinders efficiency and scalability, especially for long-context processing. A promising approach is to leverage sparsity in attention. However, existing sparsity-based solutions predominantly rely on predefined patterns or heuristics at the attention head level, struggling to adapt dynamically to different contexts efficiently. <br>We propose SeerAttention, a simple yet effective attention mechanism that directly learns the block-level attention sparsity from the LLM itself. Inspired by the gating mechanism in Mixture of Experts (MoE), SeerAttention augments the conventional attention with a learnable gate that selectively activates important blocks within the attention map. Specifically, the gate first pools the query (Q) and key (K) tensors along the sequence dimension and processes them through learnable linear layers. The resulting matrices are then multiplied together to produce the gating scores, which are used to predict block-level attention sparsity. Combined with our block-sparse FlashAttention kernel, SeerAttention can achieve significant speedup on GPUs. When applied to pre-trained LLMs, SeerAttention only requires training the gate parameters in a lightweight self-distillation manner, allowing rapid convergence. Our evaluation results demonstrate that SeerAttention achieves better model accuracy and lower latency for long-context pre-filling compared to prior methods. 
Code is available at: <a href="https://github.com/microsoft/SeerAttention" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item353'>[353]</a> <a href ="/abs/2410.13305" title="Abstract" id="2410.13305"> arXiv:2410.13305 </a> (replaced) [<a href="/pdf/2410.13305" title="Download PDF" id="pdf-2410.13305" aria-labelledby="pdf-2410.13305">pdf</a>, <a href="https://arxiv.org/html/2410.13305v2" title="View HTML" id="html-2410.13305" aria-labelledby="html-2410.13305" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13305" title="Other formats" id="oth-2410.13305" aria-labelledby="oth-2410.13305">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reference-Based Post-OCR Processing with LLM for Diacritic Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Do,+T">Thao Do</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tran,+D+P">Dinh Phu Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vo,+A">An Vo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Daeyoung Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted in the AAAI 2025 (39th) AISI track. Dataset and repo are in the paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Extracting fine-grained OCR text from aged documents in diacritic languages remains challenging due to unexpected artifacts, time-induced degradation, and lack of datasets. 
While standalone spell correction approaches have been proposed, they show limited performance for historical documents due to numerous possible OCR error combinations and differences between modern and classical corpus distributions. We propose a method utilizing available content-focused ebooks as a reference base to correct imperfect OCR-generated text, supported by large language models. This technique generates high-precision pseudo-page-to-page labels for diacritic languages, where small strokes pose significant challenges in historical conditions. The pipeline eliminates various types of noise from aged documents and addresses issues such as missing characters, words, and disordered sequences. Our post-processing method, which generated a large OCR dataset of classical Vietnamese books, achieved a mean grading score of 8.72 on a 10-point scale. This outperformed the state-of-the-art transformer-based Vietnamese spell correction model, which scored 7.03 when evaluated on a sampled subset of the dataset. We also trained a baseline OCR model to assess and compare it with well-known engines. Experimental results demonstrate the strength of our baseline model compared to widely used open-source solutions. The resulting dataset will be released publicly to support future studies. 
</p> </div> </dd> <dt> <a name='item354'>[354]</a> <a href ="/abs/2410.14157" title="Abstract" id="2410.14157"> arXiv:2410.14157 </a> (replaced) [<a href="/pdf/2410.14157" title="Download PDF" id="pdf-2410.14157" aria-labelledby="pdf-2410.14157">pdf</a>, <a href="https://arxiv.org/html/2410.14157v2" title="View HTML" id="html-2410.14157" aria-labelledby="html-2410.14157" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14157" title="Other formats" id="oth-2410.14157" aria-labelledby="oth-2410.14157">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Autoregression: Discrete Diffusion for Complex Reasoning and Planning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jiacheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jiahui Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+S">Shansan Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+L">Lin Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingpeng Kong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Autoregressive language models, despite their impressive capabilities, struggle with complex reasoning and long-term planning tasks. We introduce discrete diffusion models as a novel solution to these challenges. 
Through the lens of subgoal imbalance, we demonstrate how diffusion models effectively learn difficult subgoals that elude autoregressive approaches. We propose Multi-granularity Diffusion Modeling (MDM), which prioritizes subgoals based on difficulty during learning. On complex tasks like Countdown, Sudoku, and Boolean Satisfiability Problems, MDM significantly outperforms autoregressive models without using search techniques. For instance, MDM achieves 91.5\% and 100\% accuracy on Countdown and Sudoku, respectively, compared to 45.8\% and 20.7\% for autoregressive models. Our work highlights the potential of diffusion-based approaches in advancing AI capabilities for sophisticated language understanding and problem-solving tasks. </p> </div> </dd> <dt> <a name='item355'>[355]</a> <a href ="/abs/2410.14387" title="Abstract" id="2410.14387"> arXiv:2410.14387 </a> (replaced) [<a href="/pdf/2410.14387" title="Download PDF" id="pdf-2410.14387" aria-labelledby="pdf-2410.14387">pdf</a>, <a href="https://arxiv.org/html/2410.14387v2" title="View HTML" id="html-2410.14387" aria-labelledby="html-2410.14387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14387" title="Other formats" id="oth-2410.14387" aria-labelledby="oth-2410.14387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Do Multilingual Language Models Remember Facts? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fierro,+C">Constanza Fierro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Foroutan,+N">Negar Foroutan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elliott,+D">Desmond Elliott</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=S%C3%B8gaard,+A">Anders Søgaard</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) store and retrieve vast amounts of factual knowledge acquired during pre-training. Prior research has localized and identified mechanisms behind knowledge recall; however, it has only focused on English monolingual models. The question of how these mechanisms generalize to non-English languages and multilingual LLMs remains unexplored. In this paper, we address this gap by conducting a comprehensive analysis of three multilingual LLMs. First, we show that previously identified recall mechanisms in English largely apply to multilingual contexts, with nuances based on language and architecture. Next, through patching intermediate representations, we localize the role of language during recall, finding that subject enrichment is language-independent, while object extraction is language-dependent. Additionally, we discover that the last token representation acts as a Function Vector (FV), encoding both the language of the query and the content to be extracted from the subject. Furthermore, in decoder-only LLMs, FVs compose these two pieces of information in two separate stages. These insights reveal unique mechanisms in multilingual LLMs for recalling information, highlighting the need for new methodologies--such as knowledge evaluation, fact editing, and knowledge acquisition--that are specifically tailored for multilingual LLMs. 
</p> </div> </dd> <dt> <a name='item356'>[356]</a> <a href ="/abs/2410.14735" title="Abstract" id="2410.14735"> arXiv:2410.14735 </a> (replaced) [<a href="/pdf/2410.14735" title="Download PDF" id="pdf-2410.14735" aria-labelledby="pdf-2410.14735">pdf</a>, <a href="https://arxiv.org/html/2410.14735v4" title="View HTML" id="html-2410.14735" aria-labelledby="html-2410.14735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14735" title="Other formats" id="oth-2410.14735" aria-labelledby="oth-2410.14735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Agent Skill Acquisition for Large Language Models via CycleQD </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuroki,+S">So Kuroki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakamura,+T">Taishi Nakamura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Akiba,+T">Takuya Akiba</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yujin Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear at the 13th International Conference on Learning Representations (ICLR 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Neural and Evolutionary Computing (cs.NE) </div> <p class='mathjax'> Training large language models to acquire specific skills remains a challenging endeavor. Conventional training approaches often struggle with data distribution imbalances and inadequacies in objective functions that do not align well with task-specific performance. 
To address these challenges, we introduce CycleQD, a novel approach that leverages the Quality Diversity framework through a cyclic adaptation of the algorithm, along with a model merging based crossover and an SVD-based mutation. In CycleQD, each task's performance metric is alternated as the quality measure while the others serve as the behavioral characteristics. This cyclic focus on individual tasks allows for concentrated effort on one task at a time, eliminating the need for data ratio tuning and simplifying the design of the objective function. Empirical results from AgentBench indicate that applying CycleQD to LLAMA3-8B-INSTRUCT based models not only enables them to surpass traditional fine-tuning methods in coding, operating systems, and database tasks, but also achieves performance on par with GPT-3.5-TURBO, which potentially contains much more parameters, across these domains. Crucially, this enhanced performance is achieved while retaining robust language capabilities, as evidenced by its performance on widely adopted language benchmark tasks. We highlight the key design choices in CycleQD, detailing how these contribute to its effectiveness. Furthermore, our method is general and can be applied to image segmentation models, highlighting its applicability across different domains. 
</p> </div> </dd> <dt> <a name='item357'>[357]</a> <a href ="/abs/2410.15277" title="Abstract" id="2410.15277"> arXiv:2410.15277 </a> (replaced) [<a href="/pdf/2410.15277" title="Download PDF" id="pdf-2410.15277" aria-labelledby="pdf-2410.15277">pdf</a>, <a href="/format/2410.15277" title="Other formats" id="oth-2410.15277" aria-labelledby="oth-2410.15277">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BRIEF: Bridging Retrieval and Inference for Multi-hop Reasoning via Compression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuankai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jia-Chen Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Di Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+K">Kai-Wei Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+N">Nanyun Peng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 Findings. Project page: <a href="https://jasonforjoy.github.io/BRIEF/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented generation (RAG) can supplement large language models (LLMs) by integrating external knowledge. However, as the number of retrieved documents increases, the input length to LLMs grows linearly, causing a dramatic increase in latency and a degradation in long-context understanding. This is particularly serious for multi-hop questions that require a chain of reasoning across documents. 
To accelerate inference, reduce costs, and minimize distractions, this paper presents BRIEF (Bridging Retrieval and Inference through Evidence Fusion), a lightweight approach that performs query-aware multi-hop reasoning by compressing retrieved documents into highly dense textual summaries to integrate into in-context RAG. To enable learning compression for multi-hop reasoning, we curate synthetic data by extracting atomic propositions that encapsulate distinct factoids from the source documents to compose synthetic summaries. Based on our synthetic data built entirely by open-source models, BRIEF generates more concise summaries and enables a range of LLMs to achieve exceptional open-domain question answering (QA) performance. For example, on HotpotQA, BRIEF improves the compression rate by 2 times compared to the state-of-the-art baseline, while outperforming it by 3.00% EM and 4.16% F1 with Flan-UL2 as the reader model. It also generates more concise summaries than proprietary GPT-3.5, while demonstrating nearly identical QA performance. </p> </div> </dd> <dt> <a name='item358'>[358]</a> <a href ="/abs/2410.15539" title="Abstract" id="2410.15539"> arXiv:2410.15539 </a> (replaced) [<a href="/pdf/2410.15539" title="Download PDF" id="pdf-2410.15539" aria-labelledby="pdf-2410.15539">pdf</a>, <a href="https://arxiv.org/html/2410.15539v2" title="View HTML" id="html-2410.15539" aria-labelledby="html-2410.15539" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15539" title="Other formats" id="oth-2410.15539" aria-labelledby="oth-2410.15539">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Grammatical Error Correction for Low-Resource Languages: The Case of Zarma </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Keita,+M+K">Mamadou K. 
Keita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Homan,+C">Christopher Homan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zampieri,+M">Marcos Zampieri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bremang,+A">Adwoa Bremang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alfari,+H+A">Habibatou Abdoulaye Alfari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ibrahim,+E+A">Elysabhete Amadou Ibrahim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Owusu,+D">Dennis Owusu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Grammatical error correction (GEC) aims to improve quality and readability of texts through accurate correction of linguistic mistakes. Previous work has focused on high-resource languages, while low-resource languages lack robust tools. However, low-resource languages often face problems such as: non-standard orthography, limited annotated corpora, and diverse dialects, which slows down the development of GEC tools. We present a study on GEC for Zarma, spoken by over five million in West Africa. We compare three approaches: rule-based methods, machine translation (MT) models, and large language models (LLMs). We evaluated them using a dataset of more than 250,000 examples, including synthetic and human-annotated data. Our results showed that the MT-based approach using M2M100 outperforms others, with a detection rate of 95.82% and a suggestion accuracy of 78.90% in automatic evaluations (AE) and an average score of 3.0 out of 5.0 in manual evaluation (ME) from native speakers for grammar and logical corrections. The rule-based method was effective for spelling errors but failed on complex context-level errors. LLMs -- MT5-small -- showed moderate performance. 
Our work supports use of MT models to enhance GEC in low-resource settings, and we validated these results with Bambara, another West African language. </p> </div> </dd> <dt> <a name='item359'>[359]</a> <a href ="/abs/2410.16491" title="Abstract" id="2410.16491"> arXiv:2410.16491 </a> (replaced) [<a href="/pdf/2410.16491" title="Download PDF" id="pdf-2410.16491" aria-labelledby="pdf-2410.16491">pdf</a>, <a href="https://arxiv.org/html/2410.16491v2" title="View HTML" id="html-2410.16491" aria-labelledby="html-2410.16491" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16491" title="Other formats" id="oth-2410.16491" aria-labelledby="oth-2410.16491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BIG5-CHAT: Shaping LLM Personalities Through Training on Human-Grounded Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenkai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiarui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Andy Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xuhui Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diab,+M">Mona Diab</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sap,+M">Maarten Sap</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this work, we tackle the challenge of embedding realistic human personality traits into LLMs. Previous approaches have primarily focused on prompt-based methods that describe the behavior associated with the desired personality traits, suffering from realism and validity issues. 
To address these limitations, we introduce BIG5-CHAT, a large-scale dataset containing 100,000 dialogues designed to ground models in how humans express their personality in language. Leveraging this dataset, we explore Supervised Fine-Tuning and Direct Preference Optimization as training-based methods to align LLMs more naturally with human personality patterns. Our methods outperform prompting on personality assessments such as BFI and IPIP-NEO, with trait correlations more closely matching human data. Furthermore, our experiments reveal that models trained to exhibit higher conscientiousness, higher agreeableness, lower extraversion, and lower neuroticism display better performance on reasoning tasks, aligning with psychological findings on how these traits impact human cognitive performance. To our knowledge, this work is the first comprehensive study to demonstrate how training-based methods can shape LLM personalities through learning from real human behaviors. </p> </div> </dd> <dt> <a name='item360'>[360]</a> <a href ="/abs/2410.20779" title="Abstract" id="2410.20779"> arXiv:2410.20779 </a> (replaced) [<a href="/pdf/2410.20779" title="Download PDF" id="pdf-2410.20779" aria-labelledby="pdf-2410.20779">pdf</a>, <a href="https://arxiv.org/html/2410.20779v2" title="View HTML" id="html-2410.20779" aria-labelledby="html-2410.20779" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.20779" title="Other formats" id="oth-2410.20779" aria-labelledby="oth-2410.20779">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decoding Reading Goals from Eye Movements </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hadar,+C+A">Cfir Avraham Hadar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Readers can have different goals with respect to the text that they are reading. Can these goals be decoded from their eye movements over the text? In this work, we examine for the first time whether it is possible to distinguish between two types of common reading goals: information seeking and ordinary reading for comprehension. Using large-scale eye tracking data, we address this task with a wide range of models that cover different architectural and data representation strategies, and further introduce a new model ensemble. We find that transformer-based models with scanpath representations coupled with language modeling solve it most successfully, and that accurate predictions can be made in real time, long before the participant finished reading the text. We further introduce a new method for model performance analysis based on mixed effect modeling. Combining this method with rich textual annotations reveals key properties of textual items and participants that contribute to the difficulty of the task, and improves our understanding of the variability in eye movement patterns across the two reading regimes. 
</p> </div> </dd> <dt> <a name='item361'>[361]</a> <a href ="/abs/2410.21013" title="Abstract" id="2410.21013"> arXiv:2410.21013 </a> (replaced) [<a href="/pdf/2410.21013" title="Download PDF" id="pdf-2410.21013" aria-labelledby="pdf-2410.21013">pdf</a>, <a href="https://arxiv.org/html/2410.21013v3" title="View HTML" id="html-2410.21013" aria-labelledby="html-2410.21013" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21013" title="Other formats" id="oth-2410.21013" aria-labelledby="oth-2410.21013">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Frequency matters: Modeling irregular morphological patterns in Spanish with Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramarao,+A+K">Akhilesh Kakolu Ramarao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+K">Kevin Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baer-Henney,+D">Dinah Baer-Henney</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Typos and grammatical corrections </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Over the past decade, various studies have addressed how speakers solve the so-called `The Paradigm Cell Filling Problem' (PCFP) \citep{ackerman2009parts} across different languages. The PCFP addresses a fundamental question in morphological processing: how do speakers accurately generate inflected forms of words when presented with incomplete paradigms? This problem is particularly salient when modeling complex inflectional systems. We focus on Spanish verbal paradigms, where certain verbs follow an irregular L-shaped pattern, where the first-person singular present indicative stem matches the stem used throughout the present subjunctive mood. 
We formulate the problem as a morphological reinflection task. Specifically, we investigate the role of input frequency in the acquisition of regular versus irregular L-shaped patterns in transformer models. By systematically manipulating the input distributions and analyzing model behavior, we reveal four key findings: 1) Models perform better on L-shaped verbs compared to regular verbs, especially in uneven frequency conditions; 2) Robust primacy effects are observed, but no consistent recency effects; 3) Memorization becomes more prominent as the proportion of L-shaped verbs increases; 4) There is a tendency to regularize L-shaped verbs when their consonant alternation pairs are rare or absent in the training data. </p> </div> </dd> <dt> <a name='item362'>[362]</a> <a href ="/abs/2410.21662" title="Abstract" id="2410.21662"> arXiv:2410.21662 </a> (replaced) [<a href="/pdf/2410.21662" title="Download PDF" id="pdf-2410.21662" aria-labelledby="pdf-2410.21662">pdf</a>, <a href="https://arxiv.org/html/2410.21662v2" title="View HTML" id="html-2410.21662" aria-labelledby="html-2410.21662" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21662" title="Other formats" id="oth-2410.21662" aria-labelledby="oth-2410.21662">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> $f$-PO: Generalizing Preference Optimization with $f$-divergence Minimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiaqi Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Mingjian Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuxuan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ermon,+S">Stefano Ermon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+M">Minkai Xu</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> 
AISTATS 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Preference optimization has made significant progress recently, with numerous methods developed to align language models with human preferences. This paper introduces $f$-divergence Preference Optimization ($f$-PO), a novel framework that generalizes and extends existing approaches. $f$-PO minimizes $f$-divergences between the optimized policy and the optimal policy, encompassing a broad family of alignment methods using various divergences. Our approach unifies previous algorithms like DPO and EXO, while offering new variants through different choices of $f$-divergences. We provide theoretical analysis of $f$-PO's properties and conduct extensive experiments on state-of-the-art language models using benchmark datasets. Results demonstrate $f$-PO's effectiveness across various tasks, achieving superior performance compared to existing methods on popular benchmarks such as AlpacaEval 2, Arena-Hard, MT-Bench, and Open LLM Leaderboard v2. Additionally, we present ablation studies exploring the impact of different $f$-divergences, offering insights into the trade-offs between regularization and performance in offline preference optimization. Our work contributes both practical algorithms and theoretical understanding to the field of language model alignment. Code is available at <a href="https://github.com/MinkaiXu/fPO" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item363'>[363]</a> <a href ="/abs/2410.21728" title="Abstract" id="2410.21728"> arXiv:2410.21728 </a> (replaced) [<a href="/pdf/2410.21728" title="Download PDF" id="pdf-2410.21728" aria-labelledby="pdf-2410.21728">pdf</a>, <a href="https://arxiv.org/html/2410.21728v2" title="View HTML" id="html-2410.21728" aria-labelledby="html-2410.21728" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21728" title="Other formats" id="oth-2410.21728" aria-labelledby="oth-2410.21728">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Let's Be Self-generated via Step by Step: A Curriculum Learning Approach to Automated Reasoning with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+Z">Zichen Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+Z">Zhenmin Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+L">Lingfeng Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+M">Meng Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Di Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shu,+J">Jinlong Shu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While Chain of Thought (CoT) prompting approaches have significantly consolidated the reasoning capabilities of large language models (LLMs), they still face limitations that require extensive human effort or have performance needs to be improved. 
Existing endeavors have focused on bridging these gaps; however, these approaches either hinge on external data and cannot completely eliminate manual effort, or they fall short in effectively directing LLMs to generate high-quality exemplary prompts. To address the said pitfalls, we propose a novel prompt approach for automatic reasoning named \textbf{LBS3}, inspired by curriculum learning which better reflects human learning habits. Specifically, LBS3 initially steers LLMs to recall easy-to-hard proxy queries that are pertinent to the target query. Following this, it invokes a progressive strategy that utilizes exemplary prompts stemming from easy-proxy queries to direct LLMs in solving hard-proxy queries, ensuring the high quality of the proxy solutions. Finally, our extensive experiments in various reasoning-intensive tasks with varying open- and closed-source LLMs show that LBS3 achieves strongly competitive performance compared to the SOTA baselines. </p> </div> </dd> <dt> <a name='item364'>[364]</a> <a href ="/abs/2410.22108" title="Abstract" id="2410.22108"> arXiv:2410.22108 </a> (replaced) [<a href="/pdf/2410.22108" title="Download PDF" id="pdf-2410.22108" aria-labelledby="pdf-2410.22108">pdf</a>, <a href="/format/2410.22108" title="Other formats" id="oth-2410.22108" aria-labelledby="oth-2410.22108">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Protecting Privacy in Multimodal Large Language Models with MLLMU-Bench </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+G">Guangyao Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+M">Mengzhao Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Z">Zhaoxuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Q">Qingkai Zeng</a>, <a
href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yongle Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Meng Jiang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL Main 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Generative models such as Large Language Models (LLM) and Multimodal Large Language models (MLLMs) trained on massive web corpora can memorize and disclose individuals' confidential and private data, raising legal and ethical concerns. While many previous works have addressed this issue in LLM via machine unlearning, it remains largely unexplored for MLLMs. To tackle this challenge, we introduce Multimodal Large Language Model Unlearning Benchmark (MLLMU-Bench), a novel benchmark aimed at advancing the understanding of multimodal machine unlearning. MLLMU-Bench consists of 500 fictitious profiles and 153 profiles for public celebrities, each profile featuring over 14 customized question-answer pairs, evaluated from both multimodal (image+text) and unimodal (text) perspectives. The benchmark is divided into four sets to assess unlearning algorithms in terms of efficacy, generalizability, and model utility. Finally, we provide baseline results using existing generative model unlearning algorithms. Surprisingly, our experiments show that unimodal unlearning algorithms excel in generation and cloze tasks, while multimodal unlearning approaches perform better in classification tasks with multimodal inputs. 
</p> </div> </dd> <dt> <a name='item365'>[365]</a> <a href ="/abs/2410.23166" title="Abstract" id="2410.23166"> arXiv:2410.23166 </a> (replaced) [<a href="/pdf/2410.23166" title="Download PDF" id="pdf-2410.23166" aria-labelledby="pdf-2410.23166">pdf</a>, <a href="https://arxiv.org/html/2410.23166v2" title="View HTML" id="html-2410.23166" aria-labelledby="html-2410.23166" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.23166" title="Other formats" id="oth-2410.23166" aria-labelledby="oth-2410.23166">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SciPIP: An LLM-based Scientific Paper Idea Proposer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxiao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+L">Lihui Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Liye Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yunxiang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+Y">Yi Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+C">Chen Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+L">Liang Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+B">Binbin Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+X">Xiaofei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jieping Ye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 5 figures, 12 tables. 
The code has been available: <a href="https://github.com/cheerss/SciPIP" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> The rapid advancement of large language models (LLMs) has opened new possibilities for automating the proposal of innovative scientific ideas. This process involves two key phases: literature retrieval and idea generation. However, existing approaches often fall short due to their reliance on keyword-based search tools during the retrieval phase, which neglects crucial semantic information and frequently results in incomplete retrieval outcomes. Similarly, in the idea generation phase, current methodologies tend to depend solely on the internal knowledge of LLMs or metadata from retrieved papers, thereby overlooking significant valuable insights contained within the full texts. To address these limitations, we introduce SciPIP, an innovative framework designed to enhance the LLM-based proposal of scientific ideas through improvements in both literature retrieval and idea generation. Our approach begins with the construction of a comprehensive literature database that supports advanced retrieval based not only on keywords but also on semantics and citation relationships. This is complemented by the introduction of a multi-granularity retrieval algorithm aimed at ensuring more thorough and exhaustive retrieval results. For the idea generation phase, we propose a dual-path framework that effectively integrates both the content of retrieved papers and the extensive internal knowledge of LLMs. This integration significantly boosts the novelty, feasibility, and practical value of proposed ideas. 
Our experiments, conducted across various domains such as natural language processing and computer vision, demonstrate SciPIP's capability to generate a multitude of innovative and useful ideas. These findings underscore SciPIP's potential as a valuable tool for researchers seeking to advance their fields with groundbreaking concepts. </p> </div> </dd> <dt> <a name='item366'>[366]</a> <a href ="/abs/2410.23918" title="Abstract" id="2410.23918"> arXiv:2410.23918 </a> (replaced) [<a href="/pdf/2410.23918" title="Download PDF" id="pdf-2410.23918" aria-labelledby="pdf-2410.23918">pdf</a>, <a href="https://arxiv.org/html/2410.23918v3" title="View HTML" id="html-2410.23918" aria-labelledby="html-2410.23918" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.23918" title="Other formats" id="oth-2410.23918" aria-labelledby="oth-2410.23918">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BitStack: Any-Size Compression of Large Language Models in Variable Memory Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinghao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Dong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yunhua Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Large 
language models (LLMs) have revolutionized numerous applications, yet their deployment remains challenged by memory constraints on local devices. While scaling laws have enhanced LLM capabilities, the primary bottleneck has shifted from \textit{capability} to \textit{availability}, emphasizing the need for efficient memory management. Traditional compression methods, such as quantization, often require predefined compression ratios and separate compression processes for each setting, complicating deployment in variable memory environments. In this paper, we introduce \textbf{BitStack}, a novel, training-free weight compression approach that enables megabyte-level trade-offs between memory usage and model performance. By leveraging weight decomposition, BitStack can dynamically adjust the model size with minimal transmission between running memory and storage devices. Our approach iteratively decomposes weight matrices while considering the significance of each parameter, resulting in an approximately 1-bit per parameter residual block in each decomposition iteration. These blocks are sorted and stacked in storage as basic transmission units, with different quantities loaded based on current memory availability. Extensive experiments across a wide range of tasks demonstrate that, despite offering fine-grained size control, BitStack consistently matches or surpasses strong quantization baselines, particularly at extreme compression ratios. To the best of our knowledge, this is the first decomposition-based method that effectively bridges the gap to practical compression techniques like quantization. Code is available at <a href="https://github.com/xinghaow99/BitStack" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item367'>[367]</a> <a href ="/abs/2411.02305" title="Abstract" id="2411.02305"> arXiv:2411.02305 </a> (replaced) [<a href="/pdf/2411.02305" title="Download PDF" id="pdf-2411.02305" aria-labelledby="pdf-2411.02305">pdf</a>, <a href="https://arxiv.org/html/2411.02305v2" title="View HTML" id="html-2411.02305" aria-labelledby="html-2411.02305" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.02305" title="Other formats" id="oth-2411.02305" aria-labelledby="oth-2411.02305">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CRMArena: Understanding the Capacity of LLM Agents to Perform Professional CRM Tasks in Realistic Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kung-Hsiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prabhakar,+A">Akshara Prabhakar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhawan,+S">Sidharth Dhawan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Y">Yixin Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Huan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Savarese,+S">Silvio Savarese</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+C">Caiming Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Customer Relationship Management (CRM) systems are vital for modern enterprises, providing a 
foundation for managing customer interactions and data. Integrating AI agents into CRM systems can automate routine processes and enhance personalized service. However, deploying and evaluating these agents is challenging due to the lack of realistic benchmarks that reflect the complexity of real-world CRM tasks. To address this issue, we introduce CRMArena, a novel benchmark designed to evaluate AI agents on realistic tasks grounded in professional work environments. Following guidance from CRM experts and industry best practices, we designed CRMArena with nine customer service tasks distributed across three personas: service agent, analyst, and manager. The benchmark includes 16 commonly used industrial objects (e.g., account, order, knowledge article, case) with high interconnectivity, along with latent variables (e.g., complaint habits, policy violations) to simulate realistic data distributions. Experimental results reveal that state-of-the-art LLM agents succeed in less than 40% of the tasks with ReAct prompting, and less than 55% even with function-calling abilities. Our findings highlight the need for enhanced agent capabilities in function-calling and rule-following to be deployed in real-world work environments. CRMArena is an open challenge to the community: systems that can reliably complete tasks showcase direct business value in a popular work environment. 
</p> </div> </dd> <dt> <a name='item368'>[368]</a> <a href ="/abs/2411.03888" title="Abstract" id="2411.03888"> arXiv:2411.03888 </a> (replaced) [<a href="/pdf/2411.03888" title="Download PDF" id="pdf-2411.03888" aria-labelledby="pdf-2411.03888">pdf</a>, <a href="https://arxiv.org/html/2411.03888v2" title="View HTML" id="html-2411.03888" aria-labelledby="html-2411.03888" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.03888" title="Other formats" id="oth-2411.03888" aria-labelledby="oth-2411.03888">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi3Hate: Multimodal, Multilingual, and Multicultural Hate Speech Detection with Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bui,+M+D">Minh Duc Bui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=von+der+Wense,+K">Katharina von der Wense</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lauscher,+A">Anne Lauscher</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Main (Camera-Ready Version) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Warning: this paper contains content that may be offensive or upsetting <br>Hate speech moderation on global platforms poses unique challenges due to the multimodal and multilingual nature of content, along with the varying cultural perceptions. How well do current vision-language models (VLMs) navigate these nuances? To investigate this, we create the first multimodal and multilingual parallel hate speech dataset, annotated by a multicultural set of annotators, called Multi3Hate. It contains 300 parallel meme samples across 5 languages: English, German, Spanish, Hindi, and Mandarin. 
We demonstrate that cultural background significantly affects multimodal hate speech annotation in our dataset. The average pairwise agreement among countries is just 74%, significantly lower than that of randomly selected annotator groups. Our qualitative analysis indicates that the lowest pairwise label agreement-only 67% between the USA and India-can be attributed to cultural factors. We then conduct experiments with 5 large VLMs in a zero-shot setting, finding that these models align more closely with annotations from the US than with those from other cultures, even when the memes and prompts are presented in the dominant language of the other culture. Code and dataset are available at <a href="https://github.com/MinhDucBui/Multi3Hate" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item369'>[369]</a> <a href ="/abs/2411.06438" title="Abstract" id="2411.06438"> arXiv:2411.06438 </a> (replaced) [<a href="/pdf/2411.06438" title="Download PDF" id="pdf-2411.06438" aria-labelledby="pdf-2411.06438">pdf</a>, <a href="https://arxiv.org/html/2411.06438v3" title="View HTML" id="html-2411.06438" aria-labelledby="html-2411.06438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.06438" title="Other formats" id="oth-2411.06438" aria-labelledby="oth-2411.06438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Conditional [MASK] Discrete Diffusion Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koh,+H">Hyukhun Koh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jhang,+M">Minha Jhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dohyung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sangmook Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jung,+K">Kyomin 
Jung</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although auto-regressive models excel in natural language processing, they often struggle to generate diverse text and provide limited controllability. Non-auto-regressive methods could be an alternative but often produce degenerate outputs and exhibit shortcomings in conditional generation. To address these challenges, we propose Diffusion-EAGS, a novel framework that integrates conditional masked language models into diffusion language models through the theoretical lens of a conditional Markov Random Field. In doing so, we propose entropy-adaptive Gibbs sampling and entropy-based noise scheduling to counterbalance each model's shortcomings. Experimental results show that Diffusion-EAGS outperforms baselines and achieves the best quality-diversity tradeoff, demonstrating its effectiveness in non-autoregressive text generation. 
</p> </div> </dd> <dt> <a name='item370'>[370]</a> <a href ="/abs/2411.06729" title="Abstract" id="2411.06729"> arXiv:2411.06729 </a> (replaced) [<a href="/pdf/2411.06729" title="Download PDF" id="pdf-2411.06729" aria-labelledby="pdf-2411.06729">pdf</a>, <a href="https://arxiv.org/html/2411.06729v3" title="View HTML" id="html-2411.06729" aria-labelledby="html-2411.06729" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.06729" title="Other formats" id="oth-2411.06729" aria-labelledby="oth-2411.06729">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reverse Prompt Engineering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hanqing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klabjan,+D">Diego Klabjan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We explore a new language model inversion problem under strict black-box, zero-shot, and limited data conditions. We propose a novel training-free framework that reconstructs prompts using only a limited number of text outputs from a language model. Existing methods rely on the availability of a large number of outputs for both training and inference, an assumption that is unrealistic in the real world, and they can sometimes produce garbled text. In contrast, our approach, which relies on limited resources, consistently yields coherent and semantically meaningful prompts. Our framework leverages a large language model together with an optimization process inspired by the genetic algorithm to effectively recover prompts. 
Experimental results on several datasets derived from public sources indicate that our approach achieves high-quality prompt recovery and generates prompts more semantically and functionally aligned with the originals than current state-of-the-art methods. Additionally, use-case studies introduced demonstrate the method's strong potential for generating high-quality text data on perturbed prompts. </p> </div> </dd> <dt> <a name='item371'>[371]</a> <a href ="/abs/2411.07381" title="Abstract" id="2411.07381"> arXiv:2411.07381 </a> (replaced) [<a href="/pdf/2411.07381" title="Download PDF" id="pdf-2411.07381" aria-labelledby="pdf-2411.07381">pdf</a>, <a href="https://arxiv.org/html/2411.07381v4" title="View HTML" id="html-2411.07381" aria-labelledby="html-2411.07381" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07381" title="Other formats" id="oth-2411.07381" aria-labelledby="oth-2411.07381">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MaLei at the PLABA Track of TREC 2024: RoBERTa for Term Replacement -- LLaMA3.1 and GPT-4o for Complete Abstract Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zhidong Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zihao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Romero,+P">Pablo Romero</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+L">Lifeng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nenadic,+G">Goran Nenadic</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ongoing work - system report for PLABA2024 with TREC-2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This report is the system description of the 
MaLei team (Manchester and Leiden) for the shared task Plain Language Adaptation of Biomedical Abstracts (PLABA) 2024 (we had an earlier name BeeManc following last year), affiliated with TREC2024 (33rd Text REtrieval Conference <a href="https://ir.nist.gov/evalbase/conf/trec-2024" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). This report contains two sections corresponding to the two sub-tasks in PLABA-2024. In task one (term replacement), we applied fine-tuned RoBERTa-Base models to identify and classify the difficult terms, jargon, and acronyms in the biomedical abstracts and reported the F1 score (Task 1A and 1B). In task two (complete abstract adaptation), we leveraged LLaMA3.1-70B-Instruct and GPT-4o with the one-shot prompts to complete the abstract adaptation and reported the scores in BLEU, SARI, BERTScore, LENS, and SALSA. From the official Evaluation from PLABA-2024 on Task 1A and 1B, our much smaller fine-tuned RoBERTa-Base model ranked 3rd and 2nd respectively on the two sub-tasks, and the 1st on averaged F1 scores across the two tasks from 9 evaluated systems. Our LLaMA-3.1-70B-instructed model achieved the highest Completeness score for Task 2. 
We share our source codes, fine-tuned models, and related resources at <a href="https://github.com/HECTA-UoM/PLABA2024" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item372'>[372]</a> <a href ="/abs/2411.12405" title="Abstract" id="2411.12405"> arXiv:2411.12405 </a> (replaced) [<a href="/pdf/2411.12405" title="Download PDF" id="pdf-2411.12405" aria-labelledby="pdf-2411.12405">pdf</a>, <a href="https://arxiv.org/html/2411.12405v2" title="View HTML" id="html-2411.12405" aria-labelledby="html-2411.12405" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.12405" title="Other formats" id="oth-2411.12405" aria-labelledby="oth-2411.12405">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating the Prompt Steerability of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Miehling,+E">Erik Miehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Desmond,+M">Michael Desmond</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramamurthy,+K+N">Karthikeyan Natesan Ramamurthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Daly,+E+M">Elizabeth M. 
Daly</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dognin,+P">Pierre Dognin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rios,+J">Jesus Rios</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouneffouf,+D">Djallel Bouneffouf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Miao Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Short version appeared at the Pluralistic Alignment workshop at NeurIPS 2024; extended version appeared at NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Building pluralistic AI requires designing models that are able to be shaped to represent a wide range of value systems and cultures. Achieving this requires first being able to evaluate the degree to which a given model is capable of reflecting various personas. To this end, we propose a benchmark for evaluating the steerability of model personas as a function of prompting. Our design is based on a formal definition of prompt steerability, which analyzes the degree to which a model's joint behavioral distribution can be shifted from its baseline. By defining steerability indices and inspecting how these indices change as a function of steering effort, we can estimate the steerability of a model across various persona dimensions and directions. Our benchmark reveals that the steerability of many current models is limited -- due to both a skew in their baseline behavior and an asymmetry in their steerability across many persona dimensions. We release an implementation of our benchmark at <a href="https://github.com/IBM/prompt-steering" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item373'>[373]</a> <a href ="/abs/2411.14483" title="Abstract" id="2411.14483"> arXiv:2411.14483 </a> (replaced) [<a href="/pdf/2411.14483" title="Download PDF" id="pdf-2411.14483" aria-labelledby="pdf-2411.14483">pdf</a>, <a href="https://arxiv.org/html/2411.14483v2" title="View HTML" id="html-2411.14483" aria-labelledby="html-2411.14483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14483" title="Other formats" id="oth-2411.14483" aria-labelledby="oth-2411.14483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ranking Unraveled: Recipes for LLM Rankings in Head-to-Head AI Combat </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Daynauth,+R">Roland Daynauth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Clarke,+C">Christopher Clarke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Flautner,+K">Krisztian Flautner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+L">Lingjia Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mars,+J">Jason Mars</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Deciding which large language model (LLM) to use is a complex challenge. Pairwise ranking has emerged as a new method for evaluating human preferences for LLMs. This approach entails humans evaluating pairs of model outputs based on a predefined criterion. By collecting these comparisons, a ranking can be constructed using methods such as Elo. However, applying these algorithms as constructed in the context of LLM evaluation introduces several challenges. In this paper, we explore the effectiveness of ranking systems for head-to-head comparisons of LLMs. 
We formally define a set of fundamental principles for effective ranking and conduct a series of extensive evaluations on the robustness of several ranking algorithms in the context of LLMs. Our analysis uncovers key insights into the factors that affect ranking accuracy and efficiency, offering guidelines for selecting the most appropriate methods based on specific evaluation contexts and resource constraints. </p> </div> </dd> <dt> <a name='item374'>[374]</a> <a href ="/abs/2411.16365" title="Abstract" id="2411.16365"> arXiv:2411.16365 </a> (replaced) [<a href="/pdf/2411.16365" title="Download PDF" id="pdf-2411.16365" aria-labelledby="pdf-2411.16365">pdf</a>, <a href="https://arxiv.org/html/2411.16365v2" title="View HTML" id="html-2411.16365" aria-labelledby="html-2411.16365" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.16365" title="Other formats" id="oth-2411.16365" aria-labelledby="oth-2411.16365">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-modal Retrieval Augmented Multi-modal Generation: Datasets, Evaluation Metrics and Strong Baseliness </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zi-Ao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+T">Tian Lan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+R">Rong-Cheng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yong Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yu-Shi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heyan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+X">Xian-Ling Mao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and 
Language (cs.CL)</span> </div> <p class='mathjax'> We present a systematic investigation of Multi-modal Retrieval Augmented Multi-modal Generation (M$^2$RAG), a novel task that enables foundation models to process multi-modal web content and generate multi-modal responses, which exhibits better information density and readability. Despite its potential impact, M$^2$RAG remains understudied, lacking comprehensive analysis and high-quality data resources. To address this gap, we establish a comprehensive benchmark through a rigorous data curation pipeline, and employ text-modal metrics and multi-modal metrics based on foundation models for evaluation. We further propose several strategies for foundation models to process M$^2$RAG effectively and construct a training set by filtering high-quality samples using designed metrics. Our extensive experiments demonstrate the reliability of our proposed metrics, a landscape of model performance within our designed strategies, and show that our fine-tuned 7B-8B models outperform the state-of-the-art GPT-4o model. Additionally, we perform fine-grained analyses across diverse domains and validate the effectiveness of our designs in data curation pipeline. All resources, including codes, datasets, and model weights, will be publicly released. 
</p> </div> </dd> <dt> <a name='item375'>[375]</a> <a href ="/abs/2412.09879" title="Abstract" id="2412.09879"> arXiv:2412.09879 </a> (replaced) [<a href="/pdf/2412.09879" title="Download PDF" id="pdf-2412.09879" aria-labelledby="pdf-2412.09879">pdf</a>, <a href="https://arxiv.org/html/2412.09879v2" title="View HTML" id="html-2412.09879" aria-labelledby="html-2412.09879" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.09879" title="Other formats" id="oth-2412.09879" aria-labelledby="oth-2412.09879">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Limit of Language Models as Planning Formalizers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Cassie Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Li Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models have been shown to fail to create executable and verifiable plans in grounded environments. An emerging line of work shows success in using LLM as a formalizer to generate a formal representation (e.g., PDDL) of the planning domain, which can be deterministically solved to find a plan. We systematically evaluate this methodology while bridging some major gaps. While previous work only generates a partial PDDL representation given templated and thus unrealistic environment descriptions, we generate the complete representation given descriptions of various naturalness levels. Among an array of observations critical to improve LLMs' formal planning ability, we note that large enough models can effectively formalize descriptions as PDDL, outperforming those directly generating plans, while being robust to lexical perturbation. 
As the descriptions become more natural-sounding, we observe a decrease in performance and provide detailed error analysis. </p> </div> </dd> <dt> <a name='item376'>[376]</a> <a href ="/abs/2412.11041" title="Abstract" id="2412.11041"> arXiv:2412.11041 </a> (replaced) [<a href="/pdf/2412.11041" title="Download PDF" id="pdf-2412.11041" aria-labelledby="pdf-2412.11041">pdf</a>, <a href="https://arxiv.org/html/2412.11041v2" title="View HTML" id="html-2412.11041" aria-labelledby="html-2412.11041" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11041" title="Other formats" id="oth-2412.11041" aria-labelledby="oth-2412.11041">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Separate the Wheat from the Chaff: A Post-Hoc Approach to Safety Re-Alignment for Fine-Tuned Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Di Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xin Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yanyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 14 figures, </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Although large language models (LLMs) achieve effective safety alignment at the time of release, they still face various safety challenges. A key issue is that fine-tuning often compromises the safety alignment of LLMs. To address this issue, we propose a method named IRR (Identify, Remove, and Recalibrate for Safety Realignment) that performs safety realignment for LLMs. 
The core of IRR is to identify and remove unsafe delta parameters from the fine-tuned models, while recalibrating the retained ones. We evaluate the effectiveness of IRR across various datasets, including both full fine-tuning and LoRA methods. Our results demonstrate that IRR significantly enhances the safety performance of fine-tuned models on safety benchmarks, such as harmful queries and jailbreak attacks, while maintaining their performance on downstream tasks. The source code is available at: <a href="https://anonymous.4open.science/r/IRR-BD4F" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item377'>[377]</a> <a href ="/abs/2412.11167" title="Abstract" id="2412.11167"> arXiv:2412.11167 </a> (replaced) [<a href="/pdf/2412.11167" title="Download PDF" id="pdf-2412.11167" aria-labelledby="pdf-2412.11167">pdf</a>, <a href="https://arxiv.org/html/2412.11167v2" title="View HTML" id="html-2412.11167" aria-labelledby="html-2412.11167" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11167" title="Other formats" id="oth-2412.11167" aria-labelledby="oth-2412.11167">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cultural Palette: Pluralising Culture Alignment via Multi-agent Palette </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiahao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Di,+Z">Zixiang Di</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shangzixin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) face challenges in aligning with diverse cultural values 
despite their remarkable performance in generation, which stems from inherent monocultural biases and difficulties in capturing nuanced cultural semantics. Existing methods struggle to adapt to unknown culture after fine-tuning. Inspired by cultural geography across five continents, we propose Cultural Palette, a multi-agent framework that redefines cultural alignment as an adaptive "color-blending" process for country-specific adaptation. Our approach harnesses cultural geography across five continents (Africa, America, Asia, Europe, Oceania) through three key steps: First, we synthesize the Pentachromatic Cultural Palette Dataset using GPT-4o, refining continental-level dialogues with Hofstede cultural dimensions to establish foundational cultural representations. Second, five continent-level alignment agents form specialized cultural communities that generate region-specific draft responses. Third, a Meta Agent employs Cultural MoErges to dynamically blend these cultural "colors" through attention-gated parameter merging, akin to mixing pigments on a palette, resolving conflicts while preserving cultural nuances to produce the final culturally-aligned response. Extensive experiments across various countries demonstrate that Cultural Palette surpasses existing baselines in cultural alignment. </p> </div> </dd> <dt> <a name='item378'>[378]</a> <a href ="/abs/2412.12072" title="Abstract" id="2412.12072"> arXiv:2412.12072 </a> (replaced) [<a href="/pdf/2412.12072" title="Download PDF" id="pdf-2412.12072" aria-labelledby="pdf-2412.12072">pdf</a>, <a href="https://arxiv.org/html/2412.12072v2" title="View HTML" id="html-2412.12072" aria-labelledby="html-2412.12072" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12072" title="Other formats" id="oth-2412.12072" aria-labelledby="oth-2412.12072">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Making FETCH! 
Happen: Finding Emergent Dog Whistles Through Common Habitats </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sasse,+K">Kuleen Sasse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aguirre,+C">Carlos Aguirre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cachola,+I">Isabel Cachola</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levy,+S">Sharon Levy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dredze,+M">Mark Dredze</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> WARNING: This paper contains content that may be upsetting or offensive to some readers. Dog whistles are coded expressions with dual meanings: one intended for the general public (outgroup) and another that conveys a specific message to an intended audience (ingroup). Often, these expressions are used to convey controversial political opinions while maintaining plausible deniability and slip by content moderation filters. Identification of dog whistles relies on curated lexicons, which have trouble keeping up to date. We introduce FETCH!, a task for finding novel dog whistles in massive social media corpora. We find that state-of-the-art systems fail to achieve meaningful results across three distinct social media case studies. We present EarShot, a strong baseline system that combines the strengths of vector databases and Large Language Models (LLMs) to efficiently and effectively identify new dog whistles. 
</p> </div> </dd> <dt> <a name='item379'>[379]</a> <a href ="/abs/2412.12145" title="Abstract" id="2412.12145"> arXiv:2412.12145 </a> (replaced) [<a href="/pdf/2412.12145" title="Download PDF" id="pdf-2412.12145" aria-labelledby="pdf-2412.12145">pdf</a>, <a href="/format/2412.12145" title="Other formats" id="oth-2412.12145" aria-labelledby="oth-2412.12145">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Na'vi or Knave: Jailbreaking Language Models via Metaphorical Avatars </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yu Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Sheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+J">Junqi Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Min Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qi Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We still need to polish our paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Metaphor serves as an implicit approach to convey information, while enabling the generalized comprehension of complex subjects. However, metaphor can potentially be exploited to bypass the safety alignment mechanisms of Large Language Models (LLMs), leading to the theft of harmful knowledge. In our study, we introduce a novel attack framework that exploits the imaginative capacity of LLMs to achieve jailbreaking, the J\underline{\textbf{A}}ilbreak \underline{\textbf{V}}ia \underline{\textbf{A}}dversarial Me\underline{\textbf{TA}} -pho\underline{\textbf{R}} (\textit{AVATAR}). 
Specifically, to elicit the harmful response, AVATAR extracts harmful entities from a given harmful target and maps them to innocuous adversarial entities based on LLM's imagination. Then, according to these metaphors, the harmful target is nested within human-like interaction for jailbreaking adaptively. Experimental results demonstrate that AVATAR can effectively and transferably jailbreak LLMs and achieve a state-of-the-art attack success rate across multiple advanced LLMs. Our study exposes a security risk in LLMs from their endogenous imaginative capabilities. Furthermore, the analytical study reveals the vulnerability of LLM to adversarial metaphors and the necessity of developing defense methods against jailbreaking caused by the adversarial metaphor. \textcolor{orange}{ \textbf{Warning: This paper contains potentially harmful content from LLMs.}} </p> </div> </dd> <dt> <a name='item380'>[380]</a> <a href ="/abs/2412.12499" title="Abstract" id="2412.12499"> arXiv:2412.12499 </a> (replaced) [<a href="/pdf/2412.12499" title="Download PDF" id="pdf-2412.12499" aria-labelledby="pdf-2412.12499">pdf</a>, <a href="https://arxiv.org/html/2412.12499v2" title="View HTML" id="html-2412.12499" aria-labelledby="html-2412.12499" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12499" title="Other formats" id="oth-2412.12499" aria-labelledby="oth-2412.12499">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LinguaLIFT: An Effective Two-stage Instruction Tuning Framework for Low-Resource Language Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongbin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xuefeng Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+Y">Yang Xiang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have exhibited impressive multilingual reasoning capabilities, driven by extensive multilingual pre-training corpora and instruction fine-tuning data. However, a performance gap exists between high- and low-resource language reasoning tasks due to the language imbalance in the pre-training corpus, which is exacerbated by evaluation bias in existing reasoning benchmarks lacking low-resource language coverage. To alleviate this issue, we propose LinguaLIFT, a two-stage instruction tuning framework for advancing low-resource language reasoning. LinguaLIFT employs a language alignment layer to capture multilingual alignment in a code-switched tuning way without requiring multilingual instruction or parallel data, thereby transferring the cross-lingual reasoning capabilities to low-resource languages through English-only instruction tuning data. To comprehensively evaluate the multilingual reasoning capabilities, we introduce the Multilingual Math World Problem (MMWP) benchmark, which spans 21 low-resource, 17 medium-resource, and 10 high-resource languages. Experimental results show that LinguaLIFT outperforms several competitive baselines across MMWP and four widely used benchmarks. 
</p> </div> </dd> <dt> <a name='item381'>[381]</a> <a href ="/abs/2412.12527" title="Abstract" id="2412.12527"> arXiv:2412.12527 </a> (replaced) [<a href="/pdf/2412.12527" title="Download PDF" id="pdf-2412.12527" aria-labelledby="pdf-2412.12527">pdf</a>, <a href="https://arxiv.org/html/2412.12527v2" title="View HTML" id="html-2412.12527" aria-labelledby="html-2412.12527" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12527" title="Other formats" id="oth-2412.12527" aria-labelledby="oth-2412.12527">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> When to Speak, When to Abstain: Contrastive Decoding with Abstention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H+J">Hyuhng Joon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Youna Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sang-goo Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+T">Taeuk Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under-review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate exceptional performance across diverse tasks by leveraging pre-trained (i.e., parametric) and external (i.e., contextual) knowledge. While substantial efforts have been made to enhance the utilization of both forms of knowledge, situations in which models lack relevant information remain underexplored. To investigate this challenge, we first present a controlled testbed featuring four distinct knowledge access scenarios, including the aforementioned edge case, revealing that conventional LLM usage exhibits insufficient robustness in handling all instances. 
Addressing this limitation, we propose Contrastive Decoding with Abstention (CDA), a novel training-free decoding method that allows LLMs to generate responses when relevant knowledge is available and to abstain otherwise. CDA estimates the relevance of both knowledge sources for a given input, adaptively deciding which type of information to prioritize and which to exclude. Through extensive experiments, we demonstrate that CDA can effectively perform accurate generation and abstention simultaneously, enhancing reliability and preserving user trust. </p> </div> </dd> <dt> <a name='item382'>[382]</a> <a href ="/abs/2412.12583" title="Abstract" id="2412.12583"> arXiv:2412.12583 </a> (replaced) [<a href="/pdf/2412.12583" title="Download PDF" id="pdf-2412.12583" aria-labelledby="pdf-2412.12583">pdf</a>, <a href="https://arxiv.org/html/2412.12583v2" title="View HTML" id="html-2412.12583" aria-labelledby="html-2412.12583" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12583" title="Other formats" id="oth-2412.12583" aria-labelledby="oth-2412.12583">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Process-Supervised Reward Models for Verifying Clinical Note Generation: A Scalable Approach Guided by Domain Expertise </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hanyin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chufan Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qiping Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Bolun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hussein,+G">Guleid Hussein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Korsapati,+H">Hariprasad Korsapati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Labban,+M+E">Mohamad El Labban</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Iheasirim,+K">Kingsley Iheasirim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hassan,+M">Mohamed Hassan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anil,+G">Gokhan Anil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartlett,+B">Brian Bartlett</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jimeng Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Process-supervised reward models (PRMs), which verify large language model (LLM) outputs step-by-step, have achieved significant success in mathematical and coding problems. However, their application to other domains remains largely unexplored. In this work, we train a PRM to provide step-level reward signals for clinical notes generated by LLMs from patient-doctor dialogues. Guided by real-world clinician expertise, we carefully designed step definitions for clinical notes and utilized Gemini-Pro 1.5 to automatically generate process supervision data at scale. Our proposed PRM, trained on the LLaMA-3.1 8B instruct model, outperformed both Gemini-Pro 1.5 and the vanilla outcome-supervised reward model (ORM) in two key evaluations: (1) selecting gold-reference samples from error-containing ones, achieving 98.8% accuracy (versus 70.0% for the vanilla ORM and 93.8% for Gemini-Pro 1.5), and (2) selecting physician-preferred notes, achieving 56.2% accuracy (compared to 37.5% for the vanilla ORM and 50.0% for Gemini-Pro 1.5). Additionally, we conducted ablation studies to determine optimal loss functions and data selection strategies, along with physician reader studies to explore predictors of downstream Best-of-N performance. 
Our promising results suggest the potential of PRMs to extend beyond the clinical domain, offering a scalable and effective solution for diverse generative tasks. </p> </div> </dd> <dt> <a name='item383'>[383]</a> <a href ="/abs/2412.13018" title="Abstract" id="2412.13018"> arXiv:2412.13018 </a> (replaced) [<a href="/pdf/2412.13018" title="Download PDF" id="pdf-2412.13018" aria-labelledby="pdf-2412.13018">pdf</a>, <a href="https://arxiv.org/html/2412.13018v2" title="View HTML" id="html-2412.13018" aria-labelledby="html-2412.13018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.13018" title="Other formats" id="oth-2412.13018" aria-labelledby="oth-2412.13018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OmniEval: An Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Jiejun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+Z">Zhicheng Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Ji-Rong Wen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As a typical and practical application of Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) techniques have gained extensive attention, particularly in vertical domains where LLMs may lack domain-specific knowledge. In this paper, we introduce an omnidirectional and automatic RAG benchmark, OmniEval, in the financial domain. 
Our benchmark is characterized by its multi-dimensional evaluation framework, including (1) a matrix-based RAG scenario evaluation system that categorizes queries into five task classes and 16 financial topics, leading to a structured assessment of diverse query scenarios; (2) a multi-dimensional evaluation data generation approach, which combines GPT-4-based automatic generation and human annotation, achieving an 87.47\% acceptance ratio in human evaluations on generated instances; (3) a multi-stage evaluation system that evaluates both retrieval and generation performance, resulting in a comprehensive evaluation on the RAG pipeline; and (4) robust evaluation metrics derived from rule-based and LLM-based ones, enhancing the reliability of assessments through manual annotations and supervised fine-tuning of an LLM evaluator. Our experiments demonstrate the comprehensiveness of OmniEval, which includes extensive test datasets and highlights the performance variations of RAG systems across diverse topics and tasks, revealing significant opportunities for RAG models to improve their capabilities in vertical domains. We open source the code of our benchmark in \href{<a href="https://github.com/RUC-NLPIR/OmniEval" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/RUC-NLPIR/OmniEval" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item384'>[384]</a> <a href ="/abs/2412.14838" title="Abstract" id="2412.14838"> arXiv:2412.14838 </a> (replaced) [<a href="/pdf/2412.14838" title="Download PDF" id="pdf-2412.14838" aria-labelledby="pdf-2412.14838">pdf</a>, <a href="https://arxiv.org/html/2412.14838v2" title="View HTML" id="html-2412.14838" aria-labelledby="html-2412.14838" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.14838" title="Other formats" id="oth-2412.14838" aria-labelledby="oth-2412.14838">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiabin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenbin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+M">Minyan Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jiaxian Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xuebo Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+L">Li Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+L">Liang Ding</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Efficient KV cache management in LLMs is crucial for long-context tasks like RAG and summarization. Existing KV cache compression methods enforce a fixed pattern, neglecting task-specific characteristics and reducing the retention of essential information. 
However, we observe distinct activation patterns across layers in various tasks, highlighting the need for adaptive strategies tailored to each task's unique demands. Based on this insight, we propose DynamicKV, a method that dynamically optimizes token retention by adjusting the number of tokens retained at each layer to adapt to the specific task. DynamicKV establishes global and per-layer maximum KV cache budgets, temporarily retaining the maximum budget for the current layer, and periodically updating the KV cache sizes of all preceding layers during inference. Our method retains only 1.7% of the KV cache size while achieving ~85% of the Full KV cache performance on LongBench. Notably, even under extreme compression (0.9%), DynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code will be released. </p> </div> </dd> <dt> <a name='item385'>[385]</a> <a href ="/abs/2412.15628" title="Abstract" id="2412.15628"> arXiv:2412.15628 </a> (replaced) [<a href="/pdf/2412.15628" title="Download PDF" id="pdf-2412.15628" aria-labelledby="pdf-2412.15628">pdf</a>, <a href="https://arxiv.org/html/2412.15628v2" title="View HTML" id="html-2412.15628" aria-labelledby="html-2412.15628" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.15628" title="Other formats" id="oth-2412.15628" aria-labelledby="oth-2412.15628">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Input Attributions Interpret the Inductive Reasoning Process Elicited in In-Context Learning? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+M">Mengyu Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kobayashi,+G">Goro Kobayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Suzuki,+J">Jun Suzuki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Interpreting the internal process of neural models has long been a challenge. This challenge remains relevant in the era of large language models (LLMs) and in-context learning (ICL); for example, ICL poses a new issue of interpreting which example in the few-shot examples contributed to identifying/solving the task. To this end, in this paper, we design synthetic diagnostic tasks of inductive reasoning, inspired by the generalization tests in linguistics; here, most in-context examples are ambiguous w.r.t. their underlying rule, and one critical example disambiguates the task demonstrated. The question is whether conventional input attribution (IA) methods can track such a reasoning process, i.e., identify the influential example, in ICL. Our experiments provide several practical findings; for example, a certain simple IA method works the best, and the larger the model, the generally harder it is to interpret the ICL with gradient-based IA methods. 
</p> </div> </dd> <dt> <a name='item386'>[386]</a> <a href ="/abs/2412.16516" title="Abstract" id="2412.16516"> arXiv:2412.16516 </a> (replaced) [<a href="/pdf/2412.16516" title="Download PDF" id="pdf-2412.16516" aria-labelledby="pdf-2412.16516">pdf</a>, <a href="https://arxiv.org/html/2412.16516v2" title="View HTML" id="html-2412.16516" aria-labelledby="html-2412.16516" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.16516" title="Other formats" id="oth-2412.16516" aria-labelledby="oth-2412.16516">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HammerBench: Fine-Grained Function-Calling Evaluation in Real Mobile Device Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiamu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+M">Muning Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mo,+X">Xiaoyun Mo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haoyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qiqiang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+C">Cheng Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xihuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Q">Qiuying Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Evaluating the performance of LLMs in multi-turn human-agent interactions presents 
significant challenges, particularly due to the complexity and variability of user behavior. In this paper, we introduce HammerBench, a novel benchmark framework for assessing LLMs' function-calling capabilities in real-world, multi-turn dialogues. HammerBench simulates diverse mobile assistant use cases, incorporating imperfect instructions, dynamic question-answer trajectories, intent and argument shifts, and the indirect use of external information through pronouns. To construct this benchmark, we curate a comprehensive dataset derived from popular mobile app functionalities and anonymized user logs, complemented by a cost-effective data generation pipeline leveraging open-source models. HammerBench is further augmented with fine-grained interaction snapshots and metrics, enabling detailed evaluation of function-calling performance across individual conversational turns. We demonstrate the effectiveness of HammerBench by evaluating several leading LLMs and uncovering key performance trends. Our experiments reveal that different types of parameter name errors are a significant source of failure across different interaction scenarios, highlighting critical areas for further improvement in LLM robustness for mobile assistant applications. 
</p> </div> </dd> <dt> <a name='item387'>[387]</a> <a href ="/abs/2412.18053" title="Abstract" id="2412.18053"> arXiv:2412.18053 </a> (replaced) [<a href="/pdf/2412.18053" title="Download PDF" id="pdf-2412.18053" aria-labelledby="pdf-2412.18053">pdf</a>, <a href="https://arxiv.org/html/2412.18053v2" title="View HTML" id="html-2412.18053" aria-labelledby="html-2412.18053" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18053" title="Other formats" id="oth-2412.18053" aria-labelledby="oth-2412.18053">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neuron Empirical Gradient: Discovering and Quantifying Neurons Global Linear Controllability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zehui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshinaga,+N">Naoki Yoshinaga</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 29 pages, 19 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although feed-forward neurons in pre-trained language models (PLMs) can store knowledge and their importance in influencing model outputs has been studied, existing work focuses on finding a limited set of neurons and analyzing their relative importance. However, the global quantitative role of activation values in shaping outputs remains unclear, hindering further advancements in applications like knowledge editing. Our study first investigates the numerical relationship between neuron activations and model output and discovers the global linear relationship between them through neuron interventions on a knowledge probing dataset. 
We refer to the gradient of this linear relationship as neuron empirical gradient (NEG), and introduce NeurGrad, an accurate and efficient method for computing NEG. NeurGrad enables quantitative analysis of all neurons in PLMs, advancing our understanding of neurons' controllability. Furthermore, we explore NEG's ability to represent language skills across diverse prompts via skill neuron probing. Experiments on MCEval8k, a multi-choice knowledge benchmark spanning various genres, validate NEG's representational ability. The data and code are released. </p> </div> </dd> <dt> <a name='item388'>[388]</a> <a href ="/abs/2412.18196" title="Abstract" id="2412.18196"> arXiv:2412.18196 </a> (replaced) [<a href="/pdf/2412.18196" title="Download PDF" id="pdf-2412.18196" aria-labelledby="pdf-2412.18196">pdf</a>, <a href="https://arxiv.org/html/2412.18196v2" title="View HTML" id="html-2412.18196" aria-labelledby="html-2412.18196" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18196" title="Other formats" id="oth-2412.18196" aria-labelledby="oth-2412.18196">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robustness-aware Automatic Prompt Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Z">Zeru Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhenting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Y">Yongye Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Weidi Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Hang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruixiang Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The performance of Large Language Models (LLMs) depends on the quality of prompts and the semantic and structural integrity of the input data. However, existing prompt generation methods primarily focus on well-structured input data, often neglecting the impact of perturbed inputs on prompt effectiveness. To address this limitation, we propose BATprompt (By Adversarial Training prompt), a novel method for prompt generation designed to withstand input perturbations (such as typos in the input). Inspired by adversarial training techniques, BATprompt demonstrates strong performance on a variety of perturbed tasks through a two-step process: adversarial perturbation and iterative optimization on unperturbed input via LLM. Unlike conventional adversarial attack methods, BATprompt does not need access to model parameters and gradients. Instead, BATprompt leverages the advanced reasoning, language understanding and self reflection capabilities of LLMs to simulate gradients, guiding the generation of adversarial perturbations and optimizing prompt performance. We evaluate BATprompt on multiple datasets across both language understanding and generation tasks. The results indicate that BATprompt outperforms existing prompt generation methods, delivering superior robustness and performance under diverse perturbation scenarios. 
</p> </div> </dd> <dt> <a name='item389'>[389]</a> <a href ="/abs/2412.18367" title="Abstract" id="2412.18367"> arXiv:2412.18367 </a> (replaced) [<a href="/pdf/2412.18367" title="Download PDF" id="pdf-2412.18367" aria-labelledby="pdf-2412.18367">pdf</a>, <a href="https://arxiv.org/html/2412.18367v5" title="View HTML" id="html-2412.18367" aria-labelledby="html-2412.18367" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18367" title="Other formats" id="oth-2412.18367" aria-labelledby="oth-2412.18367">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Global AI Inclusivity: A Large-Scale Multilingual Terminology Dataset (GIST) </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiarui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ouzzani,+I">Iman Ouzzani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenkai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lechen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+T">Tianyue Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouamor,+H">Houda Bouamor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Z">Zhijing Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diab,+M">Mona Diab</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The field of machine translation has achieved significant advancements, yet domain-specific terminology translation, particularly in AI, remains challenging. We introduce GIST, a large-scale multilingual AI terminology dataset containing 5K terms extracted from top AI conference papers spanning 2000 to 2023. 
The terms are translated into Arabic, Chinese, French, Japanese, and Russian using a hybrid framework that combines LLMs for extraction with human expertise for translation. The dataset's quality is benchmarked against existing resources, demonstrating superior translation accuracy through crowdsourced evaluation. GIST is integrated into translation workflows using post-translation refinement methods that require no retraining, where LLM prompting consistently improves BLEU and COMET scores. A web demonstration on the ACL Anthology platform highlights its practical application, showcasing improved accessibility for non-English speakers. This work aims to address critical gaps in AI terminology resources and fosters global inclusivity and collaboration in AI research. </p> </div> </dd> <dt> <a name='item390'>[390]</a> <a href ="/abs/2412.18547" title="Abstract" id="2412.18547"> arXiv:2412.18547 </a> (replaced) [<a href="/pdf/2412.18547" title="Download PDF" id="pdf-2412.18547" aria-labelledby="pdf-2412.18547">pdf</a>, <a href="https://arxiv.org/html/2412.18547v4" title="View HTML" id="html-2412.18547" aria-labelledby="html-2412.18547" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18547" title="Other formats" id="oth-2412.18547" aria-labelledby="oth-2412.18547">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Token-Budget-Aware LLM Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+T">Tingxu Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhenting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+C">Chunrong Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shiyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shiqing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhenyu 
Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Reasoning is critical for large language models (LLMs) to excel in a wide range of tasks. While methods like Chain-of-Thought (CoT) reasoning enhance LLM performance by decomposing problems into intermediate steps, they also incur significant overhead in token usage, leading to increased costs. We find that the reasoning process of current LLMs is unnecessarily lengthy and it can be compressed by including a reasonable token budget in the prompt, but the choice of token budget plays a crucial role in the actual compression effectiveness. We then propose a token-budget-aware LLM reasoning framework, which dynamically estimates token budgets for different problems based on reasoning complexity and uses the estimated token budgets to guide the reasoning process. Experiments show that our method effectively reduces token costs in CoT reasoning with only a slight performance reduction, offering a practical solution to balance efficiency and accuracy in LLM reasoning. Code: <a href="https://github.com/GeniusHTX/TALE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item391'>[391]</a> <a href ="/abs/2412.21036" title="Abstract" id="2412.21036"> arXiv:2412.21036 </a> (replaced) [<a href="/pdf/2412.21036" title="Download PDF" id="pdf-2412.21036" aria-labelledby="pdf-2412.21036">pdf</a>, <a href="https://arxiv.org/html/2412.21036v2" title="View HTML" id="html-2412.21036" aria-labelledby="html-2412.21036" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.21036" title="Other formats" id="oth-2412.21036" aria-labelledby="oth-2412.21036">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GePBench: Evaluating Fundamental Geometric Perception for Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+S">Shangyu Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+C">Changhao Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Y">Yuteng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yue,+Y">Yifan Yue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zhen Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xinyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zhangtai Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+F">Fei Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+X">Xinyu Dai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multimodal large language models (MLLMs) have made significant progress in integrating visual and linguistic understanding. 
Existing benchmarks typically focus on high-level semantic capabilities, such as scene understanding and visual reasoning, but often overlook a crucial, foundational ability: geometric perception. Geometric perception involves understanding geometric shapes, structures, and spatial relationships, which are essential for supporting higher-level semantic tasks. Despite its importance, this capability remains underexplored in current MLLM research. To address this gap, we introduce GePBench, a novel benchmark designed to assess the geometric perception abilities of MLLMs. Our extensive evaluations reveal that current state-of-the-art MLLMs exhibit significant deficiencies in geometric perception tasks. Furthermore, we show that models trained with GePBench data demonstrate substantial improvements on a wide range of benchmark tasks, highlighting the critical role of geometric perception in enabling advanced multimodal applications. Our code and datasets will be publicly available. </p> </div> </dd> <dt> <a name='item392'>[392]</a> <a href ="/abs/2501.01046" title="Abstract" id="2501.01046"> arXiv:2501.01046 </a> (replaced) [<a href="/pdf/2501.01046" title="Download PDF" id="pdf-2501.01046" aria-labelledby="pdf-2501.01046">pdf</a>, <a href="https://arxiv.org/html/2501.01046v2" title="View HTML" id="html-2501.01046" aria-labelledby="html-2501.01046" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01046" title="Other formats" id="oth-2501.01046" aria-labelledby="oth-2501.01046">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Son,+Y">Youngjun Son</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+C">Chaewon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaejin 
Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Dataset deduplication plays a crucial role in enhancing data quality, ultimately improving the training performance and efficiency of large language models. A commonly used method for data deduplication is the MinHash LSH algorithm. Recently, NVIDIA introduced a GPU-based MinHash LSH deduplication method, but it remains suboptimal, leaving room for further improvement in processing efficiency. This paper proposes a GPU-accelerated deduplication framework, FED, that optimizes MinHash LSH for GPU clusters and leverages computationally efficient, partially reusable non-cryptographic hash functions. FED significantly outperforms the CPU-based deduplication tool in SlimPajama (using 64 logical CPU cores) by up to 107.2 times and the GPU-based tool in NVIDIA NeMo Curator by up to 6.3 times when processing 30 million documents on a node with four GPUs. Notably, our method dramatically accelerates the previously time-consuming MinHash signature generation phase, achieving speed-ups of up to 260 compared to the CPU baseline. Despite these gains in efficiency, FED maintains high deduplication quality, with the duplicate document sets reaching a Jaccard similarity of over 0.96 compared to those identified by the standard MinHash algorithm. In large-scale experiments, the deduplication of 1.2 trillion tokens is completed in just 6 hours in a four-node, 16-GPU environment. The related code is publicly available on GitHub (\href{<a href="https://github.com/mcrl/FED" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/mcrl/FED" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}). 
</p> </div> </dd> <dt> <a name='item393'>[393]</a> <a href ="/abs/2501.01743" title="Abstract" id="2501.01743"> arXiv:2501.01743 </a> (replaced) [<a href="/pdf/2501.01743" title="Download PDF" id="pdf-2501.01743" aria-labelledby="pdf-2501.01743">pdf</a>, <a href="https://arxiv.org/html/2501.01743v2" title="View HTML" id="html-2501.01743" aria-labelledby="html-2501.01743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01743" title="Other formats" id="oth-2501.01743" aria-labelledby="oth-2501.01743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automating Legal Concept Interpretation with LLMs: Retrieval, Generation, and Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangcheng Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Q">Quzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+C">Cong Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+Y">Yansong Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Legal articles often include vague concepts for adapting to the ever-changing society. Providing detailed interpretations of these concepts is a critical and challenging task even for legal practitioners. It requires meticulous and professional annotations and summarizations by legal experts, which are admittedly time-consuming and expensive to collect at scale. By emulating legal experts' doctrinal method, we introduce a novel framework, ATRIE, using large language models (LLMs) to AuTomatically Retrieve concept-related information, Interpret legal concepts, and Evaluate generated interpretations, eliminating dependence on legal experts. 
ATRIE comprises a legal concept interpreter and a legal concept interpretation evaluator. The interpreter uses LLMs to retrieve relevant information from judicial precedents and interpret legal concepts. The evaluator uses performance changes on legal concept entailment, a downstream task we propose, as a proxy of interpretation quality. Automatic and multifaceted human evaluations indicate that the quality of our interpretations is comparable to those written by legal experts, with superior comprehensiveness and readability. Although there remains a slight gap in accuracy, it can already assist legal practitioners in improving the efficiency of concept interpretation. </p> </div> </dd> <dt> <a name='item394'>[394]</a> <a href ="/abs/2501.02795" title="Abstract" id="2501.02795"> arXiv:2501.02795 </a> (replaced) [<a href="/pdf/2501.02795" title="Download PDF" id="pdf-2501.02795" aria-labelledby="pdf-2501.02795">pdf</a>, <a href="https://arxiv.org/html/2501.02795v3" title="View HTML" id="html-2501.02795" aria-labelledby="html-2501.02795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.02795" title="Other formats" id="oth-2501.02795" aria-labelledby="oth-2501.02795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InfiFusion: A Unified Framework for Enhanced Cross-Model Reasoning via LLM Fusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Z">Zhaoyi Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yiming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Baoyi He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Y">Yuhao Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Q">Qi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sang,+Z">Zhijie Sang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+C">Chunlin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shengyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Significant performance improvements over the previous version; under review; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> We introduce InfiFusion, an efficient training pipeline designed to integrate multiple domain-specialized Large Language Models (LLMs) into a single pivot model, effectively harnessing the strengths of each source model. Traditional fusion methods either merge model parameters directly or rely on knowledge distillation with rigid assumptions, limiting their flexibility and efficiency. InfiFusion overcomes these limitations by enhancing Universal Logit Distillation (ULD) with Top-K selection and Logits Standardization. We propose two fusion strategies: Pairwise Fusion (InfiFusion$_p$), where each source model's knowledge is distilled individually into the pivot model followed by merging, and Unified Fusion (InfiFusion$_u$), where knowledge from all source models is distilled simultaneously into the pivot model. InfiFusion outperforms the state-of-the-art models, such as Qwen-2.5-14B-Instruct and Phi-4, across 11 widely applied benchmarks covering reasoning, coding, mathematics, and instruction-following tasks. Notably, InfiFusion achieves this superior performance while significantly reducing computational costs, completing full training with only 160 H800 GPU hours compared to the millions typically required for traditional LLM training. 
</p> </div> </dd> <dt> <a name='item395'>[395]</a> <a href ="/abs/2501.02979" title="Abstract" id="2501.02979"> arXiv:2501.02979 </a> (replaced) [<a href="/pdf/2501.02979" title="Download PDF" id="pdf-2501.02979" aria-labelledby="pdf-2501.02979">pdf</a>, <a href="https://arxiv.org/html/2501.02979v2" title="View HTML" id="html-2501.02979" aria-labelledby="html-2501.02979" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.02979" title="Other formats" id="oth-2501.02979" aria-labelledby="oth-2501.02979">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Registering Source Tokens to Target Language Spaces in Multilingual Neural Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+Z">Zhi Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yiran Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+J">Jiannan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+C">Chenchen Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tanaka,+H">Hideki Tanaka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Utiyama,+M">Masao Utiyama</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Watanabe,+T">Taro Watanabe</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The multilingual neural machine translation (MNMT) aims for arbitrary translations across multiple languages. Although MNMT-specific models trained by parallel data offer low costs in training and deployment, their performance consistently lags behind that of large language models (LLMs). In this work, we introduce registering, a novel method that enables a small MNMT-specific model to compete with LLMs. 
Specifically, we insert a set of artificial tokens specifying the target language, called registers, into the input sequence between the source and target tokens. By modifying the attention mask, the target token generation only pays attention to the activation of registers, representing the source tokens in the target language space. Experiments on EC-40, a large-scale benchmark, show that our method advances the state-of-the-art of MNMT. We further pre-train two models, namely MITRE (multilingual translation with registers), by 9.3 billion sentence pairs across 24 languages collected from public corpus. One of them, MITRE-913M, outperforms NLLB-3.3B, achieves comparable performance with commercial LLMs, and shows strong adaptability in fine-tuning. Finally, we open-source our models to facilitate further research and development in MNMT: <a href="https://github.com/zhiqu22/mitre" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item396'>[396]</a> <a href ="/abs/2501.03035" title="Abstract" id="2501.03035"> arXiv:2501.03035 </a> (replaced) [<a href="/pdf/2501.03035" title="Download PDF" id="pdf-2501.03035" aria-labelledby="pdf-2501.03035">pdf</a>, <a href="https://arxiv.org/html/2501.03035v2" title="View HTML" id="html-2501.03035" aria-labelledby="html-2501.03035" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03035" title="Other formats" id="oth-2501.03035" aria-labelledby="oth-2501.03035">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quantization Meets Reasoning: Exploring LLM Low-Bit Quantization Degradation for Mathematical Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Y">Yupeng Su</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+R">Runming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+C">Congkai Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhongwei Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+N">Ngai Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models have achieved significant advancements in complex mathematical reasoning benchmarks, such as MATH. However, their substantial computational requirements present challenges for practical deployment. Model quantization has emerged as an effective strategy to reduce memory usage and computational costs by employing lower precision and bit-width representations. In this study, we systematically evaluate the impact of quantization on mathematical reasoning tasks. Our results demonstrate that aggressive quantization methods like AWQ and GPTQ introduce up to 32.39% accuracy degradation (average 11.31%) on Llama-3 models, particularly in numerical computation and reasoning planning. To address this, we introduce a multidimensional evaluation framework combining qualitative capability analysis and quantitative error assessment. We further develop targeted recovery strategies, showing that fine-tuning quantized models on only 545 task-specific examples for 3 minutes on 4 GPUs effectively restores reasoning capabilities to near full-precision levels. 
Additionally, our error assessment pipeline achieves 98.9% accuracy in diagnosing and localizing errors across 3,366 failure cases, providing actionable insights for mitigating quantization-induced degradation. </p> </div> </dd> <dt> <a name='item397'>[397]</a> <a href ="/abs/2501.03191" title="Abstract" id="2501.03191"> arXiv:2501.03191 </a> (replaced) [<a href="/pdf/2501.03191" title="Download PDF" id="pdf-2501.03191" aria-labelledby="pdf-2501.03191">pdf</a>, <a href="https://arxiv.org/html/2501.03191v2" title="View HTML" id="html-2501.03191" aria-labelledby="html-2501.03191" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03191" title="Other formats" id="oth-2501.03191" aria-labelledby="oth-2501.03191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLIX: Cross-Lingual Explanations of Idiomatic Expressions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gluck,+A">Aaron Gluck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=von+der+Wense,+K">Katharina von der Wense</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pacheco,+M+L">Maria Leonor Pacheco</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Automated definition generation systems have been proposed to support vocabulary expansion for language learners. The main barrier to the success of these systems is that learners often struggle to understand definitions due to the presence of potentially unfamiliar words and grammar, particularly when non-standard language is involved. To address these challenges, we propose CLIX, the task of Cross-Lingual explanations of Idiomatic eXpressions. 
We explore the capabilities of current NLP models for this task, and observe that while it remains challenging, large language models show promise. Finally, we perform a detailed error analysis to highlight the key challenges that need to be addressed before we can reliably incorporate these systems into educational tools. </p> </div> </dd> <dt> <a name='item398'>[398]</a> <a href ="/abs/2501.03226" title="Abstract" id="2501.03226"> arXiv:2501.03226 </a> (replaced) [<a href="/pdf/2501.03226" title="Download PDF" id="pdf-2501.03226" aria-labelledby="pdf-2501.03226">pdf</a>, <a href="https://arxiv.org/html/2501.03226v3" title="View HTML" id="html-2501.03226" aria-labelledby="html-2501.03226" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03226" title="Other formats" id="oth-2501.03226" aria-labelledby="oth-2501.03226">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Beichen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xiaoyi Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zang,+Y">Yuhang Zang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+P">Pan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+H">Haodong Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yuhang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+D">Dahua Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiaqi Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Codes and Data are available at <a 
href="https://github.com/beichenzbc/BoostStep" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated impressive ability in solving complex mathematical problems with multi-step reasoning and can be further enhanced with well-designed in-context learning (ICL) examples. However, this potential is often constrained by two major challenges in ICL: granularity mismatch and irrelevant information. We observe that while LLMs excel at decomposing mathematical problems, they often struggle with reasoning errors in fine-grained steps. Moreover, ICL examples retrieved at the question level may omit critical steps or even mislead the model with irrelevant details. To address this issue, we propose BoostStep, a method that enhances reasoning accuracy through step-aligned ICL, a novel mechanism that carefully aligns retrieved reference steps with the corresponding reasoning steps. Additionally, BoostStep incorporates an effective "first-try" strategy to deliver exemplars highly relevant to the current state of reasoning. BoostStep is a flexible and powerful method that integrates seamlessly with chain-of-thought (CoT) and tree search algorithms, refining both candidate selection and decision-making. Empirical results show that BoostStep improves GPT-4o's CoT performance by 4.6% across mathematical benchmarks, significantly surpassing traditional few-shot learning's 1.2%. Moreover, it can achieve an additional 7.5\% gain combined with tree search. Surprisingly, it enhances state-of-the-art LLMs to solve challenging math problems using simpler examples. It improves DeepSeek-R1-671B's performance on AIME by 2.2%, leveraging simple examples only from the MATH dataset. 
</p> </div> </dd> <dt> <a name='item399'>[399]</a> <a href ="/abs/2501.03884" title="Abstract" id="2501.03884"> arXiv:2501.03884 </a> (replaced) [<a href="/pdf/2501.03884" title="Download PDF" id="pdf-2501.03884" aria-labelledby="pdf-2501.03884">pdf</a>, <a href="/format/2501.03884" title="Other formats" id="oth-2501.03884" aria-labelledby="oth-2501.03884">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AlphaPO - Reward shape matters for LLM alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+A">Aman Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+S">Shao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Q">Qingquan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Sirou Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+J">Jiwoo Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saha,+A">Ankan Saha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+V">Viral Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+N">Noah Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+E">Eunki Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Siyu Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+P">Parag Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pillai,+N">Natesh Pillai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Keerthi,+S+S">S. 
Sathiya Keerthi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned. Some popular examples of DAAs include Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO). These methods often suffer from likelihood displacement, a phenomenon by which the probabilities of preferred responses are often reduced undesirably. <br>In this paper, we argue that, for DAAs the reward (function) shape matters. We introduce \textbf{AlphaPO}, a new DAA method that leverages an $\alpha$-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and over-optimization. Compared to SimPO, one of the best performing DAAs, AlphaPO leads to about 7\% to 10\% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B while achieving 15\% to 50\% relative improvement over DPO on the same models. The analysis and results presented highlight the importance of the reward shape, and how one can systematically change it to affect training dynamics, as well as improve alignment performance. 
</p> </div> </dd> <dt> <a name='item400'>[400]</a> <a href ="/abs/2501.04945" title="Abstract" id="2501.04945"> arXiv:2501.04945 </a> (replaced) [<a href="/pdf/2501.04945" title="Download PDF" id="pdf-2501.04945" aria-labelledby="pdf-2501.04945">pdf</a>, <a href="https://arxiv.org/html/2501.04945v3" title="View HTML" id="html-2501.04945" aria-labelledby="html-2501.04945" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.04945" title="Other formats" id="oth-2501.04945" aria-labelledby="oth-2501.04945">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-by-Step Mastery: Enhancing Soft Constraint Following Ability of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Q">Qingyu Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+J">Jie Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Q">Qianyu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+J">Jiaqing Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yanghua Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Weikang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zeye Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> It is crucial for large language models (LLMs) to follow instructions that involve multiple constraints. However, it is an unexplored area to enhance LLMs' ability to follow soft constraints. To bridge the gap, we initially design a pipeline to construct datasets with high-quality outputs automatically. 
Additionally, to fully utilize the positive and negative samples generated during the data construction process, we choose Direct Preference Optimization (DPO) as the training method. Furthermore, taking into account the difficulty of soft constraints indicated by the number of constraints, we design a curriculum learning training paradigm based on the constraint quantity. We experimentally evaluate the effectiveness of our methods in improving LLMs' soft constraint following ability and analyze the factors driving the <a href="http://improvements.The" rel="external noopener nofollow" class="link-external link-http">this http URL</a> datasets and code are publicly available at <a href="https://github.com/Rainier-rq/FollowSoftConstraint" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item401'>[401]</a> <a href ="/abs/2501.07824" title="Abstract" id="2501.07824"> arXiv:2501.07824 </a> (replaced) [<a href="/pdf/2501.07824" title="Download PDF" id="pdf-2501.07824" aria-labelledby="pdf-2501.07824">pdf</a>, <a href="https://arxiv.org/html/2501.07824v2" title="View HTML" id="html-2501.07824" aria-labelledby="html-2501.07824" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.07824" title="Other formats" id="oth-2501.07824" aria-labelledby="oth-2501.07824">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Real-time Verification and Refinement of Language Model Text Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ko,+J">Joonho Ko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+J">Jinheon Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S+J">Sung Ju Hwang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable performance across a wide range of natural language tasks. However, a critical challenge remains in that they sometimes generate factually incorrect answers. To address this, while many previous work has focused on identifying errors in their generation and further refining them, they are slow in deployment since they are designed to verify the response from LLMs only after their entire generation (from the first to last tokens) is done. Further, we observe that once LLMs generate incorrect tokens early on, there is a higher likelihood that subsequent tokens will also be factually incorrect. To this end, in this work, we propose Streaming-VR (Streaming Verification and Refinement), a novel approach designed to enhance the efficiency of verification and refinement of LLM outputs. Specifically, the proposed Streaming-VR enables on-the-fly verification and correction of tokens as they are being generated, similar to a streaming process, ensuring that each subset of tokens is checked and refined in real-time by another LLM as the LLM constructs its response. Through comprehensive evaluations on multiple datasets, we demonstrate that our approach not only enhances the factual accuracy of LLMs, but also offers a more efficient solution compared to prior refinement methods. 
</p> </div> </dd> <dt> <a name='item402'>[402]</a> <a href ="/abs/2501.09766" title="Abstract" id="2501.09766"> arXiv:2501.09766 </a> (replaced) [<a href="/pdf/2501.09766" title="Download PDF" id="pdf-2501.09766" aria-labelledby="pdf-2501.09766">pdf</a>, <a href="https://arxiv.org/html/2501.09766v2" title="View HTML" id="html-2501.09766" aria-labelledby="html-2501.09766" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.09766" title="Other formats" id="oth-2501.09766" aria-labelledby="oth-2501.09766">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> iTool: Boosting Tool Use of Large Language Models via Iterative Reinforced Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Y">Yirong Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+X">Xiao Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuxian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Weiwen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ning,+W">Wu Ning</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+Y">Yutai Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Augmenting large language models (LLMs) with external tools is known as a promising approach to enhancing their capabilities, especially for complex tasks. 
Synthesizing tool-use data through real-world simulations is an effective way to achieve it. Nevertheless, our investigation reveals that (1) training gains significantly decay as synthetic data increases. The model struggles to benefit from more synthetic data due to potential data diversity issues, resulting in poor performance in complex scenarios. Moreover, we find that (2) this challenge primarily manifests as minor discrepancies between the model's output and the ground truth response (termed as deficiency), such as errors in parameter values that require complex reasoning from the context to resolve. To this end, we propose an iterative reinforced fine-tuning strategy designed to alleviate these challenges. This strategy involves: (1) enhancing the diversity of synthetic data through path exploration of Monte Carlo Tree Search. (2) iteratively identifying deficiency-related data, constructing fine-grained preference pairs to pinpoint deficiencies, and then applying preference optimization to optimize these deficiencies. Our experiments show that models trained using our method achieve about 3\% better performance than same-size models, outperforming larger open-source and closed-source models. 
</p> </div> </dd> <dt> <a name='item403'>[403]</a> <a href ="/abs/2501.11790" title="Abstract" id="2501.11790"> arXiv:2501.11790 </a> (replaced) [<a href="/pdf/2501.11790" title="Download PDF" id="pdf-2501.11790" aria-labelledby="pdf-2501.11790">pdf</a>, <a href="https://arxiv.org/html/2501.11790v2" title="View HTML" id="html-2501.11790" aria-labelledby="html-2501.11790" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.11790" title="Other formats" id="oth-2501.11790" aria-labelledby="oth-2501.11790">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Benchmarking Large Language Models via Random Variables </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+Z">Zijin Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+S">Su Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+J">Junnan Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yilin Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yujing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Feiran Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xiao Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent studies have raised 
concerns about the reliability of current mathematical benchmarks, highlighting issues such as simplistic design and potential data contamination. Therefore, creating a reliable benchmark that effectively evaluates the genuine capabilities of large language models (LLMs) in mathematical reasoning remains a significant challenge. To address this, we propose RV-Bench, a framework for Benchmarking LLMs via Random Variables in mathematical reasoning. Specifically, the background content of a random variable question (RV question) mirrors the original problem in existing benchmarks, but the variable combinations are randomized, making it "unseen" by the LLMs. Models must completely understand the question pattern of the original problem to correctly answer RV questions with various variable values. As a result, the LLM's genuine capability in mathematical reasoning is reflected by its accuracy and robustness on RV-Bench. We conducted extensive experiments on over 30 representative LLMs across more than 1000 RV questions. Our findings suggest that LLMs exhibit an imbalance in proficiency between encountered and "unseen" data domains. Proficiency generalization across similar mathematical reasoning tasks is verified to be limited by accuracy and robustness, but it can still be enhanced through test-time scaling. 
</p> </div> </dd> <dt> <a name='item404'>[404]</a> <a href ="/abs/2501.12051" title="Abstract" id="2501.12051"> arXiv:2501.12051 </a> (replaced) [<a href="/pdf/2501.12051" title="Download PDF" id="pdf-2501.12051" aria-labelledby="pdf-2501.12051">pdf</a>, <a href="https://arxiv.org/html/2501.12051v2" title="View HTML" id="html-2501.12051" aria-labelledby="html-2501.12051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.12051" title="Other formats" id="oth-2501.12051" aria-labelledby="oth-2501.12051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedS$^3$: Towards Medical Small Language Models with Self-Evolved Slow Thinking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shuyang Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+Y">Yusheng Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhe Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Ya Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanfeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Medical language models (MLMs) have become pivotal in advancing medical natural language processing. However, prior models that rely on pre-training or supervised fine-tuning often exhibit low data efficiency and limited practicality in real-world clinical applications. 
While OpenAI's o1 highlights test-time scaling in mathematics, attempts to replicate this approach in medicine typically distill responses from GPT-series models to open-source models, focusing primarily on multiple-choice tasks. This strategy, though straightforward, neglects critical concerns like data privacy and realistic deployment in clinical settings. In this work, we present a deployable, small-scale medical reasoning system, MedS3, designed for long-chain reasoning in clinical tasks using a self-evolution paradigm. Starting with a seed dataset of around 8,000 instances spanning five domains and 16 datasets, we prompt a base policy model to perform Monte Carlo Tree Search (MCTS) to construct rule-verifiable reasoning chains. Each reasoning step is assigned an evolution rollout value, allowing verified trajectories to train the policy model and the process reward model (PRM). During inference, the policy model generates multiple responses, and the reward model selects the one with a newly proposed PRM-guided Vote-Sum (P-VS) strategy. Experiments on eleven evaluation datasets demonstrate that MedS3 outperforms not only the prior strongest medical model by 6.59, but also 32B-level general reasoning models by 8.71 points. Code and data are available at <a href="https://github.com/pixas/MedSSS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item405'>[405]</a> <a href ="/abs/2501.12619" title="Abstract" id="2501.12619"> arXiv:2501.12619 </a> (replaced) [<a href="/pdf/2501.12619" title="Download PDF" id="pdf-2501.12619" aria-labelledby="pdf-2501.12619">pdf</a>, <a href="https://arxiv.org/html/2501.12619v3" title="View HTML" id="html-2501.12619" aria-labelledby="html-2501.12619" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.12619" title="Other formats" id="oth-2501.12619" aria-labelledby="oth-2501.12619">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quantification of Large Language Model Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sunbowen Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Junting Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ao,+C">Chang Ao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaige Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+X">Xinrun Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Sirui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Haihong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianci Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alinejad-Rokny,+H">Hamid Alinejad-Rokny</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Min Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yitao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zhoufutu Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+S">Shiwen Ni</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Model distillation is a fundamental technique in building large language models (LLMs), transferring knowledge from a teacher model to a student model. However, distillation can lead to model homogenization, reducing diversity among models and impairing their ability to robustly handle complex or novel tasks. These limitations underscore the need to systematically quantify the distillation process and its impact. In this work, we propose a framework to evaluate and quantify model distillation. Our method addresses two key aspects: (1) Identifying identity cognition contradictions to assess discrepancies in how models perceive and represent identity-related information, and (2) Analyzing multi-granularity response similarities across models to measure the extent of homogenization. Experimental results demonstrate two key insights: (1) Well-known closed-source and open-source LLMs usually exhibit high distillation degrees, except for Claude, Doubao, and Gemini. (2) Base LLMs show higher distillation degrees compared to aligned LLMs. By offering a systematic approach to improve the transparency of LLM data distillation, we call for LLMs with more independent development and more transparent technical reports to improve LLMs' robustness and safety. The code and data are available under <a href="https://github.com/Aegis1863/LLMs-Distillation-Quantification" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item406'>[406]</a> <a href ="/abs/2501.13115" title="Abstract" id="2501.13115"> arXiv:2501.13115 </a> (replaced) [<a href="/pdf/2501.13115" title="Download PDF" id="pdf-2501.13115" aria-labelledby="pdf-2501.13115">pdf</a>, <a href="https://arxiv.org/html/2501.13115v2" title="View HTML" id="html-2501.13115" aria-labelledby="html-2501.13115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13115" title="Other formats" id="oth-2501.13115" aria-labelledby="oth-2501.13115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dagger Behind Smile: Fool LLMs with a Happy Ending Story </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xurui Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhixin Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huai,+S">Shuo Huai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+J">Jiayi Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jun Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> The wide adoption of Large Language Models (LLMs) has attracted significant attention from $\textit{jailbreak}$ attacks, where adversarial prompts crafted through optimization or manual design exploit LLMs to generate malicious contents. However, optimization-based attacks have limited efficiency and transferability, while existing manual designs are either easily detectable or demand intricate interactions with LLMs. In this paper, we first point out a novel perspective for jailbreak attacks: LLMs are more responsive to $\textit{positive}$ prompts. 
Based on this, we deploy Happy Ending Attack (HEA) to wrap up a malicious request in a scenario template involving a positive prompt formed mainly via a $\textit{happy ending}$; it thus fools LLMs into jailbreaking either immediately or at a follow-up malicious request. This has made HEA both efficient and effective, as it requires only up to two turns to fully jailbreak LLMs.
href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+R">Rongxiang Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) generally utilize a consistent data distribution throughout the pretraining process. However, as the model's capability improves, it is intuitive that its data preferences dynamically change, indicating the need for pretraining with different data at various training stages. To achieve it, we propose the Perplexity Difference (PD) based Preference Curriculum learning (PDPC) framework, which always perceives and uses the data preferred by LLMs to train and boost them. First, we introduce the PD metric to quantify the difference in how challenging a sample is for weak versus strong models. Samples with high PD are more challenging for weak models to learn and are more suitable to be arranged in the later stage of pretraining. Second, we propose the preference function to approximate and predict the data preference of the LLM at any training step, so as to complete the arrangement of the dataset offline and ensure continuous training without interruption. Experimental results on 1.3B and 3B models demonstrate that PDPC significantly surpasses baselines. Notably, the 3B model trained on 1T tokens achieves an increased average accuracy of over 8.1% across MMLU and CMMLU. 
</p> </div> </dd> <dt> <a name='item408'>[408]</a> <a href ="/abs/2501.13669" title="Abstract" id="2501.13669"> arXiv:2501.13669 </a> (replaced) [<a href="/pdf/2501.13669" title="Download PDF" id="pdf-2501.13669" aria-labelledby="pdf-2501.13669">pdf</a>, <a href="https://arxiv.org/html/2501.13669v2" title="View HTML" id="html-2501.13669" aria-labelledby="html-2501.13669" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13669" title="Other formats" id="oth-2501.13669" aria-labelledby="oth-2501.13669">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How to Alleviate Catastrophic Forgetting in LLMs Finetuning? Hierarchical Layer-Wise and Element-Wise Regularization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+S">Shezheng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Hao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shasha Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+L">Long Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+Q">Qian Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaodong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jie Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit strong general language capabilities. 
However, fine-tuning these models on domain-specific tasks often leads to catastrophic forgetting, where the model overwrites or loses essential knowledge acquired during pretraining. This phenomenon significantly limits the broader applicability of LLMs. To address this challenge, we propose a novel approach to compute the element-wise importance of model parameters crucial for preserving general knowledge during fine-tuning. Our method utilizes a dual-objective optimization strategy: (1) regularization loss based on element-wise parameter importance, which constrains the updates to parameters crucial for general knowledge; (2) cross-entropy loss to adapt to domain-specific tasks. Additionally, we introduce layer-wise coefficients to account for the varying contributions of different layers, dynamically balancing the dual-objective optimization. Extensive experiments on scientific, medical, and physical tasks using GPT-J and LLaMA-3 demonstrate that our approach mitigates catastrophic forgetting while enhancing model adaptability. Compared to previous methods, our solution is approximately 20 times faster and requires only 10-15% of the storage, highlighting the practical efficiency. The code will be released. 
</p> </div> </dd> <dt> <a name='item409'>[409]</a> <a href ="/abs/2501.15175" title="Abstract" id="2501.15175"> arXiv:2501.15175 </a> (replaced) [<a href="/pdf/2501.15175" title="Download PDF" id="pdf-2501.15175" aria-labelledby="pdf-2501.15175">pdf</a>, <a href="https://arxiv.org/html/2501.15175v2" title="View HTML" id="html-2501.15175" aria-labelledby="html-2501.15175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.15175" title="Other formats" id="oth-2501.15175" aria-labelledby="oth-2501.15175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Option-ID Based Elimination For Multiple Choice Questions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zhenhao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Bulou Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+Q">Qingyao Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yiqun Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Multiple choice questions (MCQs) are a popular and important task for evaluating large language models (LLMs). Based on common strategies people use when answering MCQs, the process of elimination (PoE) has been proposed as an effective problem-solving method. Existing methods to the PoE generally fall into two categories: one involves having the LLM directly select the incorrect options, while the other involves scoring the options. However, both methods incur high computational costs and often perform worse than methods that directly answer the MCQs with the option IDs. To address this issue, this paper proposes a PoE based on option ID. 
Specifically, our method eliminates an option by selecting
Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+Y">Yi-Li Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rohatgi,+S">Shaurya Rohatgi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chieh-Yang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+H+Y+S">Ho Yin Sam Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rossi,+R">Ryan Rossi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sungchul Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ku,+L">Lun-Wei Ku</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Giles,+C+L">C. Lee Giles</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+K">Ting-Hao K. Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to TACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Since the SCICAP datasets launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SCICAP Challenge took place, inviting global teams to use an expanded SCICAP dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SCICAP Challenge and details the performance of various models on its data, capturing a snapshot of the fields state. 
We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures? </p> </div> </dd> <dt> <a name='item411'>[411]</a> <a href ="/abs/2502.00997" title="Abstract" id="2502.00997"> arXiv:2502.00997 </a> (replaced) [<a href="/pdf/2502.00997" title="Download PDF" id="pdf-2502.00997" aria-labelledby="pdf-2502.00997">pdf</a>, <a href="https://arxiv.org/html/2502.00997v3" title="View HTML" id="html-2502.00997" aria-labelledby="html-2502.00997" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00997" title="Other formats" id="oth-2502.00997" aria-labelledby="oth-2502.00997">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MergeME: Model Merging Techniques for Homogeneous and Heterogeneous MoEs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuhang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karamanolakis,+G">Giannis Karamanolakis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soto,+V">Victor Soto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rumshisky,+A">Anna Rumshisky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulkarni,+M">Mayank Kulkarni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Furong Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+W">Wei Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jianhua Lu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 Main </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The recent success of specialized Large Language Models (LLMs) in domains such as mathematical reasoning and coding has led to growing interest in methods for merging these expert LLMs into a unified Mixture-of-Experts (MoE) model, with the goal of enhancing performance in each domain while retaining effectiveness on general tasks. However, the effective merging of expert models remains an open challenge, especially for models with highly divergent weight parameters or different architectures. State-of-the-art MoE merging methods only work with homogeneous model architectures and rely on simple unweighted averaging to merge expert layers, which does not address parameter interference and requires extensive fine-tuning of the merged MoE to restore performance. To address these limitations, this paper introduces new MoE merging techniques, including strategies to mitigate parameter interference, routing heuristics to reduce the need for MoE fine-tuning, and a novel method for merging experts with different architectures. Extensive experiments across multiple domains demonstrate the effectiveness of our proposed methods, reducing fine-tuning costs, improving performance over state-of-the-art methods, and expanding the applicability of MoE merging. 
</p> </div> </dd> <dt> <a name='item412'>[412]</a> <a href ="/abs/2502.01220" title="Abstract" id="2502.01220"> arXiv:2502.01220 </a> (replaced) [<a href="/pdf/2502.01220" title="Download PDF" id="pdf-2502.01220" aria-labelledby="pdf-2502.01220">pdf</a>, <a href="https://arxiv.org/html/2502.01220v2" title="View HTML" id="html-2502.01220" aria-labelledby="html-2502.01220" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.01220" title="Other formats" id="oth-2502.01220" aria-labelledby="oth-2502.01220">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Models Struggle to Achieve a Consistent Temporal Representation of Facts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khodja,+H+A">Hichem Ammar Khodja</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=B%C3%A9chet,+F">Fr茅d茅ric B茅chet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brabant,+Q">Quentin Brabant</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nasr,+A">Alexis Nasr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lecorv%C3%A9,+G">Gw茅nol茅 Lecorv茅</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint v2 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Language Models (LMs) have shown substantial improvements in handling factual knowledge, yet their capability to consistently represent temporal facts, which are valid only within specific timeframes, remains underexplored. To investigate this, we introduce TimeStress, a novel dataset comprising 521K statements on 2003 of the most popular temporal facts in Wikidata. 
Each statement contextualizes a fact with correct and incorrect dates across three precisions (Day, Month, Year). This setup allows us to evaluate LMs' ability to discern between correct and incorrect temporal statements based on their probability of being generated. We assess 18 LMs across various architectures using two metrics: the win rate, indicating how often correct dates outperform incorrect ones, and robustness, reflecting consistent performance across all dates. Our findings reveal that while some LMs achieve a win rate exceeding 80\%, robustness remains low, with the best model achieving only 6\%. Furthermore, robust knowledge at one date precision does not reliably transfer to others, highlighting a significant generalization gap. These results underscore the struggle of LMs to maintain a consistent temporal representation, supporting their limitations as reliable sources of temporal knowledge. We provide all data and code for further research. </p> </div> </dd> <dt> <a name='item413'>[413]</a> <a href ="/abs/2502.02028" title="Abstract" id="2502.02028"> arXiv:2502.02028 </a> (replaced) [<a href="/pdf/2502.02028" title="Download PDF" id="pdf-2502.02028" aria-labelledby="pdf-2502.02028">pdf</a>, <a href="https://arxiv.org/html/2502.02028v2" title="View HTML" id="html-2502.02028" aria-labelledby="html-2502.02028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.02028" title="Other formats" id="oth-2502.02028" aria-labelledby="oth-2502.02028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fine-tuning Language Models for Recipe Generation: A Comparative Analysis and Benchmark Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Vij,+A">Anneketh Vij</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Changhao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nair,+R+A">Rahul Anil 
Nair</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ho,+T+E">Theodore Eugene Ho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+E">Edward Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhowmick,+A">Ayan Bhowmick</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 10 figures,14 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This research presents an exploration and study of the recipe generation task by fine-tuning various very small language models, with a focus on developing robust evaluation metrics and comparing across different language models the open-ended task of recipe generation. This study presents extensive experiments with multiple model architectures, ranging from T5-small (Raffel et al., 2023) and SmolLM-135M(Allal et al., 2024) to Phi-2 (Research, 2023), implementing both traditional NLP metrics and custom domain-specific evaluation metrics. Our novel evaluation framework incorporates recipe-specific metrics for assessing content quality and introduces approaches to allergen substitution. The results indicate that, while larger models generally perform better on standard metrics, the relationship between model size and recipe quality is more nuanced when considering domain-specific metrics. SmolLM-360M and SmolLM-1.7B demonstrate comparable performance despite their size difference before and after fine-tuning, while fine-tuning Phi-2 shows notable limitations in recipe generation despite its larger parameter count. The comprehensive evaluation framework and allergen substitution systems provide valuable insights for future work in recipe generation and broader NLG tasks that require domain expertise and safety considerations. 
</p> </div> </dd> <dt> <a name='item414'>[414]</a> <a href ="/abs/2502.03418" title="Abstract" id="2502.03418"> arXiv:2502.03418 </a> (replaced) [<a href="/pdf/2502.03418" title="Download PDF" id="pdf-2502.03418" aria-labelledby="pdf-2502.03418">pdf</a>, <a href="https://arxiv.org/html/2502.03418v2" title="View HTML" id="html-2502.03418" aria-labelledby="html-2502.03418" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.03418" title="Other formats" id="oth-2502.03418" aria-labelledby="oth-2502.03418">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Think or Step-by-Step? UnZIPping the Black Box in Zero-Shot Prompts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sadr,+N+G">Nikta Gohari Sadr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Madhusudan,+S">Sangmitra Madhusudan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Emami,+A">Ali Emami</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages (excluding references) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Zero-shot prompting techniques have significantly improved the performance of Large Language Models (LLMs). However, we lack a clear understanding of why zero-shot prompts are so effective. For example, in the prompt "Let's think step-by-step," is "think" or "step-by-step" more crucial to its success? Existing interpretability methods, such as gradient-based and attention-based approaches, are computationally intensive and restricted to open-source models. We introduce the ZIP score (Zero-shot Importance of Perturbation score), a versatile metric applicable to both open and closed-source models, based on systematic input word perturbations. 
Our experiments across four recent LLMs, seven widely-used prompts, and several tasks, reveal interesting patterns in word importance. For instance, while both 'step-by-step' and 'think' show high ZIP scores, which one is more influential depends on the model and task. We validate our method using controlled experiments and compare our results with human judgments, finding that proprietary models align more closely with human intuition regarding word significance. These findings enhance our understanding of LLM behavior and contribute to developing more effective zero-shot prompts and improved model analysis. </p> </div> </dd> <dt> <a name='item415'>[415]</a> <a href ="/abs/2502.04511" title="Abstract" id="2502.04511"> arXiv:2502.04511 </a> (replaced) [<a href="/pdf/2502.04511" title="Download PDF" id="pdf-2502.04511" aria-labelledby="pdf-2502.04511">pdf</a>, <a href="/format/2502.04511" title="Other formats" id="oth-2502.04511" aria-labelledby="oth-2502.04511">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Sample-Level Feedback: Using Reference-Level Feedback to Guide Data Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mehri,+S">Shuhaib Mehri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+H">Heng Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-T眉r</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> LLMs demonstrate remarkable capabilities in following natural language instructions, largely due to instruction-tuning on high-quality datasets. 
While synthetic data generation has emerged as a scalable approach for creating such datasets, maintaining consistent quality standards remains challenging. Recent approaches incorporate feedback to improve data quality, but typically operate at the sample level, generating and applying feedback for each response individually. In this work, we propose Reference-Level Feedback, a novel methodology that instead collects feedback based on high-quality reference samples from carefully curated seed data. We use this feedback to capture rich signals of desirable characteristics and propagate it throughout the data synthesis process. We present REFED, a dataset of 10K instruction-response pairs synthesized using such feedback. We demonstrate the effectiveness of our approach by showing that Llama-3.1-8B-Instruct finetuned on REFED achieves state-of-the-art performance among similar-sized SFT-based models on AlpacaEval 2.0 and strong results on Arena-Hard. Through extensive experiments, we show that our approach consistently outperforms traditional sample-level feedback methods with significantly fewer feedback collections and improves performance across different model architectures. 
</p> </div> </dd> <dt> <a name='item416'>[416]</a> <a href ="/abs/2502.04795" title="Abstract" id="2502.04795"> arXiv:2502.04795 </a> (replaced) [<a href="/pdf/2502.04795" title="Download PDF" id="pdf-2502.04795" aria-labelledby="pdf-2502.04795">pdf</a>, <a href="https://arxiv.org/html/2502.04795v2" title="View HTML" id="html-2502.04795" aria-labelledby="html-2502.04795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.04795" title="Other formats" id="oth-2502.04795" aria-labelledby="oth-2502.04795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developmentally-plausible Working Memory Shapes a Critical Period for Language Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mita,+M">Masato Mita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshida,+R">Ryo Yoshida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseki,+Y">Yohei Oseki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models possess general linguistic abilities but acquire language less efficiently than humans. This study proposes a method for integrating the developmental characteristics of working memory during the critical period, a stage when human language acquisition is particularly efficient, into the training process of language models. The proposed method introduces a mechanism that initially constrains working memory during the early stages of training and gradually relaxes this constraint in an exponential manner as learning progresses. 
Targeted syntactic evaluation shows that the proposed method outperforms conventional methods without memory constraints or with static memory constraints. These findings not only provide new directions for designing data-efficient language models but also offer indirect evidence supporting the role of the developmental characteristics of working memory as the underlying mechanism of the critical period in language acquisition. </p> </div> </dd> <dt> <a name='item417'>[417]</a> <a href ="/abs/2502.05551" title="Abstract" id="2502.05551"> arXiv:2502.05551 </a> (replaced) [<a href="/pdf/2502.05551" title="Download PDF" id="pdf-2502.05551" aria-labelledby="pdf-2502.05551">pdf</a>, <a href="https://arxiv.org/html/2502.05551v2" title="View HTML" id="html-2502.05551" aria-labelledby="html-2502.05551" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05551" title="Other formats" id="oth-2502.05551" aria-labelledby="oth-2502.05551">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FRAMES: Boosting LLMs with A Four-Quadrant Multi-Stage Pretraining Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuemiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+F">Feiyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+L">Liangyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yongwei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sirui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+R">Rongxiang Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation 
and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have significantly advanced human language understanding and generation, with pretraining data quality and organization being crucial to their performance. Multi-stage pretraining is a promising approach, but existing methods often lack quantitative criteria for data partitioning and instead rely on intuitive heuristics. In this paper, we propose the novel Four-quadRAnt Multi-stage prEtraining strategy (FRAME), guided by the established principle of organizing the pretraining process into four stages to achieve significant loss reductions four times. This principle is grounded in two key findings: first, training on high Perplexity (PPL) data followed by low PPL data, and second, training on low PPL difference (PD) data followed by high PD data, both causing the loss to drop significantly twice and performance enhancements. By partitioning data into four quadrants and strategically organizing them, FRAME achieves a remarkable 16.8% average improvement over random across MMLU and CMMLU for the 3B model, effectively boosting LLM performance. 
</p> </div> </dd> <dt> <a name='item418'>[418]</a> <a href ="/abs/2502.05670" title="Abstract" id="2502.05670"> arXiv:2502.05670 </a> (replaced) [<a href="/pdf/2502.05670" title="Download PDF" id="pdf-2502.05670" aria-labelledby="pdf-2502.05670">pdf</a>, <a href="https://arxiv.org/html/2502.05670v3" title="View HTML" id="html-2502.05670" aria-labelledby="html-2502.05670" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05670" title="Other formats" id="oth-2502.05670" aria-labelledby="oth-2502.05670">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Models Largely Exhibit Human-like Constituent Ordering Preferences </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tur,+A+D">Ada Defne Tur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamath,+G">Gaurav Kamath</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Reddy,+S">Siva Reddy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Though English sentences are typically inflexible vis-à-vis word order, constituents often show far more variability in ordering. One prominent theory presents the notion that constituent ordering is directly correlated with constituent weight: a measure of the constituent's length or complexity. Such theories are interesting in the context of natural language processing (NLP), because while recent advances in NLP have led to significant gains in the performance of large language models (LLMs), much remains unclear about how these models process language, and how this compares to human language processing. 
In particular, the question remains whether LLMs display the same patterns with constituent movement, and may provide insights into existing theories on when and how the shift occurs in human language. We compare a variety of LLMs with diverse properties to evaluate broad LLM performance on four types of constituent movement: heavy NP shift, particle movement, dative alternation, and multiple PPs. Despite performing unexpectedly around particle movement, LLMs generally align with human preferences around constituent ordering. </p> </div> </dd> <dt> <a name='item419'>[419]</a> <a href ="/abs/2502.05933" title="Abstract" id="2502.05933"> arXiv:2502.05933 </a> (replaced) [<a href="/pdf/2502.05933" title="Download PDF" id="pdf-2502.05933" aria-labelledby="pdf-2502.05933">pdf</a>, <a href="https://arxiv.org/html/2502.05933v2" title="View HTML" id="html-2502.05933" aria-labelledby="html-2502.05933" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05933" title="Other formats" id="oth-2502.05933" aria-labelledby="oth-2502.05933">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Substitute Words with Model-based Score Ranking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hongye Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Henao,+R">Ricardo Henao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NAACL 2025 (main, long) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Smart word substitution aims to enhance sentence quality by improving word choices; however current benchmarks rely on human-labeled data. 
Since word choices are inherently subjective, ground-truth word substitutions generated by a small group of annotators are often incomplete and likely not generalizable. To circumvent this issue, we instead employ a model-based score (BARTScore) to quantify sentence quality, thus forgoing the need for human annotations. Specifically, we use this score to define a distribution for each word substitution, allowing one to test whether a substitution is statistically superior relative to others. In addition, we propose a loss function that directly optimizes the alignment between model predictions and sentence scores, while also enhancing the overall quality score of a substitution. Crucially, model learning no longer requires human labels, thus avoiding the cost of annotation while maintaining the quality of the text modified with substitutions. Experimental results show that the proposed approach outperforms both masked language models (BERT, BART) and large language models (GPT-4, LLaMA). The source code is available at <a href="https://github.com/Hyfred/Substitute-Words-with-Ranking" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item420'>[420]</a> <a href ="/abs/2502.06207" title="Abstract" id="2502.06207"> arXiv:2502.06207 </a> (replaced) [<a href="/pdf/2502.06207" title="Download PDF" id="pdf-2502.06207" aria-labelledby="pdf-2502.06207">pdf</a>, <a href="https://arxiv.org/html/2502.06207v2" title="View HTML" id="html-2502.06207" aria-labelledby="html-2502.06207" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06207" title="Other formats" id="oth-2502.06207" aria-labelledby="oth-2502.06207">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling the Capabilities of Large Language Models in Detecting Offensive Language with Annotation Disagreement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+K">Kai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Kaichun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+K">Kelaiti Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+R+K">Roy Ka-Wei Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Bo Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongfei Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, submitted to the ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have become essential for offensive language detection, yet their ability to handle annotation disagreement remains underexplored. 
Disagreement samples, which arise from subjective interpretations, pose a unique challenge due to their ambiguous nature. Understanding how LLMs process these cases, particularly their confidence levels, can offer insight into their alignment with human annotators. This study systematically evaluates the performance of multiple LLMs in detecting offensive language at varying levels of annotation agreement. We analyze binary classification accuracy, examine the relationship between model confidence and human disagreement, and explore how disagreement samples influence model decision-making during few-shot learning and instruction fine-tuning. Our findings reveal that LLMs struggle with low-agreement samples, often exhibiting overconfidence in these ambiguous cases. However, utilizing disagreement samples in training improves both detection accuracy and model alignment with human judgment. These insights provide a foundation for enhancing LLM-based offensive language detection in real-world moderation tasks. </p> </div> </dd> <dt> <a name='item421'>[421]</a> <a href ="/abs/2502.06600" title="Abstract" id="2502.06600"> arXiv:2502.06600 </a> (replaced) [<a href="/pdf/2502.06600" title="Download PDF" id="pdf-2502.06600" aria-labelledby="pdf-2502.06600">pdf</a>, <a href="https://arxiv.org/html/2502.06600v2" title="View HTML" id="html-2502.06600" aria-labelledby="html-2502.06600" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06600" title="Other formats" id="oth-2502.06600" aria-labelledby="oth-2502.06600">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluation of Multilingual Image Captioning: How far can we get with CLIP models? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gomes,+G">Gonçalo Gomes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zerva,+C">Chrysoula Zerva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+B">Bruno Martins</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted in Findings of NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The evaluation of image captions, looking at both linguistic fluency and semantic correspondence to visual contents, has witnessed a significant effort. Still, despite advancements such as the CLIPScore metric, multilingual captioning evaluation has remained relatively unexplored. This work presents several strategies, and extensive experiments, related to evaluating CLIPScore variants in multilingual settings. To address the lack of multilingual test data, we consider two different strategies: (1) using quality aware machine-translated datasets with human judgements, and (2) re-purposing multilingual datasets that target semantic inference and reasoning. Our results highlight the potential of finetuned multilingual models to generalize across languages and to handle complex linguistic challenges. Tests with machine-translated data show that multilingual CLIPScore models can maintain a high correlation with human judgements across different languages, and additional tests with natively multilingual and multicultural data further attest to the high-quality assessments. 
</p> </div> </dd> <dt> <a name='item422'>[422]</a> <a href ="/abs/2502.06851" title="Abstract" id="2502.06851"> arXiv:2502.06851 </a> (replaced) [<a href="/pdf/2502.06851" title="Download PDF" id="pdf-2502.06851" aria-labelledby="pdf-2502.06851">pdf</a>, <a href="https://arxiv.org/html/2502.06851v2" title="View HTML" id="html-2502.06851" aria-labelledby="html-2502.06851" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06851" title="Other formats" id="oth-2502.06851" aria-labelledby="oth-2502.06851">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Survey on Vision-Language-Action Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adilkhanov,+A">Adilzhan Adilkhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yelenov,+A">Amir Yelenov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seitzhanov,+A">Assylkhan Seitzhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mazhitov,+A">Ayan Mazhitov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdikarimov,+A">Azamat Abdikarimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sandykbayeva,+D">Danissa Sandykbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kenzhebek,+D">Daryn Kenzhebek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukashev,+D">Dinmukhammed Mukashev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Umurbekov,+I">Ilyas Umurbekov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chumakov,+J">Jabrail Chumakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Spanova,+K">Kamila Spanova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Burunchina,+K">Karina Burunchina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yergibay,+M">Madina Yergibay</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Issa,+M">Margulan Issa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zabirova,+M">Moldir Zabirova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuzbay,+N">Nurdaulet Zhuzbay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kabdyshev,+N">Nurlan Kabdyshev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhaniyar,+N">Nurlan Zhaniyar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yermagambet,+R">Rasul Yermagambet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chibar,+R">Rustam Chibar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seitzhan,+S">Saltanat Seitzhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khajikhanov,+S">Soibkhon Khajikhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taunyazov,+T">Tasbolat Taunyazov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galimzhanov,+T">Temirlan Galimzhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaiyrbay,+T">Temirlan Kaiyrbay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mussin,+T">Tleukhan Mussin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Syrymova,+T">Togzhan Syrymova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kostyukova,+V">Valeriya Kostyukova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Massalim,+Y">Yerkebulan Massalim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kassym,+Y">Yermakhan Kassym</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nurbayeva,+Z">Zerde Nurbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kappassov,+Z">Zhanat Kappassov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> This paper presents an AI-generated review of Vision-Language-Action (VLA) models, summarizing key methodologies, findings, and future directions. The content is produced using large language models (LLMs) and is intended only for demonstration purposes. This work does not represent original research, but highlights how AI can help automate literature reviews. As AI-generated content becomes more prevalent, ensuring accuracy, reliability, and proper synthesis remains a challenge. Future research will focus on developing a structured framework for AI-assisted literature reviews, exploring techniques to enhance citation accuracy, source credibility, and contextual understanding. By examining the potential and limitations of LLM in academic writing, this study aims to contribute to the broader discussion of integrating AI into research workflows. This work serves as a preliminary step toward establishing systematic approaches for leveraging AI in literature review generation, making academic knowledge synthesis more efficient and scalable. 
</p> </div> </dd> <dt> <a name='item423'>[423]</a> <a href ="/abs/2502.06855" title="Abstract" id="2502.06855"> arXiv:2502.06855 </a> (replaced) [<a href="/pdf/2502.06855" title="Download PDF" id="pdf-2502.06855" aria-labelledby="pdf-2502.06855">pdf</a>, <a href="/format/2502.06855" title="Other formats" id="oth-2502.06855" aria-labelledby="oth-2502.06855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Supervised Prompt Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+J">Jinyu Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiayi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhaoyang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fengwei Teng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+J">Jinhao Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xinbing Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+S">Sirui Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuyu Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Well-designed prompts are crucial for enhancing Large language models' (LLMs) reasoning capabilities while aligning their outputs with task requirements across diverse domains. However, manually designed prompts require expertise and iterative experimentation. 
While existing prompt optimization methods aim to automate this process, they rely heavily on external references such as ground truth or by humans, limiting their applicability in real-world scenarios where such data is unavailable or costly to obtain. To address this, we propose Self-Supervised Prompt Optimization (SPO), a cost-efficient framework that discovers effective prompts for both closed and open-ended tasks without requiring external reference. Motivated by the observations that prompt quality manifests directly in LLM outputs and LLMs can effectively assess adherence to task requirements, we derive evaluation and optimization signals purely from output comparisons. Specifically, SPO selects superior prompts through pairwise output comparisons evaluated by an LLM evaluator, followed by an LLM optimizer that aligns outputs with task requirements. Extensive experiments demonstrate that SPO outperforms state-of-the-art prompt optimization methods, achieving comparable or superior results with significantly lower costs (e.g., 1.1% to 5.6% of existing methods) and fewer samples (e.g., three samples). 
The code is available at <a href="https://github.com/geekan/MetaGPT/blob/main/examples/spo" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item424'>[424]</a> <a href ="/abs/2502.07316" title="Abstract" id="2502.07316"> arXiv:2502.07316 </a> (replaced) [<a href="/pdf/2502.07316" title="Download PDF" id="pdf-2502.07316" aria-labelledby="pdf-2502.07316">pdf</a>, <a href="https://arxiv.org/html/2502.07316v3" title="View HTML" id="html-2502.07316" aria-labelledby="html-2502.07316" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07316" title="Other formats" id="oth-2502.07316" aria-labelledby="oth-2502.07316">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CodeI/O: Condensing Reasoning Patterns via Code Input-Output Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Junlong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+D">Daya Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+D">Dejian Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Runxin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Junxian He</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Reasoning is a fundamental capability of Large Language Models. While prior research predominantly focuses on enhancing narrow skills like math or code generation, improving performance on many other reasoning tasks remains challenging due to sparse and fragmented training data. 
To address this issue, we propose CodeI/O, a novel approach that systematically condenses diverse reasoning patterns inherently embedded in contextually-grounded codes, through transforming the original code into a code input-output prediction format. By training models to predict inputs/outputs given code and test cases entirely in natural language as Chain-of-Thought (CoT) rationales, we expose them to universal reasoning primitives -- like logic flow planning, state-space searching, decision tree traversal, and modular decomposition -- while decoupling structured reasoning from code-specific syntax and preserving procedural rigor. Experimental results demonstrate CodeI/O leads to consistent improvements across symbolic, scientific, logic, math & numerical, and commonsense reasoning tasks. By matching the existing ground-truth outputs or re-executing the code with predicted inputs, we can verify each prediction and further enhance the CoTs through multi-turn revision, resulting in CodeI/O++ and achieving higher performance. Our data and models are available at <a href="https://github.com/hkust-nlp/CodeIO" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item425'>[425]</a> <a href ="/abs/2502.07322" title="Abstract" id="2502.07322"> arXiv:2502.07322 </a> (replaced) [<a href="/pdf/2502.07322" title="Download PDF" id="pdf-2502.07322" aria-labelledby="pdf-2502.07322">pdf</a>, <a href="https://arxiv.org/html/2502.07322v2" title="View HTML" id="html-2502.07322" aria-labelledby="html-2502.07322" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07322" title="Other formats" id="oth-2502.07322" aria-labelledby="oth-2502.07322">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MEMIT-Merge: Addressing MEMIT's Key-Value Conflicts in Same-Subject Batch Editing for LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Z">Zilu Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiangqing Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+R">Rui Xia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> As large language models continue to scale up, knowledge editing techniques that modify models' internal knowledge without full retraining have gained significant attention. MEMIT, a prominent batch editing algorithm, stands out for its capability to perform mass knowledge modifications. However, we uncover a critical limitation that MEMIT's editing efficacy significantly deteriorates when processing batches containing multiple edits sharing the same subject. 
Our analysis reveals that the root cause lies in MEMIT's key value modeling framework: When multiple facts with the same subject in a batch are modeled through MEMIT's key value mechanism, identical keys (derived from the shared subject) are forced to represent different values (corresponding to different knowledge), resulting in updates conflicts during editing. Addressing this issue, we propose MEMIT-Merge, an enhanced approach that merges value computation processes for facts sharing the same subject, effectively resolving the performance degradation in same-subject batch editing scenarios. Experimental results demonstrate that when MEMIT's edit success rate drops to around 50% at larger batch sizes, MEMIT-Merge maintains a success rate exceeding 90%, showcasing remarkable robustness to subject entity collisions. </p> </div> </dd> <dt> <a name='item426'>[426]</a> <a href ="/abs/2502.07340" title="Abstract" id="2502.07340"> arXiv:2502.07340 </a> (replaced) [<a href="/pdf/2502.07340" title="Download PDF" id="pdf-2502.07340" aria-labelledby="pdf-2502.07340">pdf</a>, <a href="https://arxiv.org/html/2502.07340v2" title="View HTML" id="html-2502.07340" aria-labelledby="html-2502.07340" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07340" title="Other formats" id="oth-2502.07340" aria-labelledby="oth-2502.07340">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aligning Large Language Models to Follow Instructions and Hallucinate Less via Effective Data Filtering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Si,+S">Shuzheng Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haozhe Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Gang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Cheng Gao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yuzhuo Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhitong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kaikai An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+C">Chen Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+F">Fanchao Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+B">Baobao Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Training LLMs on data containing unfamiliar knowledge during the instruction tuning stage can encourage hallucinations. To address this challenge, we introduce NOVA, a novel framework designed to identify high-quality data that aligns well with the LLM's learned knowledge to reduce hallucinations. NOVA includes Internal Consistency Probing (ICP) and Semantic Equivalence Identification (SEI) to measure how familiar the LLM is with instruction data. Specifically, ICP evaluates the LLM's understanding of the given instruction by calculating the tailored consistency among multiple self-generated responses. SEI further assesses the familiarity of the LLM with the target response by comparing it to the generated responses, using the proposed semantic clustering and well-designed voting strategy. Finally, to ensure the quality of selected samples, we introduce an expert-aligned reward model, considering characteristics beyond just familiarity. 
By considering data quality and avoiding unfamiliar data, we can utilize the selected data to effectively align LLMs to follow instructions and hallucinate less. </p> </div> </dd> <dt> <a name='item427'>[427]</a> <a href ="/abs/2502.07424" title="Abstract" id="2502.07424"> arXiv:2502.07424 </a> (replaced) [<a href="/pdf/2502.07424" title="Download PDF" id="pdf-2502.07424" aria-labelledby="pdf-2502.07424">pdf</a>, <a href="https://arxiv.org/html/2502.07424v2" title="View HTML" id="html-2502.07424" aria-labelledby="html-2502.07424" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07424" title="Other formats" id="oth-2502.07424" aria-labelledby="oth-2502.07424">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RomanLens: The Role Of Latent Romanization In Multilinguality In LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saji,+A">Alan Saji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Husain,+J+A">Jaavid Aktar Husain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jayakumar,+T">Thanmay Jayakumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dabre,+R">Raj Dabre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kunchukuttan,+A">Anoop Kunchukuttan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Puduppully,+R">Ratish Puduppully</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 19 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit remarkable multilingual generalization despite being predominantly trained on English-centric corpora. 
A fundamental question arises: how do LLMs achieve such robust multilingual capabilities? Taking the case of non-Roman script languages, we investigate the role of Romanization - the representation of non-Roman scripts using Roman characters - as a bridge in multilingual processing. Using mechanistic interpretability techniques, we analyze next-token generation and find that intermediate layers frequently represent target words in Romanized form before transitioning to native script, a phenomenon we term Latent Romanization. Further, through activation patching experiments, we demonstrate that LLMs encode semantic concepts similarly across native and Romanized scripts, suggesting a shared underlying representation. Additionally, for translation into non-Roman script languages, our findings reveal that when the target language is in Romanized form, its representations emerge earlier in the model's layers compared to native script. These insights contribute to a deeper understanding of multilingual representation in LLMs and highlight the implicit role of Romanization in facilitating language transfer. 
</p> </div> </dd> <dt> <a name='item428'>[428]</a> <a href ="/abs/2502.08045" title="Abstract" id="2502.08045"> arXiv:2502.08045 </a> (replaced) [<a href="/pdf/2502.08045" title="Download PDF" id="pdf-2502.08045" aria-labelledby="pdf-2502.08045">pdf</a>, <a href="https://arxiv.org/html/2502.08045v2" title="View HTML" id="html-2502.08045" aria-labelledby="html-2502.08045" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08045" title="Other formats" id="oth-2502.08045" aria-labelledby="oth-2502.08045">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Break the Checkbox: Challenging Closed-Style Evaluations of Cultural Alignment in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kabir,+M">Mohsinul Kabir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abrar,+A">Ajwad Abrar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ananiadou,+S">Sophia Ananiadou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY) </div> <p class='mathjax'> A large number of studies rely on closed-style multiple-choice surveys to evaluate cultural alignment in Large Language Models (LLMs). In this work, we challenge this constrained evaluation paradigm and explore more realistic, unconstrained approaches. Using the World Values Survey (WVS) and Hofstede Cultural Dimensions as case studies, we demonstrate that LLMs exhibit stronger cultural alignment in less constrained settings, where responses are not forced. Additionally, we show that even minor changes, such as reordering survey choices, lead to inconsistent outputs, exposing the limitations of closed-style evaluations. 
Our findings advocate for more robust and flexible evaluation frameworks that focus on specific cultural proxies, encouraging more nuanced and accurate assessments of cultural alignment in LLMs. </p> </div> </dd> <dt> <a name='item429'>[429]</a> <a href ="/abs/2502.08168" title="Abstract" id="2502.08168"> arXiv:2502.08168 </a> (replaced) [<a href="/pdf/2502.08168" title="Download PDF" id="pdf-2502.08168" aria-labelledby="pdf-2502.08168">pdf</a>, <a href="https://arxiv.org/html/2502.08168v3" title="View HTML" id="html-2502.08168" aria-labelledby="html-2502.08168" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08168" title="Other formats" id="oth-2502.08168" aria-labelledby="oth-2502.08168">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SARChat-Bench-2M: A Multi-Task Vision-Language Benchmark for SAR Image Interpretation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zhiming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xiayang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+S">Sihao Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Peidong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">HaiPeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+Q">Qingyun Pan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As a powerful all-weather Earth observation tool, synthetic aperture radar (SAR) remote sensing enables critical military reconnaissance, maritime surveillance, and infrastructure monitoring. 
Although Vision language models (VLMs) have made remarkable progress in natural language processing and image understanding, their applications remain limited in professional domains due to insufficient domain expertise. This paper innovatively proposes the first large-scale multimodal dialogue dataset for SAR images, named SARChat-2M, which contains approximately 2 million high-quality image-text pairs and encompasses diverse scenarios with detailed target annotations. This dataset not only supports several key tasks such as visual understanding and object detection tasks, but also has unique innovative aspects: this study develops a visual-language dataset and benchmark for the SAR domain, enabling and evaluating VLMs' capabilities in SAR image interpretation, which provides a paradigmatic framework for constructing multimodal datasets across various remote sensing vertical domains. Through experiments on 16 mainstream VLMs, the effectiveness of the dataset has been fully verified. The project will be released at <a href="https://github.com/JimmyMa99/SARChat" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item430'>[430]</a> <a href ="/abs/2502.08279" title="Abstract" id="2502.08279"> arXiv:2502.08279 </a> (replaced) [<a href="/pdf/2502.08279" title="Download PDF" id="pdf-2502.08279" aria-labelledby="pdf-2502.08279">pdf</a>, <a href="https://arxiv.org/html/2502.08279v2" title="View HTML" id="html-2502.08279" aria-labelledby="html-2502.08279" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08279" title="Other formats" id="oth-2502.08279" aria-labelledby="oth-2502.08279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> What Is That Talk About? 
A Video-to-Text Summarization Dataset for Scientific Presentations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+D">Dongqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Whitehouse,+C">Chenxi Whitehouse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+X">Xi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahon,+L">Louis Mahon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saxena,+R">Rohit Saxena</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Z">Zheng Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Y">Yifu Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lapata,+M">Mirella Lapata</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Demberg,+V">Vera Demberg</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Transforming recorded videos into concise and accurate textual summaries is a growing challenge in multimodal learning. This paper introduces VISTA, a dataset specifically designed for video-to-text summarization in scientific domains. VISTA contains 18,599 recorded AI conference presentations paired with their corresponding paper abstracts. We benchmark the performance of state-of-the-art large models and apply a plan-based framework to better capture the structured nature of abstracts. Both human and automated evaluations confirm that explicit planning enhances summary quality and factual consistency. However, a considerable gap remains between models and human performance, highlighting the challenges of scientific video summarization. 
</p> </div> </dd> <dt> <a name='item431'>[431]</a> <a href ="/abs/2502.08356" title="Abstract" id="2502.08356"> arXiv:2502.08356 </a> (replaced) [<a href="/pdf/2502.08356" title="Download PDF" id="pdf-2502.08356" aria-labelledby="pdf-2502.08356">pdf</a>, <a href="/format/2502.08356" title="Other formats" id="oth-2502.08356" aria-labelledby="oth-2502.08356">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Systematic Knowledge Injection into Large Language Models via Diverse Augmentation for Domain-Specific RAG </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhushan,+K">Kushagra Bhushan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nandwani,+Y">Yatin Nandwani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khandelwal,+D">Dinesh Khandelwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+S">Sonam Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pandey,+G">Gaurav Pandey</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raghu,+D">Dinesh Raghu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joshi,+S">Sachindra Joshi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 14 tables, to be published in NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) has emerged as a prominent method for incorporating domain knowledge into Large Language Models (LLMs). While RAG enhances response relevance by incorporating retrieved domain knowledge in the context, retrieval errors can still lead to hallucinations and incorrect answers. 
To recover from retriever failures, domain knowledge is injected by fine-tuning the model to generate the correct response, even in the case of retrieval errors. However, we observe that without systematic knowledge augmentation, fine-tuned LLMs may memorize new information but still fail to extract relevant domain knowledge, leading to poor performance. In this work, we present a novel framework that significantly enhances the fine-tuning process by augmenting the training data in two ways -- context augmentation and knowledge paraphrasing. In context augmentation, we create multiple training samples for a given QA pair by varying the relevance of the retrieved information, teaching the model when to ignore and when to rely on retrieved content. In knowledge paraphrasing, we fine-tune with multiple answers to the same question, enabling LLMs to better internalize specialized knowledge. To mitigate catastrophic forgetting due to fine-tuning, we add a domain-specific identifier to a question and also utilize a replay buffer containing general QA pairs. Experimental results demonstrate the efficacy of our method over existing techniques, achieving up to 10\% relative gain in token-level recall while preserving the LLM's generalization capabilities. 
</p> </div> </dd> <dt> <a name='item432'>[432]</a> <a href ="/abs/2502.08561" title="Abstract" id="2502.08561"> arXiv:2502.08561 </a> (replaced) [<a href="/pdf/2502.08561" title="Download PDF" id="pdf-2502.08561" aria-labelledby="pdf-2502.08561">pdf</a>, <a href="https://arxiv.org/html/2502.08561v2" title="View HTML" id="html-2502.08561" aria-labelledby="html-2502.08561" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08561" title="Other formats" id="oth-2502.08561" aria-labelledby="oth-2502.08561">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quality-Aware Decoding: Unifying Quality Estimation and Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koneru,+S">Sai Koneru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huck,+M">Matthias Huck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Exel,+M">Miriam Exel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niehues,+J">Jan Niehues</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Quality Estimation (QE) models for Neural Machine Translation (NMT) predict the quality of the hypothesis without having access to the reference. An emerging research direction in NMT involves the use of QE models, which have demonstrated high correlations with human judgment and can enhance translations through Quality-Aware Decoding. Although several approaches have been proposed based on sampling multiple candidate translations and picking the best candidate, none have integrated these models directly into the decoding process. 
In this paper, we address this by proposing a novel token-level QE model capable of reliably scoring partial translations. We build a uni-directional QE model for this, as decoder models are inherently trained and efficient on partial sequences. We then present a decoding strategy that integrates the QE model for Quality-Aware decoding and demonstrate that the translation quality improves when compared to the N-best list re-ranking with state-of-the-art QE models (up to $1.39$ XCOMET-XXL $\uparrow$). Finally, we show that our approach provides significant benefits in document translation tasks, where the quality of N-best lists is typically suboptimal. Code can be found at <a href="https://ai4lt.iar.kit.edu/english/projects" rel="external noopener nofollow" class="link-external link-https">this https URL</a>\<a href="http://_kontextmt.php" rel="external noopener nofollow" class="link-external link-http">this http URL</a> </p> </div> </dd> <dt> <a name='item433'>[433]</a> <a href ="/abs/2502.08661" title="Abstract" id="2502.08661"> arXiv:2502.08661 </a> (replaced) [<a href="/pdf/2502.08661" title="Download PDF" id="pdf-2502.08661" aria-labelledby="pdf-2502.08661">pdf</a>, <a href="https://arxiv.org/html/2502.08661v2" title="View HTML" id="html-2502.08661" aria-labelledby="html-2502.08661" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08661" title="Other formats" id="oth-2502.08661" aria-labelledby="oth-2502.08661">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Few-shot LLM Synthetic Data with Distribution Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+J">Jiyuan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Z">Zhaocheng Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zhihao Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+Q">Qinglin 
Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+S">Sunhao Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Z">Zhenhua Dong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 5 figures, accepted at www 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As large language models (LLMs) advance, their ability to perform in-context learning and few-shot language generation has improved significantly. This has spurred using LLMs to produce high-quality synthetic data to enhance the performance of smaller models like online retrievers or weak LLMs. However, LLM-generated synthetic data often differs from the real data in key language attributes (e.g., styles, tones, content proportions, etc.). As a result, mixing these synthetic data directly with real data may distort the original data distribution, potentially hindering performance improvements. To solve this, we introduce SynAlign: a synthetic data generation and filtering framework based on key attribute distribution matching. Before generation, SynAlign employs an uncertainty tracker surrogated by the Gaussian Process model to iteratively select data clusters distinct from selected ones as demonstrations for new data synthesis, facilitating efficient exploration of the diversity of the real data. Then, a latent attribute reasoning method is employed: the LLM summarizes linguistic attributes of demonstrations and then synthesizes new data based on them. 
This approach facilitates synthesizing diverse data with linguistic attributes that appear in real <a href="http://data.After" rel="external noopener nofollow" class="link-external link-http">this http URL</a> generation, the Maximum Mean Discrepancy is used as the objective function to learn the sampling weight of each synthetic data, ensuring distribution matching with the real data. Our experiments on multiple text prediction tasks show significant performance improvements. We also conducted an online A/B test on an online retriever to demonstrate SynAlign's effectiveness. </p> </div> </dd> <dt> <a name='item434'>[434]</a> <a href ="/abs/2502.09056" title="Abstract" id="2502.09056"> arXiv:2502.09056 </a> (replaced) [<a href="/pdf/2502.09056" title="Download PDF" id="pdf-2502.09056" aria-labelledby="pdf-2502.09056">pdf</a>, <a href="https://arxiv.org/html/2502.09056v2" title="View HTML" id="html-2502.09056" aria-labelledby="html-2502.09056" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09056" title="Other formats" id="oth-2502.09056" aria-labelledby="oth-2502.09056">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adapting Language-Specific LLMs to a Reasoning Model in One Day via Model Merging - An Open Recipe </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pipatanakul,+K">Kunat Pipatanakul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taveekitworachai,+P">Pittawat Taveekitworachai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Manakul,+P">Potsawee Manakul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tharnpipitchai,+K">Kasima Tharnpipitchai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language 
(cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper investigates data selection and model merging methodologies aimed at incorporating advanced reasoning capabilities such as those of DeepSeek R1 into language-specific large language models (LLMs), with a particular focus on the Thai LLM. Our goal is to enhance the reasoning capabilities of language-specific LLMs while maintaining their target language abilities. DeepSeek R1 excels in reasoning but primarily benefits high-resource languages such as English and Chinese. However, low-resource languages remain underserved due to the dominance of English-centric training data and model optimizations, which limit performance in these languages. This limitation results in unreliable code-switching and diminished effectiveness on tasks in low-resource languages. Meanwhile, local and regional LLM initiatives have attempted to bridge this gap by developing language-specific LLMs that focus on improving local linguistic fidelity. We demonstrate that, with only publicly available datasets and a computational budget of $120, it is possible to enhance the reasoning capabilities of language-specific LLMs to match the level of DeepSeek R1, without compromising their performance on target language tasks. 
</p> </div> </dd> <dt> <a name='item435'>[435]</a> <a href ="/abs/2502.09120" title="Abstract" id="2502.09120"> arXiv:2502.09120 </a> (replaced) [<a href="/pdf/2502.09120" title="Download PDF" id="pdf-2502.09120" aria-labelledby="pdf-2502.09120">pdf</a>, <a href="https://arxiv.org/html/2502.09120v2" title="View HTML" id="html-2502.09120" aria-labelledby="html-2502.09120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09120" title="Other formats" id="oth-2502.09120" aria-labelledby="oth-2502.09120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The influence of visual and linguistic cues on ignorance inference in Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+Y">Ye-eun Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Maeng,+Y">Yunho Maeng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 3 figures, 3 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study explored how Vision-Language Models (VLMs) process ignorance implicatures with visual and linguistic cues. Particularly, we focused on the effects of contexts (precise and approximate contexts) and modifier types (bare numerals, superlative, and comparative modifiers), which were considered pragmatic and semantic factors respectively. Methodologically, we conducted a truth-value judgment task in visually grounded settings using GPT-4o and Gemini 1.5 Pro. The results indicate that while both models exhibited sensitivity to linguistic cues (modifier), they failed to process ignorance implicatures with visual cues (context) as humans do. 
Specifically, the influence of context was weaker and inconsistent across models, indicating challenges in pragmatic reasoning for VLMs. On the other hand, superlative modifiers were more strongly associated with ignorance implicatures as compared to comparative modifiers, supporting the semantic view. These findings highlight the need for further advancements in VLMs to process language-vision information in a context-dependent way to achieve human-like pragmatic inference. </p> </div> </dd> <dt> <a name='item436'>[436]</a> <a href ="/abs/2502.09566" title="Abstract" id="2502.09566"> arXiv:2502.09566 </a> (replaced) [<a href="/pdf/2502.09566" title="Download PDF" id="pdf-2502.09566" aria-labelledby="pdf-2502.09566">pdf</a>, <a href="https://arxiv.org/html/2502.09566v2" title="View HTML" id="html-2502.09566" aria-labelledby="html-2502.09566" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09566" title="Other formats" id="oth-2502.09566" aria-labelledby="oth-2502.09566">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Zero-shot generation of synthetic neurosurgical data with large language models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Barr,+A+A">Austin A. 
Barr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+E">Eddie Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sezgin,+E">Emre Sezgin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 4 figures, 4 tables (updated version, fixed typos and formatting) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Clinical data is fundamental to advance neurosurgical research, but access is often constrained by data availability, small sample sizes, privacy regulations, and resource-intensive preprocessing and de-identification procedures. Synthetic data offers a potential solution to challenges associated with accessing and using real-world data (RWD). This study aims to evaluate the capability of zero-shot generation of synthetic neurosurgical data with a large language model (LLM), GPT-4o, by benchmarking with the conditional tabular generative adversarial network (CTGAN). Synthetic datasets were compared to real-world neurosurgical data to assess fidelity (means, proportions, distributions, and bivariate correlations), utility (ML classifier performance on RWD), and privacy (duplication of records from RWD). The GPT-4o-generated datasets matched or exceeded CTGAN performance, despite no fine-tuning or access to RWD for pre-training. Datasets demonstrated high univariate and bivariate fidelity to RWD without directly exposing any real patient records, even at amplified sample size. Training an ML classifier on GPT-4o-generated data and testing on RWD for a binary prediction task showed an F1 score (0.706) with comparable performance to training on the CTGAN data (0.705) for predicting postoperative functional status deterioration. GPT-4o demonstrated a promising ability to generate high-fidelity synthetic neurosurgical data. 
These findings also indicate that data synthesized with GPT-4o can effectively augment clinical data with small sample sizes, and train ML models for prediction of neurosurgical outcomes. Further investigation is necessary to improve the preservation of distributional characteristics and boost classifier performance. </p> </div> </dd> <dt> <a name='item437'>[437]</a> <a href ="/abs/2502.09589" title="Abstract" id="2502.09589"> arXiv:2502.09589 </a> (replaced) [<a href="/pdf/2502.09589" title="Download PDF" id="pdf-2502.09589" aria-labelledby="pdf-2502.09589">pdf</a>, <a href="https://arxiv.org/html/2502.09589v2" title="View HTML" id="html-2502.09589" aria-labelledby="html-2502.09589" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09589" title="Other formats" id="oth-2502.09589" aria-labelledby="oth-2502.09589">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Logical forms complement probability in understanding language model (and human) performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yixuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+F">Freda Shi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Logic in Computer Science (cs.LO) </div> <p class='mathjax'> With the increasing interest in using large language models (LLMs) for planning in natural language, understanding their behaviors becomes an important research question. This work conducts a systematic investigation of LLMs' ability to perform logical reasoning in natural language. 
We introduce a controlled dataset of hypothetical and disjunctive syllogisms in propositional and modal logic and use it as the testbed for understanding LLM performance. Our results lead to novel insights in predicting LLM behaviors: in addition to the probability of input (Gonen et al., 2023; McCoy et al., 2024), logical forms should be considered as important factors. In addition, we show similarities and discrepancies between the logical reasoning performances of humans and LLMs by collecting and comparing behavioral data from both. </p> </div> </dd> <dt> <a name='item438'>[438]</a> <a href ="/abs/2502.09606" title="Abstract" id="2502.09606"> arXiv:2502.09606 </a> (replaced) [<a href="/pdf/2502.09606" title="Download PDF" id="pdf-2502.09606" aria-labelledby="pdf-2502.09606">pdf</a>, <a href="https://arxiv.org/html/2502.09606v2" title="View HTML" id="html-2502.09606" aria-labelledby="html-2502.09606" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09606" title="Other formats" id="oth-2502.09606" aria-labelledby="oth-2502.09606">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Human-LLM Coevolution: Evidence from Academic Writing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Geng,+M">Mingmeng Geng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Trotta,+R">Roberto Trotta</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Digital Libraries (cs.DL); Machine Learning (cs.LG) </div> <p class='mathjax'> With a statistical analysis of arXiv paper abstracts, we report a marked drop in the frequency of several words previously identified as overused by ChatGPT, such as "delve", starting soon after they were pointed out in early 2024. 
The frequency of certain other words favored by ChatGPT, such as "significant", has instead kept increasing. These phenomena suggest that some authors of academic papers have adapted their use of large language models (LLMs), for example, by selecting outputs or applying modifications to the LLM-generated content. Such coevolution and cooperation of humans and LLMs thus introduce additional challenges to the detection of machine-generated text in real-world scenarios. Estimating the impact of LLMs on academic writing by examining word frequency remains feasible, and more attention should be paid to words that were already frequently employed, including those that have decreased in frequency due to LLMs' disfavor. </p> </div> </dd> <dt> <a name='item439'>[439]</a> <a href ="/abs/2502.10051" title="Abstract" id="2502.10051"> arXiv:2502.10051 </a> (replaced) [<a href="/pdf/2502.10051" title="Download PDF" id="pdf-2502.10051" aria-labelledby="pdf-2502.10051">pdf</a>, <a href="https://arxiv.org/html/2502.10051v2" title="View HTML" id="html-2502.10051" aria-labelledby="html-2502.10051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10051" title="Other formats" id="oth-2502.10051" aria-labelledby="oth-2502.10051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ORI: O Routing Intelligence </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shadid,+A">Ahmad Shadid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+R">Rahul Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mayank,+M">Mohit Mayank</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Single large language models 
(LLMs) often fall short when faced with the ever-growing range of tasks, making a single-model approach insufficient. We address this challenge by proposing ORI (O Routing Intelligence), a dynamic framework that leverages a set of LLMs. By intelligently routing incoming queries to the most suitable model, ORI not only improves task-specific accuracy, but also maintains efficiency. Comprehensive evaluations across diverse benchmarks demonstrate consistent accuracy gains while controlling computational overhead. By intelligently routing queries, ORI outperforms the strongest individual models by up to 2.7 points on MMLU and 1.8 points on MuSR, ties the top performance on ARC, and on BBH. These results underscore the benefits of a multi-model strategy and demonstrate how ORI's adaptive architecture can more effectively handle diverse tasks, offering a scalable, high-performance solution for a system of multiple large language models. </p> </div> </dd> <dt> <a name='item440'>[440]</a> <a href ="/abs/2310.14483" title="Abstract" id="2310.14483"> arXiv:2310.14483 </a> (replaced) [<a href="/pdf/2310.14483" title="Download PDF" id="pdf-2310.14483" aria-labelledby="pdf-2310.14483">pdf</a>, <a href="https://arxiv.org/html/2310.14483v4" title="View HTML" id="html-2310.14483" aria-labelledby="html-2310.14483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.14483" title="Other formats" id="oth-2310.14483" aria-labelledby="oth-2310.14483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Chain-of-Factors Paper-Reviewer Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yanzhen Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+S">SeongKu Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi 
Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+B">Bowen Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiawei Han</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages; Accepted to WWW 2025 (Code: <a href="https://github.com/yuzhimanhua/CoF" rel="external noopener nofollow" class="link-external link-https">this https URL</a>) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Digital Libraries (cs.DL); Machine Learning (cs.LG) </div> <p class='mathjax'> With the rapid increase in paper submissions to academic conferences, the need for automated and accurate paper-reviewer matching is more critical than ever. Previous efforts in this area have considered various factors to assess the relevance of a reviewer's expertise to a paper, such as the semantic similarity, shared topics, and citation connections between the paper and the reviewer's previous works. However, most of these studies focus on only one factor, resulting in an incomplete evaluation of the paper-reviewer relevance. To address this issue, we propose a unified model for paper-reviewer matching that jointly considers semantic, topic, and citation factors. To be specific, during training, we instruction-tune a contextualized language model shared across all factors to capture their commonalities and characteristics; during inference, we chain the three factors to enable step-by-step, coarse-to-fine search for qualified reviewers given a submission. 
Experiments on four datasets (one of which is newly contributed by us) spanning various fields such as machine learning, computer vision, information retrieval, and data mining consistently demonstrate the effectiveness of our proposed Chain-of-Factors model in comparison with state-of-the-art paper-reviewer matching methods and scientific pre-trained language models. </p> </div> </dd> <dt> <a name='item441'>[441]</a> <a href ="/abs/2403.01643" title="Abstract" id="2403.01643"> arXiv:2403.01643 </a> (replaced) [<a href="/pdf/2403.01643" title="Download PDF" id="pdf-2403.01643" aria-labelledby="pdf-2403.01643">pdf</a>, <a href="https://arxiv.org/html/2403.01643v3" title="View HTML" id="html-2403.01643" aria-labelledby="html-2403.01643" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.01643" title="Other formats" id="oth-2403.01643" aria-labelledby="oth-2403.01643">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cost-Effective Attention Mechanisms for Low Resource Settings: Necessity & Sufficiency of Linear Transformations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseini,+P">Peyman Hosseini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseini,+M">Mehran Hosseini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Castro,+I">Ignacio Castro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Purver,+M">Matthew Purver</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> From natural language processing to vision, Scaled Dot Product Attention (SDPA) is the backbone of most modern deep learning applications. 
Unfortunately, its memory and computational requirements can be prohibitive in low-resource settings. In this paper, we improve its efficiency without sacrificing its versatility. We propose three attention variants where we remove consecutive linear transformations or add a novel one, and evaluate them on a range of standard NLP and vision tasks. Our proposed models are substantially lighter than standard SDPA (and have 25-50% fewer parameters). We show that the performance cost of these changes is negligible relative to size reduction and that in one case (Super Attention) we succeed in outperforming SDPA by up to 10% while improving its speed and reducing its parameters by 25%. </p> </div> </dd> <dt> <a name='item442'>[442]</a> <a href ="/abs/2405.13144" title="Abstract" id="2405.13144"> arXiv:2405.13144 </a> (replaced) [<a href="/pdf/2405.13144" title="Download PDF" id="pdf-2405.13144" aria-labelledby="pdf-2405.13144">pdf</a>, <a href="https://arxiv.org/html/2405.13144v3" title="View HTML" id="html-2405.13144" aria-labelledby="html-2405.13144" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.13144" title="Other formats" id="oth-2405.13144" aria-labelledby="oth-2405.13144">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs for Mathematical Modeling: Towards Bridging the Gap between Natural and Mathematical Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xuhan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Q">Qingning Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Findings of 
NAACL2025. Project: <a href="https://github.com/FreedomIntelligence/Mamo" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated strong performance across various natural language processing tasks, yet their proficiency in mathematical reasoning remains a key challenge. Addressing the gap between natural and mathematical language requires advanced reasoning capabilities, approaching those of Artificial General Intelligence (AGI). However, the evaluation remains challenging, as perfectly representing reality is inherently elusive, and traditional methods like manual or direct comparison of mathematical statements (Ramamonjison et al., 2023) are insufficient for assessing true modeling ability. We propose a process-oriented framework to evaluate LLMs' ability to construct mathematical models, using solvers to compare outputs with ground truth. Introducing Mamo, a benchmark with 1,209 questions covering ordinary differential equations, linear programming, and mixed-integer linear programming, we enable automatic evaluation of modeling accuracy. The results show that existing LLMs struggle with complex mathematical modeling tasks, with larger models demonstrating superior performance, while open-source models remain competitive in simpler cases but still fall short of proprietary models in more challenging problems. 
</p> </div> </dd> <dt> <a name='item443'>[443]</a> <a href ="/abs/2406.04116" title="Abstract" id="2406.04116"> arXiv:2406.04116 </a> (replaced) [<a href="/pdf/2406.04116" title="Download PDF" id="pdf-2406.04116" aria-labelledby="pdf-2406.04116">pdf</a>, <a href="/format/2406.04116" title="Other formats" id="oth-2406.04116" aria-labelledby="oth-2406.04116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Promoting the Responsible Development of Speech Datasets for Mental Health and Neurological Disorders Research </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mancini,+E">Eleonora Mancini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tanevska,+A">Ana Tanevska</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galassi,+A">Andrea Galassi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galatolo,+A">Alessio Galatolo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruggeri,+F">Federico Ruggeri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Torroni,+P">Paolo Torroni</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 36 pages </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Journal of Artificial Intelligence Research (JAIR), vol 82 (2025), pp 937-972 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Current research in machine learning and artificial intelligence is largely centered on modeling and performance evaluation, less so on data collection. However, recent research demonstrated that limitations and biases in data may negatively impact trustworthiness and reliability. 
These aspects are particularly impactful on sensitive domains such as mental health and neurological disorders, where speech data are used to develop AI applications for patients and healthcare providers. In this paper, we chart the landscape of available speech datasets for this domain, to highlight possible pitfalls and opportunities for improvement and promote fairness and diversity. We present a comprehensive list of desiderata for building speech datasets for mental health and neurological disorders and distill it into an actionable checklist focused on ethical concerns to foster more responsible research. </p> </div> </dd> <dt> <a name='item444'>[444]</a> <a href ="/abs/2406.11087" title="Abstract" id="2406.11087"> arXiv:2406.11087 </a> (replaced) [<a href="/pdf/2406.11087" title="Download PDF" id="pdf-2406.11087" aria-labelledby="pdf-2406.11087">pdf</a>, <a href="https://arxiv.org/html/2406.11087v4" title="View HTML" id="html-2406.11087" aria-labelledby="html-2406.11087" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11087" title="Other formats" id="oth-2406.11087" aria-labelledby="oth-2406.11087">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yanming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+X">Xinyue Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuwei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ke,+X">Xiaolan Ke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Songhang Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jiannan Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chen Ma</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+M">Mengchen Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+S">Sheng Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+J">Jianwei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+T">Tianyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuhong Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Camera Ready version of AAAI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models have repeatedly shown outstanding performance across diverse applications. However, deploying these models can inadvertently risk user privacy. The significant memory demands during training pose a major challenge in terms of resource consumption. This substantial size places a heavy load on memory resources, raising considerable practical concerns. In this paper, we introduce DP-MemArc, a novel training framework aimed at reducing the memory costs of large language models while emphasizing the protection of user data privacy. DP-MemArc incorporates side network or reversible network designs to support a variety of differential privacy memory-efficient fine-tuning schemes. Our approach not only achieves memory optimization but also ensures robust privacy protection, keeping user data secure and confidential. Extensive experiments have demonstrated that DP-MemArc effectively provides differential privacy-efficient fine-tuning across different task scenarios. 
</p> </div> </dd> <dt> <a name='item445'>[445]</a> <a href ="/abs/2406.11427" title="Abstract" id="2406.11427"> arXiv:2406.11427 </a> (replaced) [<a href="/pdf/2406.11427" title="Download PDF" id="pdf-2406.11427" aria-labelledby="pdf-2406.11427">pdf</a>, <a href="https://arxiv.org/html/2406.11427v2" title="View HTML" id="html-2406.11427" aria-labelledby="html-2406.11427" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11427" title="Other formats" id="oth-2406.11427" aria-labelledby="oth-2406.11427">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DiTTo-TTS: Diffusion Transformers for Scalable Text-to-Speech without Domain-Specific Factors </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Lee,+K">Keon Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+D+W">Dong Won Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Jaehyeon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chung,+S">Seungjun Chung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cho,+J">Jaewoong Cho</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> <p class='mathjax'> Large-scale latent diffusion models (LDMs) excel in content generation across various modalities, but their reliance on phonemes and durations in text-to-speech (TTS) limits scalability and access from other fields. While recent studies show potential in removing these domain-specific factors, performance remains suboptimal. 
In this work, we introduce DiTTo-TTS, a Diffusion Transformer (DiT)-based TTS model, to investigate whether LDM-based TTS can achieve state-of-the-art performance without domain-specific factors. Through rigorous analysis and empirical exploration, we find that (1) DiT with minimal modifications outperforms U-Net, (2) variable-length modeling with a speech length predictor significantly improves results over fixed-length approaches, and (3) conditions like semantic alignment in speech latent representations are key to further enhancement. By scaling our training data to 82K hours and the model size to 790M parameters, we achieve superior or comparable zero-shot performance to state-of-the-art TTS models in naturalness, intelligibility, and speaker similarity, all without relying on domain-specific factors. Speech samples are available at <a href="https://ditto-tts.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item446'>[446]</a> <a href ="/abs/2406.15279" title="Abstract" id="2406.15279"> arXiv:2406.15279 </a> (replaced) [<a href="/pdf/2406.15279" title="Download PDF" id="pdf-2406.15279" aria-labelledby="pdf-2406.15279">pdf</a>, <a href="/format/2406.15279" title="Other formats" id="oth-2406.15279" aria-labelledby="oth-2406.15279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safe Inputs but Unsafe Output: Benchmarking Cross-modality Safety Alignment of Large Vision-Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Siyin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+X">Xingsong Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+Q">Qinyuan Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+J">Junwen Duan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shimin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+J">Jinlan Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xuanjing Huang</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> As Artificial General Intelligence (AGI) becomes increasingly integrated into various facets of human life, ensuring the safety and ethical alignment of such systems is paramount. Previous studies primarily focus on single-modality threats, which may not suffice given the integrated and complex nature of cross-modality interactions. We introduce a novel safety alignment challenge called Safe Inputs but Unsafe Output (SIUO) to evaluate cross-modality safety alignment. Specifically, it considers cases where single modalities are safe independently but could potentially lead to unsafe or unethical outputs when combined. To empirically investigate this problem, we developed the SIUO, a cross-modality benchmark encompassing 9 critical safety domains, such as self-harm, illegal activities, and privacy violations. Our findings reveal substantial safety vulnerabilities in both closed- and open-source LVLMs, such as GPT-4V and LLaVA, underscoring the inadequacy of current models to reliably interpret and respond to complex, real-world scenarios. 
</p> </div> </dd> <dt> <a name='item447'>[447]</a> <a href ="/abs/2406.16176" title="Abstract" id="2406.16176"> arXiv:2406.16176 </a> (replaced) [<a href="/pdf/2406.16176" title="Download PDF" id="pdf-2406.16176" aria-labelledby="pdf-2406.16176">pdf</a>, <a href="https://arxiv.org/html/2406.16176v2" title="View HTML" id="html-2406.16176" aria-labelledby="html-2406.16176" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16176" title="Other formats" id="oth-2406.16176" aria-labelledby="oth-2406.16176">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GraphEval36K: Benchmarking Coding and Reasoning Capabilities of Large Language Models on Graph Datasets </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiming Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zichen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Corcoran,+W">Will Corcoran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Misha">Misha Sra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+A+K">Ambuj K. Singh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally to this work. This paper has been accepted by NAACL 2025. GraphEval36K is available at <a href="https://grapheval36k.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have achieved remarkable success in natural language processing (NLP), demonstrating significant capabilities in processing and understanding text data. 
However, recent studies have identified limitations in LLMs' ability to manipulate, program, and reason about structured data, especially graphs. We introduce GraphEval36K, the first comprehensive graph dataset, comprising 40 graph coding problems and 36,900 test cases to evaluate the ability of LLMs on graph problem-solving. Our dataset is categorized into eight primary and four sub-categories to ensure a thorough evaluation across different types of graphs. We benchmark ten LLMs, finding that private models outperform open-source ones, though the gap is narrowing. We also analyze the performance of LLMs across directed vs undirected graphs, different kinds of graph concepts, and network models. Furthermore, to improve the usability of our evaluation framework, we propose Structured Symbolic Decomposition (SSD), an instruction-based method designed to enhance LLM performance on complex graph tasks. Results show that SSD improves the average passing rate of GPT-4, GPT-4o, Gemini-Pro and Claude-3-Sonnet by 8.38%, 6.78%, 29.28% and 25.28%, respectively. 
</p> </div> </dd> <dt> <a name='item448'>[448]</a> <a href ="/abs/2406.16746" title="Abstract" id="2406.16746"> arXiv:2406.16746 </a> (replaced) [<a href="/pdf/2406.16746" title="Download PDF" id="pdf-2406.16746" aria-labelledby="pdf-2406.16746">pdf</a>, <a href="https://arxiv.org/html/2406.16746v4" title="View HTML" id="html-2406.16746" aria-labelledby="html-2406.16746" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16746" title="Other formats" id="oth-2406.16746" aria-labelledby="oth-2406.16746">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Responsible Foundation Model Development Cheatsheet: A Review of Tools & Resources </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Longpre,+S">Shayne Longpre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Biderman,+S">Stella Biderman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Albalak,+A">Alon Albalak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schoelkopf,+H">Hailey Schoelkopf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McDuff,+D">Daniel McDuff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kapoor,+S">Sayash Kapoor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klyman,+K">Kevin Klyman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lo,+K">Kyle Lo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ilharco,+G">Gabriel Ilharco</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=San,+N">Nay San</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rauh,+M">Maribeth Rauh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Skowron,+A">Aviya Skowron</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vidgen,+B">Bertie Vidgen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Weidinger,+L">Laura Weidinger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Narayanan,+A">Arvind Narayanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sanh,+V">Victor Sanh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adelani,+D">David Adelani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+P">Percy Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bommasani,+R">Rishi Bommasani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Henderson,+P">Peter Henderson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luccioni,+S">Sasha Luccioni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jernite,+Y">Yacine Jernite</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soldaini,+L">Luca Soldaini</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Foundation model development attracts a rapidly expanding body of contributors, scientists, and applications. To help shape responsible development practices, we introduce the Foundation Model Development Cheatsheet: a growing collection of 250+ tools and resources spanning text, vision, and speech modalities. We draw on a large body of prior work to survey resources (e.g. software, documentation, frameworks, guides, and practical tools) that support informed data selection, processing, and understanding, precise and limitation-aware artifact documentation, efficient model training, advance awareness of the environmental impact from training, careful model evaluation of capabilities, risks, and claims, as well as responsible model release, licensing and deployment practices. 
We hope this curated collection of resources helps guide more responsible development. The process of curating this list, enabled us to review the AI development ecosystem, revealing what tools are critically missing, misused, or over-used in existing practices. We find that (i) tools for data sourcing, model evaluation, and monitoring are critically under-serving ethical and real-world needs, (ii) evaluations for model safety, capabilities, and environmental impact all lack reproducibility and transparency, (iii) text and particularly English-centric analyses continue to dominate over multilingual and multi-modal analyses, and (iv) evaluation of systems, rather than just models, is needed so that capabilities and impact are assessed in context. </p> </div> </dd> <dt> <a name='item449'>[449]</a> <a href ="/abs/2407.00379" title="Abstract" id="2407.00379"> arXiv:2407.00379 </a> (replaced) [<a href="/pdf/2407.00379" title="Download PDF" id="pdf-2407.00379" aria-labelledby="pdf-2407.00379">pdf</a>, <a href="https://arxiv.org/html/2407.00379v2" title="View HTML" id="html-2407.00379" aria-labelledby="html-2407.00379" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.00379" title="Other formats" id="oth-2407.00379" aria-labelledby="oth-2407.00379">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GraphArena: Evaluating and Exploring Large Language Models on Graph Computation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jianheng Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qifan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuhan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+N">Nuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jia Li</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> ICLR 2025 camera ready version </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The &#8220;arms race&#8221; of Large Language Models (LLMs) demands new benchmarks to examine their progress. In this paper, we introduce GraphArena, a benchmarking tool designed to evaluate LLMs on real-world graph computational problems. It offers a suite of four polynomial-time tasks (e.g., Shortest Distance) and six NP-complete challenges (e.g., Traveling Salesman Problem). GraphArena features a rigorous evaluation framework that classifies LLM outputs as correct, suboptimal (feasible but not optimal), hallucinatory (properly formatted but infeasible), or missing. Evaluation of over 10 LLMs reveals that even top-performing LLMs struggle with larger, more complex graph problems and exhibit hallucination issues. We further explore four potential solutions to address this issue and improve LLMs on graph computation, including chain-of-thought prompting, instruction tuning, code writing, and scaling test-time compute, each demonstrating unique strengths and limitations. GraphArena complements the existing LLM benchmarks and is open-sourced at <a href="https://github.com/squareRoot3/GraphArena" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item450'>[450]</a> <a href ="/abs/2407.16205" title="Abstract" id="2407.16205"> arXiv:2407.16205 </a> (replaced) [<a href="/pdf/2407.16205" title="Download PDF" id="pdf-2407.16205" aria-labelledby="pdf-2407.16205">pdf</a>, <a href="https://arxiv.org/html/2407.16205v4" title="View HTML" id="html-2407.16205" aria-labelledby="html-2407.16205" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.16205" title="Other formats" id="oth-2407.16205" aria-labelledby="oth-2407.16205">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+S">Shi Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Rongchang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+C">Changting Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+W">Wenpeng Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+M">Meng Han</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> The rapid development of Large Language Models (LLMs) has brought significant advancements across various tasks. However, despite these achievements, LLMs still exhibit inherent safety vulnerabilities, especially when confronted with jailbreak attacks. 
Existing jailbreak methods suffer from two main limitations: reliance on complicated prompt engineering and iterative optimization, which lead to low attack success rate (ASR) and attack efficiency (AE). In this work, we propose an efficient jailbreak attack method, Analyzing-based Jailbreak (ABJ), which leverages the advanced reasoning capability of LLMs to autonomously generate harmful content, revealing their underlying safety vulnerabilities during complex reasoning process. We conduct comprehensive experiments on ABJ across various open-source and closed-source LLMs. In particular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional AE among all target LLMs, showcasing its remarkable attack effectiveness, transferability, and efficiency. Our findings underscore the urgent need to prioritize and improve the safety of LLMs to mitigate the risks of misuse. </p> </div> </dd> <dt> <a name='item451'>[451]</a> <a href ="/abs/2408.14134" title="Abstract" id="2408.14134"> arXiv:2408.14134 </a> (replaced) [<a href="/pdf/2408.14134" title="Download PDF" id="pdf-2408.14134" aria-labelledby="pdf-2408.14134">pdf</a>, <a href="https://arxiv.org/html/2408.14134v3" title="View HTML" id="html-2408.14134" aria-labelledby="html-2408.14134" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.14134" title="Other formats" id="oth-2408.14134" aria-labelledby="oth-2408.14134">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Potential of Large Language Models for Heterophilic Graphs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuxia Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shujie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+Y">Yuan Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+C">Chuan Shi</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Social and Information Networks (cs.SI) </div> <p class='mathjax'> Large language models (LLMs) have presented significant opportunities to enhance various machine learning applications, including graph neural networks (GNNs). By leveraging the vast open-world knowledge within LLMs, we can more effectively interpret and utilize textual data to better characterize heterophilic graphs, where neighboring nodes often have different labels. However, existing approaches for heterophilic graphs overlook the rich textual data associated with nodes, which could unlock deeper insights into their heterophilic contexts. In this work, we explore the potential of LLMs for modeling heterophilic graphs and propose a novel two-stage framework: LLM-enhanced edge discriminator and LLM-guided edge reweighting. In the first stage, we fine-tune the LLM to better identify homophilic and heterophilic edges based on the textual content of their nodes. In the second stage, we adaptively manage message propagation in GNNs for different edge types based on node features, structures, and heterophilic or homophilic characteristics. To cope with the computational demands when deploying LLMs in practical scenarios, we further explore model distillation techniques to fine-tune smaller, more efficient models that maintain competitive performance. Extensive experiments validate the effectiveness of our framework, demonstrating the feasibility of using LLMs to enhance node classification on heterophilic graphs. 
</p> </div> </dd> <dt> <a name='item452'>[452]</a> <a href ="/abs/2409.06635" title="Abstract" id="2409.06635"> arXiv:2409.06635 </a> (replaced) [<a href="/pdf/2409.06635" title="Download PDF" id="pdf-2409.06635" aria-labelledby="pdf-2409.06635">pdf</a>, <a href="https://arxiv.org/html/2409.06635v3" title="View HTML" id="html-2409.06635" aria-labelledby="html-2409.06635" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06635" title="Other formats" id="oth-2409.06635" aria-labelledby="oth-2409.06635">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Shuo Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xunlong Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhuohan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yingxu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+G">Geyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+N+F">Nancy F. 
Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aw,+A+T">Ai Ti Aw</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The rapid advancements in large language models (LLMs) have significantly enhanced natural language processing capabilities, facilitating the development of AudioLLMs that process and understand speech and audio inputs alongside text. Existing AudioLLMs typically combine a pre-trained audio encoder with a pre-trained LLM, which are subsequently finetuned on specific audio tasks. However, the pre-trained audio encoder has constrained capacity to capture features for new tasks and datasets. To address this, we propose to incorporate mixtures of `weak' encoders (MoWE) into the AudioLLM framework. MoWE supplements a base encoder with a pool of relatively light weight encoders, selectively activated based on the audio input to enhance feature extraction without significantly increasing model size. Our empirical results demonstrate that MoWE effectively improves multi-task performance, broadening the applicability of AudioLLMs to more diverse audio tasks. 
</p> </div> </dd> <dt> <a name='item453'>[453]</a> <a href ="/abs/2409.19058" title="Abstract" id="2409.19058"> arXiv:2409.19058 </a> (replaced) [<a href="/pdf/2409.19058" title="Download PDF" id="pdf-2409.19058" aria-labelledby="pdf-2409.19058">pdf</a>, <a href="https://arxiv.org/html/2409.19058v2" title="View HTML" id="html-2409.19058" aria-labelledby="html-2409.19058" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.19058" title="Other formats" id="oth-2409.19058" aria-labelledby="oth-2409.19058">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLLMate: A Multimodal Benchmark for Weather and Climate Events Forecasting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haobo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhaowei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiachen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yueya Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lau,+A+K+H">Alexis Kai Hon Lau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+H">Huamin Qu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Atmospheric and Oceanic Physics (physics.ao-ph) </div> <p class='mathjax'> Forecasting weather and climate events is crucial for making appropriate measures to mitigate environmental hazards and minimize losses. However, existing environmental forecasting research focuses narrowly on predicting numerical meteorological variables (e.g., temperature), neglecting the translation of these variables into actionable textual narratives of events and their consequences. 
To bridge this gap, we proposed Weather and Climate Event Forecasting (WCEF), a new task that leverages numerical meteorological raster data and textual event data to predict weather and climate events. This task is challenging to accomplish due to difficulties in aligning multimodal data and the lack of supervised datasets. To address these challenges, we present CLLMate, the first multimodal dataset for WCEF, using 26,156 environmental news articles aligned with ERA5 reanalysis data. We systematically benchmark 23 existing MLLMs on CLLMate, including closed-source, open-source, and our fine-tuned models. Our experiments reveal the advantages and limitations of existing MLLMs and the value of CLLMate for the training and benchmarking of the WCEF task. </p> </div> </dd> <dt> <a name='item454'>[454]</a> <a href ="/abs/2409.19483" title="Abstract" id="2409.19483"> arXiv:2409.19483 </a> (replaced) [<a href="/pdf/2409.19483" title="Download PDF" id="pdf-2409.19483" aria-labelledby="pdf-2409.19483">pdf</a>, <a href="https://arxiv.org/html/2409.19483v4" title="View HTML" id="html-2409.19483" aria-labelledby="html-2409.19483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.19483" title="Other formats" id="oth-2409.19483" aria-labelledby="oth-2409.19483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koleilat,+T">Taha Koleilat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Asgariandehkordi,+H">Hojat Asgariandehkordi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rivaz,+H">Hassan Rivaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yiming Xiao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 2 figures, 6 
tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Segmentation of anatomical structures and pathological regions in medical images is essential for modern clinical diagnosis, disease research, and treatment planning. While significant advancements have been made in deep learning-based segmentation techniques, many of these methods still suffer from limitations in data efficiency, generalizability, and interactivity. As a result, developing precise segmentation methods that require fewer labeled datasets remains a critical challenge in medical image analysis. Recently, the introduction of foundation models like CLIP and Segment-Anything-Model (SAM), with robust cross-domain representations, has paved the way for interactive and universal image segmentation. However, further exploration of these models for data-efficient segmentation in medical imaging is still needed and highly relevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that integrates the CLIP and SAM models to perform segmentation on clinical scans using text prompts, in both zero-shot and weakly supervised settings. Our approach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard Negative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the Multi-modal Information Bottleneck (M2IB) to create visual prompts for generating segmentation masks from SAM in the zero-shot setting. We also investigate using zero-shot segmentation labels within a weakly supervised paradigm to enhance segmentation quality further. Extensive testing across four diverse segmentation tasks and medical imaging modalities (breast tumor ultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high accuracy of our proposed framework. 
Our code is available at <a href="https://github.com/HealthX-Lab/MedCLIP-SAMv2" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item455'>[455]</a> <a href ="/abs/2410.01434" title="Abstract" id="2410.01434"> arXiv:2410.01434 </a> (replaced) [<a href="/pdf/2410.01434" title="Download PDF" id="pdf-2410.01434" aria-labelledby="pdf-2410.01434">pdf</a>, <a href="/format/2410.01434" title="Other formats" id="oth-2410.01434" aria-labelledby="oth-2410.01434">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Circuit Compositions: Exploring Modular Structures in Transformer-Based Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mondorf,+P">Philipp Mondorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wold,+S">Sondre Wold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 21 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> A fundamental question in interpretability research is to what extent neural networks, particularly language models, implement reusable functions through subnetworks that can be composed to perform more complex tasks. Recent advances in mechanistic interpretability have made progress in identifying $\textit{circuits}$, which represent the minimal computational subgraphs responsible for a model's behavior on specific tasks. However, most studies focus on identifying circuits for individual tasks without investigating how functionally similar circuits $\textit{relate}$ to each other. 
To address this gap, we study the modularity of neural networks by analyzing circuits for highly compositional subtasks within a transformer-based language model. Specifically, given a probabilistic context-free grammar, we identify and compare circuits responsible for ten modular string-edit operations. Our results indicate that functionally similar circuits exhibit both notable node overlap and cross-task faithfulness. Moreover, we demonstrate that the circuits identified can be reused and combined through set operations to represent more complex functional model capabilities. </p> </div> </dd> <dt> <a name='item456'>[456]</a> <a href ="/abs/2410.02810" title="Abstract" id="2410.02810"> arXiv:2410.02810 </a> (replaced) [<a href="/pdf/2410.02810" title="Download PDF" id="pdf-2410.02810" aria-labelledby="pdf-2410.02810">pdf</a>, <a href="https://arxiv.org/html/2410.02810v2" title="View HTML" id="html-2410.02810" aria-labelledby="html-2410.02810" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02810" title="Other formats" id="oth-2410.02810" aria-labelledby="oth-2410.02810">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> StateAct: State Tracking and Reasoning for Acting and Planning with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rozanov,+N">Nikolai Rozanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rei,+M">Marek Rei</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 5 pages appendix, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Planning and acting to solve `real' tasks using large language models (LLMs) in interactive environments has 
become a new frontier for AI methods. While recent advances allowed LLMs to interact with online tools, solve robotics tasks and many more, long range reasoning tasks remain a problem for LLMs. Existing methods to address this issue are very resource intensive and require additional data or human crafted rules, instead, we propose a simple method based on few-shot in-context learning alone to enhance `chain-of-thought' with state-tracking for planning and acting with LLMs. We show that our method establishes the new state-of-the-art on Alfworld for in-context learning methods (+14\% over the previous best few-shot in-context learning method) and performs on par with methods that use additional training data and additional tools such as code-execution. We also demonstrate that our enhanced `chain-of-states' allows the agent to both solve longer horizon problems and to be more efficient in number of steps required to solve a task. We show that our method works across a variety of LLMs for both API-based and open source ones. Finally, we also conduct ablation studies and show that `chain-of-thoughts' helps state-tracking accuracy, while a json-structure harms overall performance. We open-source our code and annotations at <a href="https://github.com/ai-nikolai/StateAct" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item457'>[457]</a> <a href ="/abs/2410.02892" title="Abstract" id="2410.02892"> arXiv:2410.02892 </a> (replaced) [<a href="/pdf/2410.02892" title="Download PDF" id="pdf-2410.02892" aria-labelledby="pdf-2410.02892">pdf</a>, <a href="https://arxiv.org/html/2410.02892v2" title="View HTML" id="html-2410.02892" aria-labelledby="html-2410.02892" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02892" title="Other formats" id="oth-2410.02892" aria-labelledby="oth-2410.02892">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Role of Deductive and Inductive Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+C">Chengkun Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haoliang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zhongyu Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tianfang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zongkai Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+J">Jenq-Neng Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belongie,+S">Serge Belongie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lei Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive capabilities in reasoning tasks, yet their reliance on static prompt structures and 
limited adaptability to complex scenarios remains a significant challenge. In this paper, we propose the Deductive and InDuctive(DID) method, a novel framework that enhances LLM reasoning by dynamically integrating both deductive and inductive reasoning approaches. Drawing from cognitive science principles, DID implements a dual-metric complexity evaluation system that combines Littlestone dimension and information entropy to precisely assess task difficulty and guide decomposition strategies. DID enables the model to progressively adapt its reasoning pathways based on problem complexity, mirroring human cognitive processes. We evaluate DID's effectiveness across multiple benchmarks, including the AIW and MR-GSM8K, as well as our custom Holiday Puzzle dataset for temporal reasoning. Our results demonstrate significant improvements in reasoning quality and solution accuracy - achieving 70.3% accuracy on AIW (compared to 62.2% for Tree of Thought) while maintaining lower computational costs. The success of DID in improving LLM performance while preserving computational efficiency suggests promising directions for developing more cognitively aligned and capable language models. Our work contributes a theoretically grounded, input-centric approach to enhancing LLM reasoning capabilities, offering an efficient alternative to traditional output-exploration methods. 
</p> </div> </dd> <dt> <a name='item458'>[458]</a> <a href ="/abs/2410.10114" title="Abstract" id="2410.10114"> arXiv:2410.10114 </a> (replaced) [<a href="/pdf/2410.10114" title="Download PDF" id="pdf-2410.10114" aria-labelledby="pdf-2410.10114">pdf</a>, <a href="https://arxiv.org/html/2410.10114v3" title="View HTML" id="html-2410.10114" aria-labelledby="html-2410.10114" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10114" title="Other formats" id="oth-2410.10114" aria-labelledby="oth-2410.10114">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixture of Experts Made Personalized: Federated Prompt Learning for Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jun Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shandong Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Federated prompt learning benefits federated learning with CLIP-like Vision-Language Model's (VLM's) robust representation learning ability through prompt learning. However, current federated prompt learning methods are habitually restricted to the traditional FL paradigm, where the participating clients are generally only allowed to download a single globally aggregated model from the server. While justifiable for training full-sized models under federated settings, in this work, we argue that this paradigm is ill-suited for lightweight prompts. 
By facilitating the clients to download multiple pre-aggregated prompts as fixed non-local experts, we propose Personalized Federated Mixture of Adaptive Prompts (pFedMoAP), a novel FL framework that personalizes the prompt learning process through the lens of Mixture of Experts (MoE). pFedMoAP implements a local attention-based gating network that learns to generate enhanced text features for better alignment with local image data, benefiting from both local and downloaded non-local adaptive prompt experts. Extensive experiments on 9 datasets under various federated settings demonstrate the efficacy of the proposed pFedMoAP algorithm. The code is available at <a href="https://github.com/ljaiverson/pFedMoAP" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item459'>[459]</a> <a href ="/abs/2410.10293" title="Abstract" id="2410.10293"> arXiv:2410.10293 </a> (replaced) [<a href="/pdf/2410.10293" title="Download PDF" id="pdf-2410.10293" aria-labelledby="pdf-2410.10293">pdf</a>, <a href="https://arxiv.org/html/2410.10293v3" title="View HTML" id="html-2410.10293" aria-labelledby="html-2410.10293" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10293" title="Other formats" id="oth-2410.10293" aria-labelledby="oth-2410.10293">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FunnelRAG: A Coarse-to-Fine Progressive Retrieval Paradigm for RAG </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xinping Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Y">Yan Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zetian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xinshuo Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhenyu Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dongfang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+B">Baotian Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 6 figures, 13 tables. Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) prevails in Large Language Models. It mainly consists of retrieval and generation. The retrieval modules (a.k.a. retrievers) aim to find useful information used to facilitate the generation modules (a.k.a. generators). As such, generators' performance largely depends on the effectiveness and efficiency of retrievers. However, the widely used retrieval paradigm remains flat. It treats retrieval procedures as a one-off deal with constant granularity. Despite effectiveness, we argue that they suffer from two limitations: (1) flat retrieval exerts a significant burden on one retriever; (2) constant granularity limits the ceiling of retrieval performance. In this work, we propose a progressive retrieval paradigm with coarse-to-fine granularity for RAG, termed FunnelRAG, so as to balance effectiveness and efficiency. Specifically, FunnelRAG establishes a progressive retrieval pipeline by collaborating coarse-to-fine granularity, large-to-small quantity, and low-to-high capacity, which can relieve the burden on one retriever and also promote the ceiling of retrieval performance. Extensive experiments manifest that FunnelRAG achieves comparable retrieval performance while the time overhead is reduced by nearly 40 percent. 
</p> </div> </dd> <dt> <a name='item460'>[460]</a> <a href ="/abs/2410.12228" title="Abstract" id="2410.12228"> arXiv:2410.12228 </a> (replaced) [<a href="/pdf/2410.12228" title="Download PDF" id="pdf-2410.12228" aria-labelledby="pdf-2410.12228">pdf</a>, <a href="https://arxiv.org/html/2410.12228v2" title="View HTML" id="html-2410.12228" aria-labelledby="html-2410.12228" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12228" title="Other formats" id="oth-2410.12228" aria-labelledby="oth-2410.12228">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Triple Modality Fusion: Aligning Visual, Textual, and Graph Data with Large Language Models for Multi-Behavior Recommendations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+L">Luyi Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaohan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Z">Zezhong Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+K">Kai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jianpeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+J">Jason Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanumala,+P">Praveen Kanumala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nag,+K">Kaushiki Nag</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+S">Sushant Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Achan,+K">Kannan Achan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Integrating diverse data modalities is crucial for enhancing the performance of personalized 
recommendation systems. Traditional models, which often rely on singular data sources, lack the depth needed to accurately capture the multifaceted nature of item features and user behaviors. This paper introduces a novel framework for multi-behavior recommendations, leveraging the fusion of triple-modality, which is visual, textual, and graph data through alignment with large language models (LLMs). By incorporating visual information, we capture contextual and aesthetic item characteristics; textual data provides insights into user interests and item features in detail; and graph data elucidates relationships within the item-behavior heterogeneous graphs. Our proposed model called Triple Modality Fusion (TMF) utilizes the power of LLMs to align and integrate these three modalities, achieving a comprehensive representation of user behaviors. The LLM models the user's interactions including behaviors and item features in natural languages. Initially, the LLM is warmed up using only natural language-based prompts. We then devise the modality fusion module based on cross-attention and self-attention mechanisms to integrate different modalities from other models into the same embedding space and incorporate them into an LLM. Extensive experiments demonstrate the effectiveness of our approach in improving recommendation accuracy. Further ablation studies validate the effectiveness of our model design and benefits of the TMF. 
</p> </div> </dd> <dt> <a name='item461'>[461]</a> <a href ="/abs/2410.15281" title="Abstract" id="2410.15281"> arXiv:2410.15281 </a> (replaced) [<a href="/pdf/2410.15281" title="Download PDF" id="pdf-2410.15281" aria-labelledby="pdf-2410.15281">pdf</a>, <a href="https://arxiv.org/html/2410.15281v2" title="View HTML" id="html-2410.15281" aria-labelledby="html-2410.15281" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15281" title="Other formats" id="oth-2410.15281" aria-labelledby="oth-2410.15281">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models for Autonomous Driving (LLM4AD): Concept, Benchmark, Experiments, and Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+C">Can Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yunsheng Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zichong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yupeng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Peiran Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Juanwu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lingxi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yaobin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panchal,+J+H">Jitesh H. 
Panchal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdelraouf,+A">Amr Abdelraouf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+R">Rohit Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+K">Kyungtae Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziran Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> With the broader usage and highly successful development of Large Language Models (LLMs), there has been a growth of interest and demand for applying LLMs to autonomous driving technology. Driven by their natural language understanding and reasoning ability, LLMs have the potential to enhance various aspects of autonomous driving systems, from perception and scene understanding to language interaction and decision-making. In this paper, we first introduce the novel concept of designing LLMs for autonomous driving (LLM4AD). Then, we propose a comprehensive benchmark for evaluating the instruction-following abilities of LLM4AD in simulation. Furthermore, we conduct a series of experiments on real-world vehicle platforms, thoroughly evaluating the performance and potential of our LLM4AD systems. Finally, we envision the main challenges of LLM4AD, including latency, deployment, security and privacy, safety, trust and transparency, and personalization. Our research highlights the significant potential of LLMs to enhance various aspects of autonomous vehicle technology, from perception and scene understanding to language interaction and decision-making. 
</p> </div> </dd> <dt> <a name='item462'>[462]</a> <a href ="/abs/2410.15332" title="Abstract" id="2410.15332"> arXiv:2410.15332 </a> (replaced) [<a href="/pdf/2410.15332" title="Download PDF" id="pdf-2410.15332" aria-labelledby="pdf-2410.15332">pdf</a>, <a href="https://arxiv.org/html/2410.15332v2" title="View HTML" id="html-2410.15332" aria-labelledby="html-2410.15332" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15332" title="Other formats" id="oth-2410.15332" aria-labelledby="oth-2410.15332">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EPIC: Efficient Position-Independent Context Caching for Serving Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Junhao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wenrui Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weidong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+T">Tiancheng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+H">Hao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xusheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+Y">Yizhou Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+T">Tao Xie</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Distributed, Parallel, and Cluster Computing (cs.DC); Performance (cs.PF) </div> <p class='mathjax'> Large Language Models (LLMs) are critical for a wide range of applications, but serving them efficiently 
becomes increasingly challenging as inputs become more complex. Context caching improves serving performance by exploiting inter-request dependency and reusing key-value (KV) cache across requests, thus improving time-to-first-token (TTFT). However, existing prefix-based context caching requires exact token prefix matches, limiting cache reuse in few-shot learning, multi-document QA, or retrieval-augmented generation, where prefixes may vary. In this paper, we present EPIC, an LLM serving system that introduces position-independent context caching (PIC), enabling modular KV cache reuse regardless of token chunk position (or prefix). EPIC features two key designs: AttnLink, which leverages static attention sparsity to minimize recomputation for accuracy recovery, and KVSplit, a customizable chunking method that preserves semantic coherence. Our experiments demonstrate that Epic delivers up to 8x improvements in TTFT and 7x throughput over existing systems, with negligible or no accuracy loss. By addressing the limitations of traditional caching approaches, Epic enables more scalable and efficient LLM inference. 
</p> </div> </dd> <dt> <a name='item463'>[463]</a> <a href ="/abs/2410.16204" title="Abstract" id="2410.16204"> arXiv:2410.16204 </a> (replaced) [<a href="/pdf/2410.16204" title="Download PDF" id="pdf-2410.16204" aria-labelledby="pdf-2410.16204">pdf</a>, <a href="https://arxiv.org/html/2410.16204v3" title="View HTML" id="html-2410.16204" aria-labelledby="html-2410.16204" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16204" title="Other formats" id="oth-2410.16204" aria-labelledby="oth-2410.16204">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Machine Learning Approaches for Mental Illness Detection on Social Media: A Systematic Review of Biases and Methodological Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yuchen Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+J">Jianglai Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhongyan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yeyubei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiaorui Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yunchong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yexin Tian</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Journal of Behavioral Data Science, 5(1) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The global increase in mental illness requires innovative detection methods for early intervention. Social media provides a valuable platform to identify mental illness through user-generated content. 
This systematic review examines machine learning (ML) models for detecting mental illness, with a particular focus on depression, using social media data. It highlights biases and methodological challenges encountered throughout the ML lifecycle. A search of PubMed, IEEE Xplore, and Google Scholar identified 47 relevant studies published after 2010. The Prediction model Risk Of Bias ASsessment Tool (PROBAST) was utilized to assess methodological quality and risk of bias. <br>The review reveals significant biases affecting model reliability and generalizability. A predominant reliance on Twitter (63.8%) and English-language content (over 90%) limits diversity, with most studies focused on users from the United States and Europe. Non-probability sampling (80%) limits representativeness. Only 23% explicitly addressed linguistic nuances like negations, crucial for accurate sentiment analysis. Inconsistent hyperparameter tuning (27.7%) and inadequate data partitioning (17%) risk overfitting. While 74.5% used appropriate evaluation metrics for imbalanced data, others relied on accuracy without addressing class imbalance, potentially skewing results. Reporting transparency varied, often lacking critical methodological details. <br>These findings highlight the need to diversify data sources, standardize preprocessing, ensure consistent model development, address class imbalance, and enhance reporting transparency. By overcoming these challenges, future research can develop more robust and generalizable ML models for depression detection on social media, contributing to improved mental health outcomes globally. 
</p> </div> </dd> <dt> <a name='item464'>[464]</a> <a href ="/abs/2410.18057" title="Abstract" id="2410.18057"> arXiv:2410.18057 </a> (replaced) [<a href="/pdf/2410.18057" title="Download PDF" id="pdf-2410.18057" aria-labelledby="pdf-2410.18057">pdf</a>, <a href="https://arxiv.org/html/2410.18057v3" title="View HTML" id="html-2410.18057" aria-labelledby="html-2410.18057" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.18057" title="Other formats" id="oth-2410.18057" aria-labelledby="oth-2410.18057">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLEAR: Character Unlearning in Textual and Visual Modalities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dontsov,+A">Alexey Dontsov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Korzh,+D">Dmitrii Korzh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhavoronkin,+A">Alexey Zhavoronkin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mikheev,+B">Boris Mikheev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bobkov,+D">Denis Bobkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alanov,+A">Aibek Alanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rogov,+O+Y">Oleg Y. Rogov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseledets,+I">Ivan Oseledets</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tutubalina,+E">Elena Tutubalina</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Machine Unlearning (MU) is critical for removing private or hazardous information from deep learning models. 
While MU has advanced significantly in unimodal (text or vision) settings, multimodal unlearning (MMU) remains underexplored due to the lack of open benchmarks for evaluating cross-modal data removal. To address this gap, we introduce CLEAR, the first open-source benchmark designed specifically for MMU. CLEAR contains 200 fictitious individuals and 3,700 images linked with corresponding question-answer pairs, enabling a thorough evaluation across modalities. We conduct a comprehensive analysis of 11 MU methods (e.g., SCRUB, gradient ascent, DPO) across four evaluation sets, demonstrating that jointly unlearning both modalities outperforms single-modality approaches. The dataset is available at <a href="https://huggingface.co/datasets/therem/CLEAR" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item465'>[465]</a> <a href ="/abs/2410.19817" title="Abstract" id="2410.19817"> arXiv:2410.19817 </a> (replaced) [<a href="/pdf/2410.19817" title="Download PDF" id="pdf-2410.19817" aria-labelledby="pdf-2410.19817">pdf</a>, <a href="https://arxiv.org/html/2410.19817v2" title="View HTML" id="html-2410.19817" aria-labelledby="html-2410.19817" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.19817" title="Other formats" id="oth-2410.19817" aria-labelledby="oth-2410.19817">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step Guided Reasoning: Improving Mathematical Reasoning using Guidance Generation and Step Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+L">Lang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+C">Chao Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Renhong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ning,+W">Wu Ning</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Y">Yingtian Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yitong Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Mathematical reasoning has been challenging for large language models (LLMs). However, the introduction of step-by-step Chain-of-Thought (CoT) inference has significantly advanced the mathematical capabilities of LLMs. Despite this progress, current approaches either necessitate extensive inference datasets for training or depend on few-shot methods that frequently compromise computational accuracy. To address these bottlenecks in mathematical reasoning, we propose a novel method called Step Guided Reasoning, which is more stable and generalizable than few-shot methods and does not involve further fine-tuning of the model. In this approach, LLMs reflect on small reasoning steps, similar to how humans deliberate and focus attention on what to do next. By incorporating this reflective process into the inference stage, LLMs can effectively guide their reasoning from one step to the next. Through extensive experiments, we demonstrate the significant effect of Step Guided Reasoning in augmenting mathematical performance in state-of-the-art language models. Qwen2-72B-Instruct outperforms its math-specific counterpart, Qwen2.5-72B-Math-Instruct, on MMLU-STEM with a score of 90.9%, compared to 87.3%. The average scores of Qwen2-7B-Instruct and Qwen2-72B-Instruct increase from 27.1% to 36.3% and from 36.5% to 47.4% on the mathematics domain, respectively. 
</p> </div> </dd> <dt> <a name='item466'>[466]</a> <a href ="/abs/2410.22353" title="Abstract" id="2410.22353"> arXiv:2410.22353 </a> (replaced) [<a href="/pdf/2410.22353" title="Download PDF" id="pdf-2410.22353" aria-labelledby="pdf-2410.22353">pdf</a>, <a href="https://arxiv.org/html/2410.22353v3" title="View HTML" id="html-2410.22353" aria-labelledby="html-2410.22353" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.22353" title="Other formats" id="oth-2410.22353" aria-labelledby="oth-2410.22353">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RuleRAG: Rule-Guided Retrieval-Augmented Generation with Language Models for Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhongwu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chengjin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dingmin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Z">Zhen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+Y">Yong Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xuhui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jian Guo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) has shown promising potential in knowledge intensive question answering (QA). However, existing approaches only consider the query itself, neither specifying the retrieval preferences for the retrievers nor informing the generators of how to refer to the retrieved documents for the answers, which poses a significant challenge to the QA performance. 
To address these issues, we propose Rule-guided Retrieval-Augmented Generation with LMs, which explicitly introduces rules for in-context learning (RuleRAG-ICL) to guide retrievers to recall related documents in the directions of rules and uniformly guide generators to reason attributed by the same rules. Moreover, most existing RAG datasets were constructed without considering rules and Knowledge Graphs (KGs) are recognized as providing high-quality rules. Therefore, we construct five rule-aware RAG benchmarks for QA, RuleQA, based on KGs to stress the significance of retrieval and reasoning with rules. Experiments on RuleQA demonstrate RuleRAG-ICL improves the retrieval quality of +89.2% in Recall@10 and answer accuracy of +103.1% in Exact Match, and RuleRAG-FT yields more enhancement. In addition, experiments on four existing RAG datasets show RuleRAG is also effective by offering rules in RuleQA to them, further proving the generalization of rule guidance in RuleRAG. </p> </div> </dd> <dt> <a name='item467'>[467]</a> <a href ="/abs/2411.02400" title="Abstract" id="2411.02400"> arXiv:2411.02400 </a> (replaced) [<a href="/pdf/2411.02400" title="Download PDF" id="pdf-2411.02400" aria-labelledby="pdf-2411.02400">pdf</a>, <a href="https://arxiv.org/html/2411.02400v2" title="View HTML" id="html-2411.02400" aria-labelledby="html-2411.02400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.02400" title="Other formats" id="oth-2411.02400" aria-labelledby="oth-2411.02400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decomposition Dilemmas: Does Claim Decomposition Boost or Burden Fact-Checking Performance? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Q">Qisheng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Long,+Q">Quanyu Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenya Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Fact-checking pipelines increasingly adopt the Decompose-Then-Verify paradigm, where texts are broken down into smaller claims for individual verification and subsequently combined for a veracity decision. While decomposition is widely-adopted in such pipelines, its effects on final fact-checking performance remain underexplored. Some studies have reported improvements from decomposition, while others have observed performance declines, indicating its inconsistent impact. To date, no comprehensive analysis has been conducted to understand this variability. To address this gap, we present an in-depth analysis that explicitly examines the impact of decomposition on downstream verification performance. Through error case inspection and experiments, we introduce a categorization of decomposition errors and reveal a trade-off between accuracy gains and the noise introduced through decomposition. Our analysis provides new insights into understanding current system's instability and offers guidance for future studies toward improving claim decomposition in fact-checking pipelines. 
</p> </div> </dd> <dt> <a name='item468'>[468]</a> <a href ="/abs/2411.03823" title="Abstract" id="2411.03823"> arXiv:2411.03823 </a> (replaced) [<a href="/pdf/2411.03823" title="Download PDF" id="pdf-2411.03823" aria-labelledby="pdf-2411.03823">pdf</a>, <a href="https://arxiv.org/html/2411.03823v2" title="View HTML" id="html-2411.03823" aria-labelledby="html-2411.03823" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.03823" title="Other formats" id="oth-2411.03823" aria-labelledby="oth-2411.03823">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM Data Contamination </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Dingjie Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+S">Sicheng Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shunian Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Lichao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code Available: <a href="https://github.com/MLLM-Data-Contamination/MM-Detect" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM) </div> <p class='mathjax'> The rapid progression of multimodal large language models (MLLMs) has demonstrated superior performance on various multimodal benchmarks. However, the issue of data contamination during training creates challenges in performance evaluation and comparison. 
While numerous methods exist for detecting models' contamination in large language models (LLMs), they are less effective for MLLMs due to their various modalities and multiple training phases. In this study, we introduce a multimodal data contamination detection framework, MM-Detect, designed for MLLMs. Our experimental results indicate that MM-Detect is quite effective and sensitive in identifying varying degrees of contamination, and can highlight significant performance improvements due to the leakage of multimodal benchmark training sets. Furthermore, we explore whether the contamination originates from the base LLMs used by MLLMs or the multimodal training phase, providing new insights into the stages at which contamination may be introduced. </p> </div> </dd> <dt> <a name='item469'>[469]</a> <a href ="/abs/2411.07546" title="Abstract" id="2411.07546"> arXiv:2411.07546 </a> (replaced) [<a href="/pdf/2411.07546" title="Download PDF" id="pdf-2411.07546" aria-labelledby="pdf-2411.07546">pdf</a>, <a href="https://arxiv.org/html/2411.07546v2" title="View HTML" id="html-2411.07546" aria-labelledby="html-2411.07546" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07546" title="Other formats" id="oth-2411.07546" aria-labelledby="oth-2411.07546">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Contrastive Language Prompting to Ease False Positives in Medical Anomaly Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+Y">YeongHyeon Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M+J">Myung Jin Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H+S">Hyeong Seok Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 3 figures, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> A pre-trained visual-language model, contrastive language-image pre-training (CLIP), successfully accomplishes various downstream tasks with text prompts, such as finding images or localizing regions within the image. Despite CLIP's strong multi-modal data capabilities, it remains limited in specialized environments, such as medical applications. For this purpose, many CLIP variants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives related to normal regions persist. Thus, we aim to present a simple yet important goal of reducing false positives in medical anomaly detection. We introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both positive and negative text prompts. This straightforward approach identifies potential lesion regions by visual attention to the positive prompts in the given image. To reduce false positives, we attenuate attention on normal regions using negative prompts. Extensive experiments with the BMAD dataset, including six biomedical benchmarks, demonstrate that CLAP method enhances anomaly detection performance. Our future plans include developing an automated fine prompting method for more practical usage. 
</p> </div> </dd> <dt> <a name='item470'>[470]</a> <a href ="/abs/2411.14708" title="Abstract" id="2411.14708"> arXiv:2411.14708 </a> (replaced) [<a href="/pdf/2411.14708" title="Download PDF" id="pdf-2411.14708" aria-labelledby="pdf-2411.14708">pdf</a>, <a href="https://arxiv.org/html/2411.14708v3" title="View HTML" id="html-2411.14708" aria-labelledby="html-2411.14708" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14708" title="Other formats" id="oth-2411.14708" aria-labelledby="oth-2411.14708">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding LLM Embeddings for Regression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+E">Eric Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+B">Bangding Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xingyou Song</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published in Transactions on Machine Learning Research (TMLR) 2025. Code can be found in <a href="https://github.com/google-research/optformer" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> With the rise of large language models (LLMs) for flexibly processing information as strings, a natural application is regression, specifically by preprocessing string representations into LLM embeddings as downstream features for metric prediction. 
In this paper, we provide one of the first comprehensive investigations into embedding-based regression and demonstrate that LLM embeddings as features can be better for high-dimensional regression tasks than using traditional feature engineering. This regression performance can be explained in part due to LLM embeddings over numeric data inherently preserving Lipschitz continuity over the feature space. Furthermore, we quantify the contribution of different model effects, most notably model size and language understanding, which we find surprisingly do not always improve regression performance. </p> </div> </dd> <dt> <a name='item471'>[471]</a> <a href ="/abs/2411.15737" title="Abstract" id="2411.15737"> arXiv:2411.15737 </a> (replaced) [<a href="/pdf/2411.15737" title="Download PDF" id="pdf-2411.15737" aria-labelledby="pdf-2411.15737">pdf</a>, <a href="https://arxiv.org/html/2411.15737v3" title="View HTML" id="html-2411.15737" aria-labelledby="html-2411.15737" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.15737" title="Other formats" id="oth-2411.15737" aria-labelledby="oth-2411.15737">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TableTime: Reformulating Time Series Classification as Training-Free Table Understanding with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiahao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Mingyue Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Q">Qingyang Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yitong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+F">Feiyang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xin Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated their effectiveness in multivariate time series classification (MTSC). Effective adaptation of LLMs for MTSC necessitates informative data representations. Existing LLM-based methods directly encode embeddings for time series within the latent space of LLMs from scratch to align with semantic space of LLMs. Despite their effectiveness, we reveal that these methods conceal three inherent bottlenecks: (1) they struggle to encode temporal and channel-specific information in a lossless manner, both of which are critical components of multivariate time series; (2) it is much difficult to align the learned representation space with the semantic space of the LLMs; (3) they require task-specific retraining, which is both computationally expensive and labor-intensive. To bridge these gaps, we propose TableTime, which reformulates MTSC as a table understanding task. Specifically, TableTime introduces the following strategies: (1) convert multivariate time series into a tabular form, thus minimizing information loss to the greatest extent; (2) represent tabular time series in text format to achieve natural alignment with the semantic space of LLMs; (3) design a reasoning framework that integrates contextual text information, neighborhood assistance, multi-path inference and problem decomposition to enhance the reasoning ability of LLMs and realize zero-shot classification. Extensive experiments performed on 10 publicly representative datasets from UEA archive verify the superiorities of the TableTime. 
</p> </div> </dd> <dt> <a name='item472'>[472]</a> <a href ="/abs/2411.19378" title="Abstract" id="2411.19378"> arXiv:2411.19378 </a> (replaced) [<a href="/pdf/2411.19378" title="Download PDF" id="pdf-2411.19378" aria-labelledby="pdf-2411.19378">pdf</a>, <a href="https://arxiv.org/html/2411.19378v2" title="View HTML" id="html-2411.19378" aria-labelledby="html-2411.19378" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.19378" title="Other formats" id="oth-2411.19378" aria-labelledby="oth-2411.19378">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Libra: Leveraging Temporal Images for Biomedical Radiology Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Z">Zaiqiao Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lever,+J">Jake Lever</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ho,+E+S+L">Edmond S. L. Ho</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 5 figures, Adding Appendix </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Radiology report generation (RRG) requires advanced medical image analysis, effective temporal reasoning, and accurate text generation. While multimodal large language models (MLLMs) align with pre-trained vision encoders to enhance visual-language understanding, most existing methods rely on single-image analysis or rule-based heuristics to process multiple images, failing to fully leverage temporal information in multi-modal medical datasets. 
In this paper, we introduce Libra, a temporal-aware MLLM tailored for chest X-ray report generation. Libra combines a radiology-specific image encoder with a novel Temporal Alignment Connector (TAC), designed to accurately capture and integrate temporal differences between paired current and prior images. Extensive experiments on the MIMIC-CXR dataset demonstrate that Libra establishes a new state-of-the-art benchmark among similarly scaled MLLMs, setting new standards in both clinical relevance and lexical accuracy. </p> </div> </dd> <dt> <a name='item473'>[473]</a> <a href ="/abs/2412.00069" title="Abstract" id="2412.00069"> arXiv:2412.00069 </a> (replaced) [<a href="/pdf/2412.00069" title="Download PDF" id="pdf-2412.00069" aria-labelledby="pdf-2412.00069">pdf</a>, <a href="https://arxiv.org/html/2412.00069v2" title="View HTML" id="html-2412.00069" aria-labelledby="html-2412.00069" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.00069" title="Other formats" id="oth-2412.00069" aria-labelledby="oth-2412.00069">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Condense, Don't Just Prune: Enhancing Efficiency and Performance in MoE Layer Pruning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+M">Mingyu Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+J">Jie Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiaqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+X">Xiaolong Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+L">Lu Yin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning 
(cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Mixture-of-Experts (MoE) has garnered significant attention for its ability to scale up neural networks while utilizing the same or even fewer active parameters. However, MoE does not alleviate the massive memory requirements of networks, which limits their practicality in real-world applications, especially in the era of large language models (LLMs). While recent work explores the possibility of removing entire layers of MoE to reduce memory, the performance degradation is still notable. In this paper, we propose ConDense-MoE (CD-MoE), which, instead of dropping the entire MoE layer, condenses the large, sparse MoE layer into a smaller, denser layer with only a few experts activated for all tokens, while maintaining hardware friendliness. Our approach is specifically designed for fine-grained MoE with shared experts, where Feed-Forward Networks are split into many small experts, with certain experts isolated to serve as shared experts that are always activated, such as DeepSeekMoE and QwenMoE. We demonstrate the effectiveness of our method. Specifically, for the DeepSeekMoE-16B model, our approach maintains 90% of the average accuracy while reducing memory usage by 27.5% and increasing inference speed by 1.26 times. Moreover, we show that by applying lightweight expert fine-tuning -- only to the condensed layers -- and using 5 hours on a single 80G A100 GPU, we can successfully recover 98% of the original performance. Our code is available at: <a href="https://github.com/duterscmy/CD-MoE/tree/main" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item474'>[474]</a> <a href ="/abs/2412.00804" title="Abstract" id="2412.00804"> arXiv:2412.00804 </a> (replaced) [<a href="/pdf/2412.00804" title="Download PDF" id="pdf-2412.00804" aria-labelledby="pdf-2412.00804">pdf</a>, <a href="https://arxiv.org/html/2412.00804v2" title="View HTML" id="html-2412.00804" aria-labelledby="html-2412.00804" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.00804" title="Other formats" id="oth-2412.00804" aria-labelledby="oth-2412.00804">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Examining Identity Drift in Conversations of LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+J">Junhyuk Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+Y">Yeseon Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Minju Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Bugeun Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) show impressive conversational abilities but sometimes show identity drift problems, where their interaction patterns or styles change over time. As the problem has not been thoroughly examined yet, this study examines identity consistency across nine LLMs. Specifically, we (1) investigate whether LLMs could maintain consistent patterns (or identity) and (2) analyze the effect of the model family, parameter sizes, and provided persona types. Our experiments involve multi-turn conversations on personal themes, analyzed in qualitative and quantitative ways. 
Experimental results indicate three findings. (1) Larger models experience greater identity drift. (2) Model differences exist, but their effect is not stronger than parameter sizes. (3) Assigning a persona may not help to maintain identity. We hope these three findings can help to improve persona stability in AI-driven dialogue systems, particularly in long-term conversations. </p> </div> </dd> <dt> <a name='item475'>[475]</a> <a href ="/abs/2412.06394" title="Abstract" id="2412.06394"> arXiv:2412.06394 </a> (replaced) [<a href="/pdf/2412.06394" title="Download PDF" id="pdf-2412.06394" aria-labelledby="pdf-2412.06394">pdf</a>, <a href="https://arxiv.org/html/2412.06394v5" title="View HTML" id="html-2412.06394" aria-labelledby="html-2412.06394" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.06394" title="Other formats" id="oth-2412.06394" aria-labelledby="oth-2412.06394">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GameArena: Evaluating LLM Reasoning through Live Computer Games </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+L">Lanxiang Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qiyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+A">Anze Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+N">Nan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stoica,+I">Ion Stoica</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+H">Haojian Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hao Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Evaluating the reasoning abilities of large language models (LLMs) is 
challenging. Existing benchmarks often depend on static datasets, which are vulnerable to data contamination and may get saturated over time, or on binary live human feedback that conflates reasoning with other abilities. As the most prominent dynamic benchmark, Chatbot Arena evaluates open-ended questions in real-world settings, but lacks the granularity in assessing specific reasoning capabilities. We introduce GameArena, a dynamic benchmark designed to evaluate LLM reasoning capabilities through interactive gameplay with humans. GameArena consists of three games designed to test specific reasoning capabilities (e.g., deductive and inductive reasoning), while keeping participants entertained and engaged. We analyze the gaming data retrospectively to uncover the underlying reasoning processes of LLMs and measure their fine-grained reasoning capabilities. We collect over 2000 game sessions and provide detailed assessments of various reasoning capabilities for five state-of-the-art LLMs. Our user study with 100 participants suggests that GameArena improves user engagement compared to Chatbot Arena. For the first time, GameArena enables the collection of step-by-step LLM reasoning data in the wild. 
</p> </div> </dd> <dt> <a name='item476'>[476]</a> <a href ="/abs/2412.06720" title="Abstract" id="2412.06720"> arXiv:2412.06720 </a> (replaced) [<a href="/pdf/2412.06720" title="Download PDF" id="pdf-2412.06720" aria-labelledby="pdf-2412.06720">pdf</a>, <a href="https://arxiv.org/html/2412.06720v4" title="View HTML" id="html-2412.06720" aria-labelledby="html-2412.06720" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.06720" title="Other formats" id="oth-2412.06720" aria-labelledby="oth-2412.06720">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VP-MEL: Visual Prompts Guided Multimodal Entity Linking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Hongze Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jinyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuying Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+H">Haoran Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiahao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Di Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+G">Gang Pan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Multimodal entity linking (MEL), a task aimed at linking mentions within multimodal contexts to their corresponding entities in a knowledge base (KB), has attracted much attention due to its wide applications in recent years. However, existing MEL methods often rely on mention words as retrieval cues, which limits their ability to effectively utilize information from both images and text. 
This reliance causes MEL to struggle with accurately retrieving entities in certain scenarios, especially when the focus is on image objects or mention words are missing from the text. To solve these issues, we introduce a Visual Prompts guided Multimodal Entity Linking (VP-MEL) task. Given a text-image pair, VP-MEL aims to link a marked region (i.e., visual prompt) in an image to its corresponding entities in the knowledge base. To facilitate this task, we present a new dataset, VPWiki, specifically designed for VP-MEL. Furthermore, we propose a framework named IIER, which enhances visual feature extraction using visual prompts and leverages the pretrained Detective-VLM model to capture latent information. Experimental results on the VPWiki dataset demonstrate that IIER outperforms baseline methods across multiple benchmarks for the VP-MEL task. </p> </div> </dd> <dt> <a name='item477'>[477]</a> <a href ="/abs/2412.11694" title="Abstract" id="2412.11694"> arXiv:2412.11694 </a> (replaced) [<a href="/pdf/2412.11694" title="Download PDF" id="pdf-2412.11694" aria-labelledby="pdf-2412.11694">pdf</a>, <a href="https://arxiv.org/html/2412.11694v2" title="View HTML" id="html-2412.11694" aria-labelledby="html-2412.11694" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11694" title="Other formats" id="oth-2412.11694" aria-labelledby="oth-2412.11694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Specific-MLLMs to Omni-MLLMs: A Survey on MLLMs Aligned with Multi-modalities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shixin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+J">Jiafeng Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiyuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xuan Dong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+H">Heng Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+W">Weijiang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+J">Jinhua Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Ming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> To tackle complex tasks in real-world scenarios, more researchers are focusing on Omni-MLLMs, which aim to achieve omni-modal understanding and generation. Beyond the constraints of any specific non-linguistic modality, Omni-MLLMs map various non-linguistic modalities into the embedding space of LLMs and enable the interaction and understanding of arbitrary combinations of modalities within a single model. In this paper, we systematically investigate relevant research and provide a comprehensive survey of Omni-MLLMs. Specifically, we first explain the four core components of Omni-MLLMs for unified multi-modal modeling with a meticulous taxonomy that offers novel perspectives. Then, we introduce the effective integration achieved through two-stage training and discuss the corresponding datasets as well as evaluation. Furthermore, we summarize the main challenges of current Omni-MLLMs and outline future directions. We hope this paper serves as an introduction for beginners and promotes the advancement of related research. Resources will be made public. 
</p> </div> </dd> <dt> <a name='item478'>[478]</a> <a href ="/abs/2412.13631" title="Abstract" id="2412.13631"> arXiv:2412.13631 </a> (replaced) [<a href="/pdf/2412.13631" title="Download PDF" id="pdf-2412.13631" aria-labelledby="pdf-2412.13631">pdf</a>, <a href="https://arxiv.org/html/2412.13631v2" title="View HTML" id="html-2412.13631" aria-labelledby="html-2412.13631" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.13631" title="Other formats" id="oth-2412.13631" aria-labelledby="oth-2412.13631">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mind Your Theory: Theory of Mind Goes Deeper Than Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wagner,+E">Eitan Wagner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alon,+N">Nitay Alon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barnby,+J+M">Joseph M. Barnby</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abend,+O">Omri Abend</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Theory of Mind (ToM) capabilities in LLMs have recently become a central object of investigation. Cognitive science distinguishes between two steps required for ToM tasks: 1) determine whether to invoke ToM, which includes the appropriate Depth of Mentalizing (DoM), or level of recursion required to complete a task; and 2) applying the correct inference given the DoM. In this position paper, we first identify several lines of work in different communities in AI, including LLM benchmarking, ToM add-ons, ToM probing, and formal models for ToM. 
We argue that recent work in AI tends to focus exclusively on the second step, which is typically framed as a static logic problem. We conclude with suggestions for improved evaluation of ToM capabilities inspired by dynamic environments used in cognitive tasks. </p> </div> </dd> <dt> <a name='item479'>[479]</a> <a href ="/abs/2501.01558" title="Abstract" id="2501.01558"> arXiv:2501.01558 </a> (replaced) [<a href="/pdf/2501.01558" title="Download PDF" id="pdf-2501.01558" aria-labelledby="pdf-2501.01558">pdf</a>, <a href="https://arxiv.org/html/2501.01558v2" title="View HTML" id="html-2501.01558" aria-labelledby="html-2501.01558" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01558" title="Other formats" id="oth-2501.01558" aria-labelledby="oth-2501.01558">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Predicting the Performance of Black-box LLMs through Self-Queries </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sam,+D">Dylan Sam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Finzi,+M">Marc Finzi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolter,+J+Z">J. Zico Kolter</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> As large language models (LLMs) are increasingly relied on in AI systems, predicting when they make mistakes is crucial. While a great deal of work in the field uses internal representations to interpret model behavior, these representations are inaccessible when given solely black-box access through an API. 
In this paper, we extract features of LLMs in a black-box manner by using follow-up prompts and taking the probabilities of different responses as representations to train reliable predictors of model behavior. We demonstrate that training a linear model on these low-dimensional representations produces reliable and generalizable predictors of model performance at the instance level (e.g., if a particular generation correctly answers a question). Remarkably, these can often outperform white-box linear predictors that operate over a model's hidden state or the full distribution over its vocabulary. In addition, we demonstrate that these extracted features can be used to evaluate more nuanced aspects of a language model's state. For instance, they can be used to distinguish between a clean version of GPT-4o-mini and a version that has been influenced via an adversarial system prompt that answers question-answering tasks incorrectly or introduces bugs into generated code. Furthermore, they can reliably distinguish between different model architectures and sizes, enabling the detection of misrepresented models provided through an API (e.g., identifying if GPT-3.5 is supplied instead of GPT-4o-mini). 
</p> </div> </dd> <dt> <a name='item480'>[480]</a> <a href ="/abs/2501.05952" title="Abstract" id="2501.05952"> arXiv:2501.05952 </a> (replaced) [<a href="/pdf/2501.05952" title="Download PDF" id="pdf-2501.05952" aria-labelledby="pdf-2501.05952">pdf</a>, <a href="https://arxiv.org/html/2501.05952v2" title="View HTML" id="html-2501.05952" aria-labelledby="html-2501.05952" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.05952" title="Other formats" id="oth-2501.05952" aria-labelledby="oth-2501.05952">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scalable Vision Language Model Training via High Quality Data Curation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+H">Hongyuan Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+Z">Zijian Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+W">Weijie Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xiao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ran,+J">Jiao Ran</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> In this paper, we introduce SAIL-VL (ScAlable Vision Language Model TraIning via High QuaLity Data Curation), an open-source vision language model (VLM) series achieving state-of-the-art (SOTA) performance in 2B and 8B parameters. 
The following three key improvements contribute to SAIL-VL's leading performance: (1) Scalable high-quality visual understanding data construction: We implement a data construction pipeline to enable hundred-million-scale high-quality recaption data annotation, and the resulting dataset SAIL-Caption is validated to be of the highest data quality compared with opensource alternatives. (2) Scalable Pretraining with High-Quality Visual Understanding Data: We scale SAIL-VL's pretraining budget up to 655B tokens and show that even a 2B VLM benefits from scaled up training data sizes, exhibiting expected data size scaling laws in visual understanding and instruction following performance. (3) Scalable SFT via data quantity and complexity scaling: We curate a high-quality SFT dataset collection which outperforms opensource alternatives in data quantity scaling effectiveness. We also demonstrate that training with progressively higher-complexity data surpasses baseline one-stage training by a large margin. SAIL-VL series models achieve the highest average score in 18 widely used VLM benchmarks in our evaluation, with the 2B model taking the top position over VLMs of comparable sizes on OpenCompass 2024 (<a href="https://rank.opencompass.org.cn/leaderboard-multimodal" rel="external noopener nofollow" class="link-external link-https">this https URL</a>) demonstrating robust visual comprehension abilities. SAIL-VL series models are released at HuggingFace (<a href="https://huggingface.co/BytedanceDouyinContent" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item481'>[481]</a> <a href ="/abs/2501.10711" title="Abstract" id="2501.10711"> arXiv:2501.10711 </a> (replaced) [<a href="/pdf/2501.10711" title="Download PDF" id="pdf-2501.10711" aria-labelledby="pdf-2501.10711">pdf</a>, <a href="https://arxiv.org/html/2501.10711v3" title="View HTML" id="html-2501.10711" aria-labelledby="html-2501.10711" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.10711" title="Other formats" id="oth-2501.10711" aria-labelledby="oth-2501.10711">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Should We Build A Benchmark? Revisiting 274 Code-Related Benchmarks For LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jialun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+Y">Yuk-Kit Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zixuan Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shuqing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Mingwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+R">Ruixi Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Y">Yuting Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chaozheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Boxi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+P">Pinjia He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zibin Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+M+R">Michael R. 
Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheung,+S">Shing-Chi Cheung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 42 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Various benchmarks have been proposed to assess the performance of large language models (LLMs) in different coding scenarios. We refer to them as code-related benchmarks. However, there are no systematic guidelines by which such a benchmark should be developed to ensure its quality, reliability, and reproducibility. We propose How2Bench, which is comprised of a 55-criteria checklist as a set of guidelines to govern the development of code-related benchmarks comprehensively. Using HOW2BENCH, we profiled 274 benchmarks released within the past decade and found concerning issues. Nearly 70% of the benchmarks did not take measures for data quality assurance; over 10% did not even open source or only partially open source. Many highly cited benchmarks have loopholes, including duplicated samples, incorrect reference codes/tests/prompts, and unremoved sensitive/confidential information. Finally, we conducted a human study involving 49 participants, which revealed significant gaps in awareness of the importance of data quality, reproducibility, and transparency. 
</p> </div> </dd> <dt> <a name='item482'>[482]</a> <a href ="/abs/2501.14846" title="Abstract" id="2501.14846"> arXiv:2501.14846 </a> (replaced) [<a href="/pdf/2501.14846" title="Download PDF" id="pdf-2501.14846" aria-labelledby="pdf-2501.14846">pdf</a>, <a href="/format/2501.14846" title="Other formats" id="oth-2501.14846" aria-labelledby="oth-2501.14846">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wormhole Memory: A Rubik's Cube for Cross-Dialogue Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Libo Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The experimental process and code have been uploaded to the Github repository, the link is: <a href="https://github.com/brucewang123456789/GeniusTrail/tree/main/Wormhole%20Memory%20Module" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> In view of the gap in the current large language model in sharing memory across dialogues, this research proposes a wormhole memory module (WMM) to realize memory as a Rubik's cube that can be arbitrarily retrieved between different dialogues. Through simulation experiments, the researcher built an experimental framework based on the Python environment and used setting memory barriers to simulate the current situation where memories between LLMs dialogues are difficult to share. 
The CoQA development data set was imported into the experiment, and the feasibility of its cross-dialogue memory retrieval function was verified for WMM's nonlinear indexing and dynamic retrieval, and a comparative analysis was conducted with the capabilities of Titans and MemGPT memory modules. Experimental results show that WMM demonstrated the ability to retrieve memory across dialogues and the stability of quantitative indicators in eight experiments. It contributes new technical approaches to the optimization of memory management of LLMs and provides experience for the practical application in the future. </p> </div> </dd> <dt> <a name='item483'>[483]</a> <a href ="/abs/2501.15857" title="Abstract" id="2501.15857"> arXiv:2501.15857 </a> (replaced) [<a href="/pdf/2501.15857" title="Download PDF" id="pdf-2501.15857" aria-labelledby="pdf-2501.15857">pdf</a>, <a href="/format/2501.15857" title="Other formats" id="oth-2501.15857" aria-labelledby="oth-2501.15857">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Transformers Able to Reason by Connecting Separated Knowledge in Training Data? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yutong Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhaoran Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Humans exhibit remarkable compositional reasoning by integrating knowledge from various sources. 
For example, if someone learns ( B = f(A) ) from one source and ( C = g(B) ) from another, they can deduce ( C=g(B)=g(f(A)) ) even without encountering ( ABC ) together, showcasing the generalization ability of human intelligence. In this paper, we introduce a synthetic learning task, "FTCT" (Fragmented at Training, Chained at Testing), to validate the potential of Transformers in replicating this skill and interpret its inner mechanism. In the training phase, data consist of separated knowledge fragments from an overall causal graph. During testing, Transformers must infer complete causal graph traces by integrating these fragments. Our findings demonstrate that few-shot Chain-of-Thought prompting enables Transformers to perform compositional reasoning on FTCT by revealing correct combinations of fragments, even if such combinations were absent in the training data. Furthermore, the emergence of compositional reasoning ability is strongly correlated with the model complexity and training-testing data similarity. We propose, both theoretically and empirically, that Transformers learn an underlying generalizable program from training, enabling effective compositional reasoning during testing. 
</p> </div> </dd> <dt> <a name='item484'>[484]</a> <a href ="/abs/2501.16207" title="Abstract" id="2501.16207"> arXiv:2501.16207 </a> (replaced) [<a href="/pdf/2501.16207" title="Download PDF" id="pdf-2501.16207" aria-labelledby="pdf-2501.16207">pdf</a>, <a href="https://arxiv.org/html/2501.16207v2" title="View HTML" id="html-2501.16207" aria-labelledby="html-2501.16207" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.16207" title="Other formats" id="oth-2501.16207" aria-labelledby="oth-2501.16207">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Informal to Formal -- Incorporating and Evaluating LLMs on Natural Language Requirements to Verifiable Formal Proofs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jialun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Meiziniu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+H">Haoyang Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haokun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+M">Mengda He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+C">Cheng Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+S">Shengchao Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheung,+S">Shing-Chi Cheung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+C">Cong Tian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Programming Languages (cs.PL) </div> <p class='mathjax'> The research in AI-based formal mathematical reasoning has shown an unstoppable growth trend. These studies have excelled in mathematical competitions like IMO and have made significant progress. This paper focuses on formal verification, an immediate application scenario of formal reasoning, and breaks it down into sub-tasks. We constructed 18k high-quality instruction-response pairs across five formal specification languages (Coq, Lean4, Dafny, ACSL, and TLA+) by distilling gpt-4o and evaluated against ten open-sourced LLMs, including recent popular DeepSeek-R1. We also fine-tuned several 7~8B small models to achieve comparable performance with Deepseek-R1-671B. Interestingly, we observed that fine-tuning with formal data also enhances mathematics, reasoning, and coding capabilities. Fine-tuned models are released at https://huggingface.co/fm-universe. 
</p> </div> </dd> <dt> <a name='item485'>[485]</a> <a href ="/abs/2501.16344" title="Abstract" id="2501.16344"> arXiv:2501.16344 </a> (replaced) [<a href="/pdf/2501.16344" title="Download PDF" id="pdf-2501.16344" aria-labelledby="pdf-2501.16344">pdf</a>, <a href="https://arxiv.org/html/2501.16344v2" title="View HTML" id="html-2501.16344" aria-labelledby="html-2501.16344" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.16344" title="Other formats" id="oth-2501.16344" aria-labelledby="oth-2501.16344">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WhiSPA: Semantically and Psychologically Aligned Whisper with Self-Supervised Contrastive and Student-Teacher Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Rao,+R">Rajath Rao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ganesan,+A">Adithya Ganesan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kjell,+O">Oscar Kjell</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Luby,+J">Jonah Luby</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Raghavan,+A">Akshay Raghavan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Feltman,+S">Scott Feltman</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ringwald,+W">Whitney Ringwald</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Boyd,+R+L">Ryan L. Boyd</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Luft,+B">Benjamin Luft</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ruggero,+C">Camilo Ruggero</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ryant,+N">Neville Ryant</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kotov,+R">Roman Kotov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Schwartz,+H+A">H. 
Andrew Schwartz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 8 figures, ACL ARR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Sound (cs.SD) </div> <p class='mathjax'> Current speech encoding pipelines often rely on an additional text-based LM to get robust representations of human communication, even though SotA speech-to-text models often have a LM within. This work proposes an approach to improve the LM within an audio model such that the subsequent text-LM is unnecessary. We introduce WhiSPA (Whisper with Semantic and Psychological Alignment), which leverages a novel audio training objective: contrastive loss with a language model embedding as a teacher. Using over 500k speech segments from mental health audio interviews, we evaluate the utility of aligning Whisper's latent space with semantic representations from a text autoencoder (SBERT) and lexically derived embeddings of basic psychological dimensions: emotion and personality. Over self-supervised affective tasks and downstream psychological tasks, WhiSPA surpasses current speech encoders, achieving an average error reduction of 73.4% and 83.8%, respectively. WhiSPA demonstrates that it is not always necessary to run a subsequent text LM on speech-to-text output in order to get a rich psychological representation of human communication. 
</p> </div> </dd> <dt> <a name='item486'>[486]</a> <a href ="/abs/2502.00510" title="Abstract" id="2502.00510"> arXiv:2502.00510 </a> (replaced) [<a href="/pdf/2502.00510" title="Download PDF" id="pdf-2502.00510" aria-labelledby="pdf-2502.00510">pdf</a>, <a href="https://arxiv.org/html/2502.00510v2" title="View HTML" id="html-2502.00510" aria-labelledby="html-2502.00510" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00510" title="Other formats" id="oth-2502.00510" aria-labelledby="oth-2502.00510">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Who's the MVP? A Game-Theoretic Evaluation Benchmark for Modular Attribution in LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yingxuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+B">Bo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+S">Siyuan Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Haoyi Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yuxuan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Jinbo Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haoran Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Ziyi He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zongyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Lin Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xuezhi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Model (LLM) agents frameworks often employ modular architectures, incorporating components such as planning, reasoning, action execution, and reflection to tackle complex tasks. However, quantifying the contribution of each module to overall system performance remains a significant challenge, impeding optimization and interpretability. To address this, we introduce CapaBench (Capability-level Assessment Benchmark), an evaluation framework grounded in cooperative game theory's Shapley Value, which systematically measures the marginal impact of individual modules and their interactions within an agent's architecture. By replacing default modules with test variants across all possible combinations, CapaBench provides a principled method for attributing performance contributions. Key contributions include: (1) We are the first to propose a Shapley Value-based methodology for quantifying the contributions of capabilities in LLM agents; (2) Modules with high Shapley Values consistently lead to predictable performance gains when combined, enabling targeted optimization; and (3) We build a multi-round dataset of over 1,500 entries spanning diverse domains and practical task scenarios, enabling comprehensive evaluation of agent capabilities. CapaBench bridges the gap between component-level evaluation and holistic system assessment, providing actionable insights for optimizing modular LLM agents and advancing their deployment in complex, real-world scenarios. 
</p> </div> </dd> <dt> <a name='item487'>[487]</a> <a href ="/abs/2502.00691" title="Abstract" id="2502.00691"> arXiv:2502.00691 </a> (replaced) [<a href="/pdf/2502.00691" title="Download PDF" id="pdf-2502.00691" aria-labelledby="pdf-2502.00691">pdf</a>, <a href="https://arxiv.org/html/2502.00691v2" title="View HTML" id="html-2502.00691" aria-labelledby="html-2502.00691" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00691" title="Other formats" id="oth-2502.00691" aria-labelledby="oth-2502.00691">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning Autonomous Code Integration for Math Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haozhe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Long Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+F">Fengming Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Weidi Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+W">Wei Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+F">Fangzhen Lin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advances in mathematical problem-solving with language models (LMs) integrate chain-of-thought (CoT) reasoning and code execution to harness their complementary strengths. 
However, existing hybrid frameworks exhibit a critical limitation: they depend on externally dictated instructions or rigid code-integration templates, lacking metacognitive awareness -- the capacity to dynamically evaluate intrinsic capabilities and autonomously determine when and how to integrate tools. This rigidity motivates our study of autonomous code integration, enabling models to adapt tool-usage strategies as their reasoning abilities evolve during training. <br>While reinforcement learning (RL) shows promise for boosting LLM reasoning at scale (e.g., DeepSeek-R1), we demonstrate its inefficiency in learning autonomous code integration due to inadequate exploration of the vast combinatorial space of CoT-code interleaving patterns. To address this challenge, we propose a novel Expectation-Maximization (EM) framework that synergizes structured exploration (E-step) with off-policy RL optimization (M-step), creating a self-reinforcing cycle between metacognitive tool-use decisions and evolving capabilities. Experiments reveal our method achieves superior results through improved exploration. Notably, our 7B model improves over 11% on MATH500 and 9.4% on AIME without o1-like CoT. 
</p> </div> </dd> <dt> <a name='item488'>[488]</a> <a href ="/abs/2502.02904" title="Abstract" id="2502.02904"> arXiv:2502.02904 </a> (replaced) [<a href="/pdf/2502.02904" title="Download PDF" id="pdf-2502.02904" aria-labelledby="pdf-2502.02904">pdf</a>, <a href="https://arxiv.org/html/2502.02904v3" title="View HTML" id="html-2502.02904" aria-labelledby="html-2502.02904" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.02904" title="Other formats" id="oth-2502.02904" aria-labelledby="oth-2502.02904">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ScholaWrite: A Dataset of End-to-End Scholarly Writing Process </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Linghe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+M">Minhwa Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Volkov,+R">Ross Volkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chau,+L+T">Luan Tuyen Chau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+D">Dongyeop Kang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Equal contribution: Linghe Wang, Minhwa Lee | project page: <a href="https://minnesotanlp.github.io/scholawrite/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL); Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Writing is a cognitively demanding task involving continuous decision-making, heavy use of working memory, and frequent switching between multiple activities. Scholarly writing is particularly complex as it requires authors to coordinate many pieces of multiform knowledge. 
To fully understand writers' cognitive thought process, one should fully decode the end-to-end writing data (from individual ideas to final manuscript) and understand their complex cognitive mechanisms in scholarly writing. We introduce ScholaWrite dataset, a first-of-its-kind keystroke corpus of an end-to-end scholarly writing process for complete manuscripts, with thorough annotations of cognitive writing intentions behind each keystroke. Our dataset includes LaTeX-based keystroke data from five preprints with nearly 62K total text changes and annotations across 4 months of paper writing. ScholaWrite shows promising usability and applications (e.g., iterative self-writing), demonstrating the importance of collection of end-to-end writing data, rather than the final manuscript, for the development of future writing assistants to support the cognitive thinking process of scientists. Our de-identified data examples and code are available on our project page. </p> </div> </dd> <dt> <a name='item489'>[489]</a> <a href ="/abs/2502.05171" title="Abstract" id="2502.05171"> arXiv:2502.05171 </a> (replaced) [<a href="/pdf/2502.05171" title="Download PDF" id="pdf-2502.05171" aria-labelledby="pdf-2502.05171">pdf</a>, <a href="https://arxiv.org/html/2502.05171v2" title="View HTML" id="html-2502.05171" aria-labelledby="html-2502.05171" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05171" title="Other formats" id="oth-2502.05171" aria-labelledby="oth-2502.05171">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Geiping,+J">Jonas Geiping</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McLeish,+S">Sean McLeish</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+N">Neel Jain</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kirchenbauer,+J">John Kirchenbauer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+S">Siddharth Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartoldson,+B+R">Brian R. Bartoldson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kailkhura,+B">Bhavya Kailkhura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhatele,+A">Abhinav Bhatele</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goldstein,+T">Tom Goldstein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The model is available at <a href="https://huggingface.co/tomg-group-umd/huginn-0125" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. Code and data recipe can be found at <a href="https://github.com/seal-rg/recurrent-pretraining" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We study a novel language model architecture that is capable of scaling test-time computation by implicitly reasoning in latent space. Our model works by iterating a recurrent block, thereby unrolling to arbitrary depth at test-time. This stands in contrast to mainstream reasoning models that scale up compute by producing more tokens. Unlike approaches based on chain-of-thought, our approach does not require any specialized training data, can work with small context windows, and can capture types of reasoning that are not easily represented in words. We scale a proof-of-concept model to 3.5 billion parameters and 800 billion tokens. 
We show that the resulting model can improve its performance on reasoning benchmarks, sometimes dramatically, up to a computation load equivalent to 50 billion parameters. </p> </div> </dd> <dt> <a name='item490'>[490]</a> <a href ="/abs/2502.08557" title="Abstract" id="2502.08557"> arXiv:2502.08557 </a> (replaced) [<a href="/pdf/2502.08557" title="Download PDF" id="pdf-2502.08557" aria-labelledby="pdf-2502.08557">pdf</a>, <a href="https://arxiv.org/html/2502.08557v2" title="View HTML" id="html-2502.08557" aria-labelledby="html-2502.08557" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08557" title="Other formats" id="oth-2502.08557" aria-labelledby="oth-2502.08557">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QA-Expand: Multi-Question Answer Generation for Enhanced Query Expansion in Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+W">Wonduk Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seunghyun Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Query expansion is widely used in Information Retrieval (IR) to improve search outcomes by enriching queries with additional contextual information. Although recent Large Language Model (LLM) based methods generate pseudo-relevant content and expanded terms via multiple prompts, they often yield repetitive, narrow expansions that lack the diverse context needed to retrieve all relevant information. In this paper, we introduce QA-Expand, a novel and effective framework for query expansion. 
It first generates multiple relevant questions from the initial query and subsequently produces corresponding pseudo-answers as surrogate documents. A feedback model further rewrites and filters these answers to ensure only the most informative augmentations are incorporated. Extensive experiments on benchmarks such as BEIR and TREC demonstrate that QA-Expand enhances retrieval performance by up to 13% over state-of-the-art methods, offering a robust solution for modern retrieval challenges. </p> </div> </dd> <dt> <a name='item491'>[491]</a> <a href ="/abs/2502.09782" title="Abstract" id="2502.09782"> arXiv:2502.09782 </a> (replaced) [<a href="/pdf/2502.09782" title="Download PDF" id="pdf-2502.09782" aria-labelledby="pdf-2502.09782">pdf</a>, <a href="/format/2502.09782" title="Other formats" id="oth-2502.09782" aria-labelledby="oth-2502.09782">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Acoustic Side-Channel Attacks on Keyboards Using Transformers and Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+J+H">Jin Hyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ayati,+S+A">Seyyed Ali Ayati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+Y">Yichen Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We will reflect comments from the reviewers and re-submit </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The increasing prevalence of microphones in everyday devices and the growing reliance on online services have amplified the risk of acoustic side-channel attacks (ASCAs) targeting keyboards. 
This study explores deep learning techniques, specifically vision transformers (VTs) and large language models (LLMs), to enhance the effectiveness and applicability of such attacks. We present substantial improvements over prior research, with the CoAtNet model achieving state-of-the-art performance. Our CoAtNet shows a 5.0% improvement for keystrokes recorded via smartphone (Phone) and 5.9% for those recorded via Zoom compared to previous benchmarks. We also evaluate transformer architectures and language models, with the best VT model matching CoAtNet's performance. A key advancement is the introduction of a noise mitigation method for real-world scenarios. By using LLMs for contextual understanding, we detect and correct erroneous keystrokes in noisy environments, enhancing ASCA performance. Additionally, fine-tuned lightweight language models with Low-Rank Adaptation (LoRA) deliver comparable performance to heavyweight models with 67X more parameters. This integration of VTs and LLMs improves the practical applicability of ASCA mitigation, marking the first use of these technologies to address ASCAs and error correction in real-world scenarios. 
</p> </div> </dd> <dt> <a name='item492'>[492]</a> <a href ="/abs/2502.09969" title="Abstract" id="2502.09969"> arXiv:2502.09969 </a> (replaced) [<a href="/pdf/2502.09969" title="Download PDF" id="pdf-2502.09969" aria-labelledby="pdf-2502.09969">pdf</a>, <a href="https://arxiv.org/html/2502.09969v2" title="View HTML" id="html-2502.09969" aria-labelledby="html-2502.09969" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09969" title="Other formats" id="oth-2502.09969" aria-labelledby="oth-2502.09969">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Data Valuation using Neural Networks for Efficient Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+I">Ishika Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Influence functions provide crucial insights into model training, but existing methods suffer from large computational costs and limited generalization. Particularly, recent works have proposed various metrics and algorithms to calculate the influence of data using language models, which do not scale well with large models and datasets. This is because of the expensive forward and backward passes required for computation, substantial memory requirements to store large models, and poor generalization of influence estimates to new data. In this paper, we explore the use of small neural networks -- which we refer to as the InfluenceNetwork -- to estimate influence values, achieving up to 99% cost reduction. 
Our evaluation demonstrates that influence values can be estimated with models just 0.0027% the size of full language models (we use 7B and 8B versions). We apply our algorithm of estimating influence values (called NN-CIFT: Neural Networks for effiCient Instruction Fine-Tuning) to the downstream task of subset selection for general instruction fine-tuning. In our study, we include four state-of-the-art influence functions and show no compromise in performance, despite large speedups, between NN-CIFT and the original influence functions. We provide an in-depth hyperparameter analysis of NN-CIFT. The code for our method can be found here: <a href="https://github.com/agarwalishika/NN-CIFT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item493'>[493]</a> <a href ="/abs/2502.10248" title="Abstract" id="2502.10248"> arXiv:2502.10248 </a> (replaced) [<a href="/pdf/2502.10248" title="Download PDF" id="pdf-2502.10248" aria-labelledby="pdf-2502.10248">pdf</a>, <a href="https://arxiv.org/html/2502.10248v2" title="View HTML" id="html-2502.10248" aria-labelledby="html-2502.10248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10248" title="Other formats" id="oth-2502.10248" aria-labelledby="oth-2502.10248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+G">Guoqing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haoyang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+K">Kun Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liangyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+N">Nan Duan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+S">Shengming Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xiaoniu Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xing Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Deshan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+D">Deyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+K">Kaijun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+W">Wei Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiling Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Wen Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xin Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yanan Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zheng Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+A">Aojie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+B">Bizhu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Brian Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Changxing Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenfei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+C">Chenguang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Gang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+G">Ge Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gulin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+H">Haiyang Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nie,+H">Hao Nie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanqi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+H">Haolong Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+H">Hongcheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Huilin Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Huixin Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+J">Jiahao Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jianchang Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiashuai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiashuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junjing Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+L">Liguo Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+L">Liwen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+L">Liying Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Muhua Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiaohui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Q">Qinglin He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Q">Qiuyan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Q">Quan Sun</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+R">Ran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Rui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+W">Weipeng Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wenqing He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+X">Xianfang Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaojia Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+Y">Yaqi Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yanbo Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+Y">Yineng Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yilei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yuanwei Lu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yu Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuchu Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 36 pages, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at <a href="https://github.com/stepfun-ai/Step-Video-T2V" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
The online version can be accessed from <a href="https://yuewen.cn/videos" rel="external noopener nofollow" class="link-external link-https">this https URL</a> as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. </p> </div> </dd> </dl> <div class='paging'>Total of 493 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here 
to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 
23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>