CINXE.COM
Computation and Language
<!DOCTYPE html> <html lang="en"> <head> <title>Computation and Language </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script src="/static/browse/0.3.4/js/accordion.js"></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/cs.CL/recent">cs.CL</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Computation and Language</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item147">Cross-lists</a></li> <li><a href="#item177">Replacements</a></li> </ul> <p>See <a id="recent-cs.CL" aria-labelledby="recent-cs.CL" href="/list/cs.CL/recent">recent</a> articles</p> <h3>Showing new listings for Wednesday, 19 February 2025</h3> <div class='paging'>Total of 293 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 146 of 146 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.12183" title="Abstract" id="2502.12183"> 
arXiv:2502.12183 </a> [<a href="/pdf/2502.12183" title="Download PDF" id="pdf-2502.12183" aria-labelledby="pdf-2502.12183">pdf</a>, <a href="/format/2502.12183" title="Other formats" id="oth-2502.12183" aria-labelledby="oth-2502.12183">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging large language models for structured information extraction from pathology reports </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Balasubramanian,+J+B">Jeya Balaji Balasubramanian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adams,+D">Daniel Adams</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Roxanis,+I">Ioannis Roxanis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=de+Gonzalez,+A+B">Amy Berrington de Gonzalez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Coulson,+P">Penny Coulson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Almeida,+J+S">Jonas S. Almeida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garc%C3%ADa-Closas,+M">Montserrat García-Closas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 29 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Background: Structured information extraction from unstructured histopathology reports facilitates data accessibility for clinical research. Manual extraction by experts is time-consuming and expensive, limiting scalability. Large language models (LLMs) offer efficient automated extraction through zero-shot prompting, requiring only natural language instructions without labeled data or training. 
We evaluate LLMs' accuracy in extracting structured information from breast cancer histopathology reports, compared to manual extraction by a trained human annotator. <br>Methods: We developed the Medical Report Information Extractor, a web application leveraging LLMs for automated extraction. We developed a gold standard extraction dataset to evaluate the human annotator alongside five LLMs including GPT-4o, a leading proprietary model, and the Llama 3 model family, which allows self-hosting for data privacy. Our assessment involved 111 histopathology reports from the Breast Cancer Now (BCN) Generations Study, extracting 51 pathology features specified in the study's data dictionary. <br>Results: Evaluation against the gold standard dataset showed that both Llama 3.1 405B (94.7% accuracy) and GPT-4o (96.1%) achieved extraction accuracy comparable to the human annotator (95.4%; p = 0.146 and p = 0.106, respectively). While Llama 3.1 70B (91.6%) performed below human accuracy (p <0.001), its reduced computational requirements make it a viable option for self-hosting. <br>Conclusion: We developed an open-source tool for structured information extraction that can be customized by non-programmers using natural language. Its modular design enables reuse for various extraction tasks, producing standardized, structured data from unstructured text reports to facilitate analytics through improved accessibility and interoperability. 
</p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2502.12185" title="Abstract" id="2502.12185"> arXiv:2502.12185 </a> [<a href="/pdf/2502.12185" title="Download PDF" id="pdf-2502.12185" aria-labelledby="pdf-2502.12185">pdf</a>, <a href="/format/2502.12185" title="Other formats" id="oth-2502.12185" aria-labelledby="oth-2502.12185">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models for Extrapolative Modeling of Manufacturing Processes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khanghah,+K+N">Kiarash Naghavi Khanghah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patel,+A">Anandkumar Patel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Malhotra,+R">Rajiv Malhotra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Hongyi Xu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Conventional predictive modeling of parametric relationships in manufacturing processes is limited by the subjectivity of human expertise and intuition on the one hand and by the cost and time of experimental data generation on the other hand. This work addresses this issue by establishing a new Large Language Model (LLM) framework. The novelty lies in combining automatic extraction of process-relevant knowledge embedded in the literature with iterative model refinement based on a small amount of experimental data. This approach is evaluated on three distinct manufacturing processes that are based on machining, deformation, and additive principles. 
The results show that for the same small experimental data budget the models derived by our framework have unexpectedly high extrapolative performance, often surpassing the capabilities of conventional Machine Learning. Further, our approach eliminates manual generation of initial models or expertise-dependent interpretation of the literature. The results also reveal the importance of the nature of the knowledge extracted from the literature and the significance of both the knowledge extraction and model refinement components. </p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2502.12187" title="Abstract" id="2502.12187"> arXiv:2502.12187 </a> [<a href="/pdf/2502.12187" title="Download PDF" id="pdf-2502.12187" aria-labelledby="pdf-2502.12187">pdf</a>, <a href="/format/2502.12187" title="Other formats" id="oth-2502.12187" aria-labelledby="oth-2502.12187">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hallucinations are inevitable but statistically negligible </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Suzuki,+A">Atsushi Suzuki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yulan He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+F">Feng Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhongyuan Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Formal Languages and Automata Theory (cs.FL); Machine Learning (cs.LG); Statistics Theory (math.ST); Machine Learning (stat.ML) </div> <p class='mathjax'> Hallucinations, a phenomenon where a language model (LM) generates nonfactual content, pose a significant challenge to the practical deployment of LMs. 
While many empirical methods have been proposed to mitigate hallucinations, a recent study established a computability-theoretic result showing that any LM will inevitably generate hallucinations on an infinite set of inputs, regardless of the quality and quantity of training datasets and the choice of the language model architecture and training and inference algorithms. Although the computability-theoretic result may seem pessimistic, its significance in practical viewpoints has remained unclear. In contrast, we present a positive theoretical result from a probabilistic perspective. Specifically, we prove that hallucinations can be made statistically negligible, provided that the quality and quantity of the training data are sufficient. Interestingly, our positive result coexists with the computability-theoretic result, implying that while hallucinations on an infinite set of inputs cannot be entirely eliminated, their probability can always be reduced by improving algorithms and training data. By evaluating the two seemingly contradictory results through the lens of information theory, we argue that our probability-theoretic positive result better reflects practical considerations than the computability-theoretic negative result. 
</p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2502.12189" title="Abstract" id="2502.12189"> arXiv:2502.12189 </a> [<a href="/pdf/2502.12189" title="Download PDF" id="pdf-2502.12189" aria-labelledby="pdf-2502.12189">pdf</a>, <a href="https://arxiv.org/html/2502.12189v1" title="View HTML" id="html-2502.12189" aria-labelledby="html-2502.12189" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12189" title="Other formats" id="oth-2502.12189" aria-labelledby="oth-2502.12189">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-supervised Attribute-aware Dynamic Preference Ranking Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongyu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Q">Qi Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=hu,+Z">Zhenhua hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Rui Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Reinforcement Learning from Human Feedback and its variants excel in aligning with human intentions to generate helpful, harmless, and honest responses. However, most of them rely on costly human-annotated pairwise comparisons for supervised alignment, which is not suitable for list-level scenarios, such as community question answering. Additionally, human preferences are influenced by multiple intrinsic factors in responses, leading to decision-making inconsistencies. Therefore, we propose \textbf{Se}lf-supervised \textbf{A}ttribute-aware \textbf{d}ynamic \textbf{p}reference \textbf{ra}nking, called SeAdpra. 
It quantifies preference differences between responses based on Attribute-Perceptual Distance Factors (APDF) and dynamically determines the list-wise alignment order. Furthermore, it achieves fine-grained preference difference learning and enables precise alignment with the optimal one. We specifically constructed a challenging code preference dataset named StaCoCoQA, and introduced more cost-effective and scalable preference evaluation metrics: PrefHit and PrefRecall. Extensive experimental results show that SeAdpra exhibits superior performance and generalizability on both StaCoCoQA and preference datasets from eight popular domains. </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2502.12193" title="Abstract" id="2502.12193"> arXiv:2502.12193 </a> [<a href="/pdf/2502.12193" title="Download PDF" id="pdf-2502.12193" aria-labelledby="pdf-2502.12193">pdf</a>, <a href="https://arxiv.org/html/2502.12193v1" title="View HTML" id="html-2502.12193" aria-labelledby="html-2502.12193" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12193" title="Other formats" id="oth-2502.12193" aria-labelledby="oth-2502.12193">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AI and the Law: Evaluating ChatGPT's Performance in Legal Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Weichbroth,+P">Pawel Weichbroth</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages; 1 figure; 2 tables; 32 references </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The use of ChatGPT to analyze and classify evidence in criminal proceedings has been a topic of ongoing discussion. 
However, to the best of our knowledge, this issue has not been studied in the context of the Polish language. This study addresses this research gap by evaluating the effectiveness of ChatGPT in classifying legal cases under the Polish Penal Code. The results show excellent binary classification accuracy, with all positive and negative cases correctly categorized. In addition, a qualitative evaluation confirms that the legal basis provided for each case, along with the relevant legal content, was appropriate. The results obtained suggest that ChatGPT can effectively analyze and classify evidence while applying the appropriate legal rules. In conclusion, ChatGPT has the potential to assist interested parties in the analysis of evidence and serve as a valuable legal resource for individuals with less experience or knowledge in this area. </p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2502.12197" title="Abstract" id="2502.12197"> arXiv:2502.12197 </a> [<a href="/pdf/2502.12197" title="Download PDF" id="pdf-2502.12197" aria-labelledby="pdf-2502.12197">pdf</a>, <a href="/format/2502.12197" title="Other formats" id="oth-2502.12197" aria-labelledby="oth-2502.12197">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Closer Look at System Prompt Robustness </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mu,+N">Norman Mu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jonathan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lavery,+M">Michael Lavery</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wagner,+D">David Wagner</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Artifacts: <a href="https://github.com/normster/RealGuardrails" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> System prompts have emerged as a critical control surface for specifying the behavior of LLMs in chat and agent settings. Developers depend on system prompts to specify important context, output format, personalities, guardrails, content policies, and safety countermeasures, all of which require models to robustly adhere to the system prompt, especially when facing conflicting or adversarial user inputs. In practice, models often forget to consider relevant guardrails or fail to resolve conflicting demands between the system and the user. In this work, we study various methods for improving system prompt robustness by creating realistic new evaluation and fine-tuning datasets based on prompts collected from OpenAI's GPT Store and HuggingFace's HuggingChat. Our experiments assessing models with a panel of new and existing benchmarks show that performance can be considerably improved with realistic fine-tuning data, as well as inference-time interventions such as classifier-free guidance. Finally, we analyze the results of recently released reasoning models from OpenAI and DeepSeek, which show exciting but uneven improvements on the benchmarks we study. Overall, current techniques fall short of ensuring system prompt robustness and further study is warranted. 
</p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2502.12200" title="Abstract" id="2502.12200"> arXiv:2502.12200 </a> [<a href="/pdf/2502.12200" title="Download PDF" id="pdf-2502.12200" aria-labelledby="pdf-2502.12200">pdf</a>, <a href="https://arxiv.org/html/2502.12200v1" title="View HTML" id="html-2502.12200" aria-labelledby="html-2502.12200" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12200" title="Other formats" id="oth-2502.12200" aria-labelledby="oth-2502.12200">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient and Effective Prompt Tuning via Prompt Decomposition and Compressed Outer Product </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+P">Pengxiang Lan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haoyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+E">Enneng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yuliang Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+G">Guibing Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jianzhe Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xingwei Wang</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> NAACL 2025 main conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Prompt tuning (PT) offers a cost-effective alternative to fine-tuning large-scale pre-trained language models (PLMs), requiring only a few parameters in soft prompt tokens added before the input text. 
However, existing PT approaches face two significant issues: (i) They overlook intrinsic semantic associations between soft prompt tokens, leading to high discreteness and limited interactions, thus reducing the model's comprehension and effectiveness in complex tasks. (ii) Due to the complexity of downstream tasks, long soft prompt is necessitated to improve performance, but prompt length correlates positively with memory usage and computational costs. Achieving high efficiency and performance remains an ongoing challenge. To address these issues, we propose a novel Low-parameters prompt tuning (LAMP) method, which leverages prompt decomposition and compressed outer product. Specifically, the prompt decomposition module employs Truncated SVD to reduce training parameters and significantly lower the dimensionality of the soft prompt parameter space. It then utilizes a compressed outer product module to facilitate multiple interactions among prompt tokens, exploring their intrinsic associations to enhance knowledge representation. Finally, LAMP uses average pooling to reduce memory usage and training/inference time. Extensive experiments across six architectures and eight datasets demonstrate that LAMP outperforms state-of-the-art PT-based and LoRA-based methods in performance and efficiency. 
</p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2502.12202" title="Abstract" id="2502.12202"> arXiv:2502.12202 </a> [<a href="/pdf/2502.12202" title="Download PDF" id="pdf-2502.12202" aria-labelledby="pdf-2502.12202">pdf</a>, <a href="https://arxiv.org/html/2502.12202v1" title="View HTML" id="html-2502.12202" aria-labelledby="html-2502.12202" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12202" title="Other formats" id="oth-2502.12202" aria-labelledby="oth-2502.12202">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BoT: Breaking Long Thought Processes of o1-like Large Language Models through Backdoor Attack </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zihao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongbao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mingda Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Ruotong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+G">Guanzong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Ke Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Baoyuan Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Longer thought, better performance: large language models with deep reasoning capabilities, particularly o1-like models, have demonstrated remarkable performance by generating extensive thought processes during inference. This trade-off reveals a potential vulnerability: adversaries could compromise model performance by forcing immediate responses without thought processes. 
To this end, in this paper, we introduce a novel attack scenario targeting the long thought processes of o1-like models and propose BoT (Break CoT), which can selectively break intrinsic reasoning mechanisms through backdoor attacks. BoT constructs poisoned datasets with designed triggers and injects backdoor by either supervised fine-tuning or direct preference optimization. When triggered, the model directly generates answers without thought processes, while maintaining normal reasoning capabilities for clean inputs. Extensive experiments on open-source o1-like models, including recent DeepSeek-R1, demonstrate that BoT nearly achieves high attack success rates while maintaining clean accuracy, highlighting the critical safety risk in current models. Furthermore, the relationship between task difficulty and helpfulness reveals a potential application for good, enabling users to customize model behavior based on task complexity. Code is available at \href{<a href="https://github.com/zihao-ai/BoT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/zihao-ai/BoT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2502.12204" title="Abstract" id="2502.12204"> arXiv:2502.12204 </a> [<a href="/pdf/2502.12204" title="Download PDF" id="pdf-2502.12204" aria-labelledby="pdf-2502.12204">pdf</a>, <a href="https://arxiv.org/html/2502.12204v1" title="View HTML" id="html-2502.12204" aria-labelledby="html-2502.12204" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12204" title="Other formats" id="oth-2502.12204" aria-labelledby="oth-2502.12204">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Predicting Depression in Screening Interviews from Interactive Multi-Theme Collaboration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xianbing Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yiqing Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Di Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+B">Buzhou Tang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Automatic depression detection provides cues for early clinical intervention by clinicians. Clinical interviews for depression detection involve dialogues centered around multiple themes. Existing studies primarily design end-to-end neural network models to capture the hierarchical structure of clinical interview dialogues. However, these methods exhibit defects in modeling the thematic content of clinical interviews: 1) they fail to capture intra-theme and inter-theme correlation explicitly, and 2) they do not allow clinicians to intervene and focus on themes of interest. To address these issues, this paper introduces an interactive depression detection framework. 
This framework leverages in-context learning techniques to identify themes in clinical interviews and then models both intra-theme and inter-theme correlation. Additionally, it employs AI-driven feedback to simulate the interests of clinicians, enabling interactive adjustment of theme importance. PDIMC achieves absolute improvements of 35\% and 12\% compared to the state-of-the-art on the depression detection dataset DAIC-WOZ, which demonstrates the effectiveness of modeling theme correlation and incorporating interactive external feedback. </p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2502.12210" title="Abstract" id="2502.12210"> arXiv:2502.12210 </a> [<a href="/pdf/2502.12210" title="Download PDF" id="pdf-2502.12210" aria-labelledby="pdf-2502.12210">pdf</a>, <a href="/format/2502.12210" title="Other formats" id="oth-2502.12210" aria-labelledby="oth-2502.12210">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Frame Detection with Retrieval Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Diallo,+P+A+K+K">Papa Abdou Karim Karou Diallo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zouaq,+A">Amal Zouaq</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advancements in Natural Language Processing have significantly improved the extraction of structured semantic representations from unstructured text, especially through Frame Semantic Role Labeling (FSRL). Despite this progress, the potential of Retrieval-Augmented Generation (RAG) models for frame detection remains under-explored. 
In this paper, we present the first RAG-based approach for frame detection called RCIF (Retrieve Candidates and Identify Frames). RCIF is also the first approach to operate without the need for explicit target span and comprises three main stages: (1) generation of frame embeddings from various representations; (2) retrieval of candidate frames given an input text; and (3) identification of the most suitable frames. We conducted extensive experiments across multiple configurations, including zero-shot, few-shot, and fine-tuning settings. Our results show that our retrieval component significantly reduces the complexity of the task by narrowing the search space, thus allowing the frame identifier to refine and complete the set of candidates. Our approach achieves state-of-the-art performance on FrameNet 1.5 and 1.7, demonstrating its robustness in scenarios where only raw text is provided. Furthermore, we leverage the structured representation obtained through this method as a proxy to enhance generalization across lexical variations in the task of translating natural language questions into SPARQL queries. 
</p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2502.12214" title="Abstract" id="2502.12214"> arXiv:2502.12214 </a> [<a href="/pdf/2502.12214" title="Download PDF" id="pdf-2502.12214" aria-labelledby="pdf-2502.12214">pdf</a>, <a href="https://arxiv.org/html/2502.12214v1" title="View HTML" id="html-2502.12214" aria-labelledby="html-2502.12214" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12214" title="Other formats" id="oth-2502.12214" aria-labelledby="oth-2502.12214">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Zero Token-Driven Deep Thinking in LLMs: Unlocking the Full Potential of Existing Parameters via Cyclic Refinement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guanghao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+W">Wenhao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+L">Li Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+M">Ming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+C">Chun Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Resource limitations often constrain the parameter counts of Large Language Models (LLMs), hindering their performance. While existing methods employ parameter sharing to reuse the same parameter set under fixed budgets, such approaches typically force each layer to assume multiple roles with a predetermined number of iterations, restricting efficiency and adaptability. In this work, we propose the Zero Token Transformer (ZTT), which features a head-tail decoupled parameter cycling method. 
We disentangle the first (head) and last (tail) layers from parameter cycling and iteratively refine only the intermediate layers. Furthermore, we introduce a Zero-Token Mechanism, an internal architectural component rather than an input token, to guide layer-specific computation. At each cycle, the model retrieves a zero token (with trainable key values) from a Zero-Token Pool, integrating it alongside regular tokens in the attention mechanism. The corresponding attention scores not only reflect each layer's computational importance but also enable dynamic early exits without sacrificing overall model accuracy. Our approach achieves superior performance under tight parameter budgets, effectively reduces computational overhead via early exits, and can be readily applied to fine-tune existing pre-trained models for enhanced efficiency and adaptability. </p> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2502.12223" title="Abstract" id="2502.12223"> arXiv:2502.12223 </a> [<a href="/pdf/2502.12223" title="Download PDF" id="pdf-2502.12223" aria-labelledby="pdf-2502.12223">pdf</a>, <a href="/format/2502.12223" title="Other formats" id="oth-2502.12223" aria-labelledby="oth-2502.12223">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GLoT: A Novel Gated-Logarithmic Transformer for Efficient Sign Language Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shahin,+N">Nada Shahin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ismail,+L">Leila Ismail</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Machine Translation has played a critical role in reducing language barriers, but its adaptation for Sign Language Machine Translation (SLMT) has been less 
explored. Existing works on SLMT mostly use the Transformer neural network which exhibits low performance due to the dynamic nature of the sign language. In this paper, we propose a novel Gated-Logarithmic Transformer (GLoT) that captures the long-term temporal dependencies of the sign language as a time-series data. We perform a comprehensive evaluation of GLoT with the transformer and transformer-fusion models as a baseline, for Sign-to-Gloss-to-Text translation. Our results demonstrate that GLoT consistently outperforms the other models across all metrics. These findings underscore its potential to address the communication challenges faced by the Deaf and Hard of Hearing community. </p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2502.12257" title="Abstract" id="2502.12257"> arXiv:2502.12257 </a> [<a href="/pdf/2502.12257" title="Download PDF" id="pdf-2502.12257" aria-labelledby="pdf-2502.12257">pdf</a>, <a href="https://arxiv.org/html/2502.12257v1" title="View HTML" id="html-2502.12257" aria-labelledby="html-2502.12257" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12257" title="Other formats" id="oth-2502.12257" aria-labelledby="oth-2502.12257">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InfoQuest: Evaluating Multi-Turn Dialogue Agents for Open-Ended Conversations with Hidden Context </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=de+Oliveira,+B+L+M">Bryan L. M. de Oliveira</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+L+G+B">Luana G. B. Martins</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brand%C3%A3o,+B">Bruno Brandão</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Melo,+L+C">Luckeciano C. 
Melo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> While large language models excel at following explicit instructions, they often struggle with ambiguous or incomplete user requests, defaulting to verbose, generic responses rather than seeking clarification. We introduce InfoQuest, a multi-turn chat benchmark designed to evaluate how dialogue agents handle hidden context in open-ended user requests. The benchmark presents intentionally ambiguous scenarios that require models to engage in information-seeking dialogue through clarifying questions before providing appropriate responses. Our evaluation of both open and closed-source models reveals that while proprietary models generally perform better, all current assistants struggle with effectively gathering critical information, often requiring multiple turns to infer user intent and frequently defaulting to generic responses without proper clarification. We provide a systematic methodology for generating diverse scenarios and evaluating models' information-seeking capabilities, offering insights into the current limitations of language models in handling ambiguous requests through multi-turn interactions. 
</p> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2502.12276" title="Abstract" id="2502.12276"> arXiv:2502.12276 </a> [<a href="/pdf/2502.12276" title="Download PDF" id="pdf-2502.12276" aria-labelledby="pdf-2502.12276">pdf</a>, <a href="/format/2502.12276" title="Other formats" id="oth-2502.12276" aria-labelledby="oth-2502.12276">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Story Grammar Semantic Matching for Literary Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Swenor,+A">Abigail Swenor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Coffee,+N">Neil Coffee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Scheirer,+W">Walter Scheirer</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to Journal of Computational Literary Studies </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In Natural Language Processing (NLP), semantic matching algorithms have traditionally relied on the feature of word co-occurrence to measure semantic similarity. While this feature approach has proven valuable in many contexts, its simplistic nature limits its analytical and explanatory power when used to understand literary texts. To address these limitations, we propose a more transparent approach that makes use of story structure and related elements. Using a BERT language model pipeline, we label prose and epic poetry with story element labels and perform semantic matching by only considering these labels as features. This new method, Story Grammar Semantic Matching, guides literary scholars to allusions and other semantic similarities across texts in a way that allows for characterizing patterns and literary technique. 
</p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2502.12289" title="Abstract" id="2502.12289"> arXiv:2502.12289 </a> [<a href="/pdf/2502.12289" title="Download PDF" id="pdf-2502.12289" aria-labelledby="pdf-2502.12289">pdf</a>, <a href="https://arxiv.org/html/2502.12289v1" title="View HTML" id="html-2502.12289" aria-labelledby="html-2502.12289" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12289" title="Other formats" id="oth-2502.12289" aria-labelledby="oth-2502.12289">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating Step-by-step Reasoning Traces: A Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jinu Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hockenmaier,+J">Julia Hockenmaier</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages (8 pages of main content), 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Step-by-step reasoning is widely used to enhance the reasoning ability of large language models (LLMs) in complex problems. Evaluating the quality of reasoning traces is crucial for understanding and improving LLM reasoning. However, the evaluation criteria remain highly unstandardized, leading to fragmented efforts in developing metrics and meta-evaluation benchmarks. To address this gap, this survey provides a comprehensive overview of step-by-step reasoning evaluation, proposing a taxonomy of evaluation criteria with four top-level categories (groundedness, validity, coherence, and utility). We then categorize metrics based on their implementations, survey which metrics are used for assessing each criterion, and explore whether evaluator models can transfer across different criteria. 
Finally, we identify key directions for future research. </p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2502.12301" title="Abstract" id="2502.12301"> arXiv:2502.12301 </a> [<a href="/pdf/2502.12301" title="Download PDF" id="pdf-2502.12301" aria-labelledby="pdf-2502.12301">pdf</a>, <a href="https://arxiv.org/html/2502.12301v1" title="View HTML" id="html-2502.12301" aria-labelledby="html-2502.12301" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12301" title="Other formats" id="oth-2502.12301" aria-labelledby="oth-2502.12301">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SMOL: Professionally translated parallel data for 115 under-represented languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Caswell,+I">Isaac Caswell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nielsen,+E">Elizabeth Nielsen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiaming Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cherry,+C">Colin Cherry</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kovacs,+G">Geza Kovacs</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shemtov,+H">Hadar Shemtov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Talukdar,+P">Partha Talukdar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tewari,+D">Dinesh Tewari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diane,+B+M">Baba Mamadi Diane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Doumbouya,+K+M">Koulako Moussa Doumbouya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diane,+D">Djibrila Diane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ciss%C3%A9,+S+F">Solo Farabado Cissé</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> ~10 pages with appendices </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We open-source SMOL (Set of Maximal Overall Leverage), a suite of training data to unlock translation for low-resource languages (LRLs). SMOL has been translated into 115 under-resourced languages, including many for which there exist no previous public resources, for a total of 6.1M translated tokens. SMOL comprises two sub-datasets, each carefully chosen for maximum impact given its size: SMOL-Sent, a set of sentences chosen for broad unique token coverage, and SMOL-Doc, a document-level source focusing on a broad topic coverage. They join the already released GATITOS for a trifecta of paragraph, sentence, and token-level content. We demonstrate that using SMOL to prompt or fine-tune Large Language Models yields robust ChrF improvements. In addition to translation, we provide factuality ratings and rationales for all documents in SMOL-Doc, yielding the first factuality datasets for most of these languages. 
</p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2502.12304" title="Abstract" id="2502.12304"> arXiv:2502.12304 </a> [<a href="/pdf/2502.12304" title="Download PDF" id="pdf-2502.12304" aria-labelledby="pdf-2502.12304">pdf</a>, <a href="https://arxiv.org/html/2502.12304v1" title="View HTML" id="html-2502.12304" aria-labelledby="html-2502.12304" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12304" title="Other formats" id="oth-2502.12304" aria-labelledby="oth-2502.12304">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Warmup Generations: A Task-Agnostic Approach for Guiding Sequence-to-Sequence Learning with Unsupervised Initial State Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Senyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zipeng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiayi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xue Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stenetorp,+P">Pontus Stenetorp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Reddy,+S">Siva Reddy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adelani,+D+I">David Ifeoluwa Adelani</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Traditional supervised fine-tuning (SFT) strategies for sequence-to-sequence tasks often train models to directly generate the target output. Recent work has shown that guiding models with intermediate steps, such as keywords, outlines, or reasoning chains, can significantly improve performance, coherence, and interpretability. 
However, these methods often depend on predefined intermediate formats and annotated data, limiting their scalability and generalizability. In this work, we introduce a task-agnostic framework that enables models to generate intermediate "warmup" sequences. These warmup sequences, serving as an initial state for subsequent generation, are optimized to enhance the probability of generating the target sequence without relying on external supervision or human-designed structures. Drawing inspiration from reinforcement learning principles, our method iteratively refines these intermediate steps to maximize their contribution to the final output, similar to reward-driven optimization in reinforcement learning with human feedback. Experimental results across tasks such as translation, summarization, and multi-choice question answering for logical reasoning show that our approach outperforms traditional SFT methods, and offers a scalable and flexible solution for sequence-to-sequence tasks. </p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2502.12317" title="Abstract" id="2502.12317"> arXiv:2502.12317 </a> [<a href="/pdf/2502.12317" title="Download PDF" id="pdf-2502.12317" aria-labelledby="pdf-2502.12317">pdf</a>, <a href="/format/2502.12317" title="Other formats" id="oth-2502.12317" aria-labelledby="oth-2502.12317">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Language Models Learn Typologically Implausible Languages? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+T">Tianyang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseki,+Y">Yohei Oseki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cotterell,+R">Ryan Cotterell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Warstadt,+A">Alex Warstadt</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Grammatical features across human languages show intriguing correlations often attributed to learning biases in humans. However, empirical evidence has been limited to experiments with highly simplified artificial languages, and whether these correlations arise from domain-general or language-specific biases remains a matter of debate. Language models (LMs) provide an opportunity to study artificial language learning at a large scale and with a high degree of naturalism. In this paper, we begin with an in-depth discussion of how LMs allow us to better determine the role of domain-general learning biases in language universals. We then assess learnability differences for LMs resulting from typologically plausible and implausible languages closely following the word-order universals identified by linguistic typologists. We conduct a symmetrical cross-lingual study training and testing LMs on an array of highly naturalistic but counterfactual versions of the English (head-initial) and Japanese (head-final) languages. Compared to similar work, our datasets are more naturalistic and fall closer to the boundary of plausibility. 
Our experiments show that these LMs are often slower to learn these subtly implausible languages, while ultimately achieving similar performance on some metrics regardless of typological plausibility. These findings lend credence to the conclusion that LMs do show some typologically-aligned learning preferences, and that the typological patterns may result from, at least to some degree, domain-general learning biases. </p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2502.12325" title="Abstract" id="2502.12325"> arXiv:2502.12325 </a> [<a href="/pdf/2502.12325" title="Download PDF" id="pdf-2502.12325" aria-labelledby="pdf-2502.12325">pdf</a>, <a href="https://arxiv.org/html/2502.12325v1" title="View HTML" id="html-2502.12325" aria-labelledby="html-2502.12325" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12325" title="Other formats" id="oth-2502.12325" aria-labelledby="oth-2502.12325">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Dense to Dynamic: Token-Difficulty Driven MoEfication of Pre-Trained LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nishu,+K">Kumari Nishu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehta,+S">Sachin Mehta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abnar,+S">Samira Abnar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Farajtabar,+M">Mehrdad Farajtabar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Horton,+M">Maxwell Horton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Najibi,+M">Mahyar Najibi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nabi,+M">Moin Nabi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+M">Minsik Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naik,+D">Devang Naik</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Training large language models (LLMs) for different inference constraints is computationally expensive, limiting control over efficiency-accuracy trade-offs. Moreover, once trained, these models typically process tokens uniformly, regardless of their complexity, leading to static and inflexible behavior. In this paper, we introduce a post-training optimization framework, DynaMoE, that adapts a pre-trained dense LLM to a token-difficulty-driven Mixture-of-Experts model with minimal fine-tuning cost. This adaptation makes the model dynamic, with sensitivity control to customize the balance between efficiency and accuracy. DynaMoE features a token-difficulty-aware router that predicts the difficulty of tokens and directs them to the appropriate sub-networks or experts, enabling larger experts to handle more complex tokens and smaller experts to process simpler ones. Our experiments demonstrate that DynaMoE can generate a range of adaptive model variants of the existing trained LLM with a single fine-tuning step, utilizing only $10B$ tokens, a minimal cost compared to the base model's training. Each variant offers distinct trade-offs between accuracy and performance. Compared to the baseline post-training optimization framework, Flextron, our method achieves similar aggregated accuracy across downstream tasks, despite using only $\frac{1}{9}\text{th}$ of their fine-tuning cost. 
</p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2502.12328" title="Abstract" id="2502.12328"> arXiv:2502.12328 </a> [<a href="/pdf/2502.12328" title="Download PDF" id="pdf-2502.12328" aria-labelledby="pdf-2502.12328">pdf</a>, <a href="https://arxiv.org/html/2502.12328v1" title="View HTML" id="html-2502.12328" aria-labelledby="html-2502.12328" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12328" title="Other formats" id="oth-2502.12328" aria-labelledby="oth-2502.12328">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LM Agents for Coordinating Multi-User Information Gathering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jhamtani,+H">Harsh Jhamtani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Andreas,+J">Jacob Andreas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Van+Durme,+B">Benjamin Van Durme</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper introduces PeopleJoin, a benchmark for evaluating LM-mediated collaborative problem solving. Given a user request, PeopleJoin agents must identify teammates who might be able to assist, converse with these teammates to gather information, and finally compile a useful answer or summary for the original user. PeopleJoin comprises two evaluation domains: PeopleJoin-QA, focused on questions about tabular data, and PeopleJoin-DocCreation, focused on document creation tasks. 
The two domains are adapted from existing NLP benchmarks for database question answering and multi-document summarization; here, however, the information needed to complete these tasks is distributed across synthetic "organizations" of 2–20 users, simulating natural multi-user collaboration scenarios. We implemented several popular LM agent architectures, evaluating their accuracy and efficiency at completing tasks, and highlight new research questions that can be studied using PeopleJoin. </p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2502.12361" title="Abstract" id="2502.12361"> arXiv:2502.12361 </a> [<a href="/pdf/2502.12361" title="Download PDF" id="pdf-2502.12361" aria-labelledby="pdf-2502.12361">pdf</a>, <a href="https://arxiv.org/html/2502.12361v1" title="View HTML" id="html-2502.12361" aria-labelledby="html-2502.12361" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12361" title="Other formats" id="oth-2502.12361" aria-labelledby="oth-2502.12361">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ConFit v2: Improving Resume-Job Matching using Hypothetical Resume Embedding and Runner-Up Hard-Negative Mining </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+X">Xiao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Ruize Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+C">Chengyuan Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jinzhong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhou Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2401.16349" data-arxiv-id="2401.16349" class="link-https">arXiv:2401.16349</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> 
<span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A reliable resume-job matching system helps a company recommend suitable candidates from a pool of resumes and helps a job seeker find relevant jobs from a list of job posts. However, since job seekers apply only to a few jobs, interaction labels in resume-job datasets are sparse. We introduce ConFit v2, an improvement over ConFit to tackle this sparsity problem. We propose two techniques to enhance the encoder's contrastive training process: augmenting job data with hypothetical reference resume generated by a large language model; and creating high-quality hard negatives from unlabeled resume/job pairs using a novel hard-negative mining strategy. We evaluate ConFit v2 on two real-world datasets and demonstrate that it outperforms ConFit and prior methods (including BM25 and OpenAI text-embedding-003), achieving an average absolute improvement of 13.8% in recall and 17.5% in nDCG across job-ranking and resume-ranking tasks. 
</p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2502.12362" title="Abstract" id="2502.12362"> arXiv:2502.12362 </a> [<a href="/pdf/2502.12362" title="Download PDF" id="pdf-2502.12362" aria-labelledby="pdf-2502.12362">pdf</a>, <a href="/format/2502.12362" title="Other formats" id="oth-2502.12362" aria-labelledby="oth-2502.12362">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Classifiers of Data Sharing Statements in Clinical Trial Records </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mamaghani,+S+J">Saber Jelodari Mamaghani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Strantz,+C">Cosima Strantz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toddenroth,+D">Dennis Toddenroth</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published in Proceedings of MIE 2024, IOS Press eBooks. Studies in Health Technology and Informatics, Vol. 316, pp. 834-838. Conference held in Athens, Greece </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Studies in Health Technology and Informatics, Vol. 316, pp. 834-838, IOS Press, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Digital individual participant data (IPD) from clinical trials are increasingly distributed for potential scientific reuse. The identification of available IPD, however, requires interpretations of textual data-sharing statements (DSS) in large databases. Recent advancements in computational linguistics include pre-trained language models that promise to simplify the implementation of effective classifiers based on textual inputs. 
In a subset of 5,000 textual DSS from <a href="http://ClinicalTrials.gov" rel="external noopener nofollow" class="link-external link-http">this http URL</a>, we evaluate how well classifiers based on domain-specific pre-trained language models reproduce original availability categories as well as manually annotated labels. Typical metrics indicate that classifiers that predicted manual annotations outperformed those that learned to output the original availability categories. This suggests that the textual DSS descriptions contain applicable information that the availability categories do not, and that such classifiers could thus aid the automatic identification of available IPD in large trial databases. </p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2502.12372" title="Abstract" id="2502.12372"> arXiv:2502.12372 </a> [<a href="/pdf/2502.12372" title="Download PDF" id="pdf-2502.12372" aria-labelledby="pdf-2502.12372">pdf</a>, <a href="https://arxiv.org/html/2502.12372v1" title="View HTML" id="html-2502.12372" aria-labelledby="html-2502.12372" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12372" title="Other formats" id="oth-2502.12372" aria-labelledby="oth-2502.12372">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Factual Inconsistency in Data-to-Text Generation Scales Exponentially with LLM Size: A Statistical Validation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mahapatra,+J">Joy Mahapatra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Roy,+S">Soumyajit Roy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garain,+U">Utpal Garain</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; 
Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Monitoring factual inconsistency is essential for ensuring trustworthiness in data-to-text generation (D2T). While large language models (LLMs) have demonstrated exceptional performance across various D2T tasks, previous studies on scaling laws have primarily focused on generalization error through power law scaling to LLM size (i.e., the number of model parameters). However, no research has examined the impact of LLM size on factual inconsistency in D2T. In this paper, we investigate how factual inconsistency in D2T scales with LLM size by exploring two scaling laws: power law and exponential scaling. To rigorously evaluate and compare these scaling laws, we employ a statistical validation framework consisting of three key stages: predictive performance estimation, goodness-of-fit assessment, and comparative analysis. For a comprehensive empirical study, we analyze three popular LLM families across five D2T datasets, measuring factual inconsistency inversely using four state-of-the-art consistency metrics. Our findings, based on exhaustive empirical results and validated through our framework, reveal that, contrary to the widely assumed power law scaling, factual inconsistency in D2T follows an exponential scaling with LLM size. 
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2502.12375" title="Abstract" id="2502.12375"> arXiv:2502.12375 </a> [<a href="/pdf/2502.12375" title="Download PDF" id="pdf-2502.12375" aria-labelledby="pdf-2502.12375">pdf</a>, <a href="https://arxiv.org/html/2502.12375v1" title="View HTML" id="html-2502.12375" aria-labelledby="html-2502.12375" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12375" title="Other formats" id="oth-2502.12375" aria-labelledby="oth-2502.12375">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UltraGen: Extremely Fine-grained Controllable Generation via Attribute Reconstruction and Global Preference Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yun,+L">Longfei Yun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+L">Letian Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+J">Jingbo Shang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Fine granularity is an essential requirement for controllable text generation, which has seen rapid growth with the ability of LLMs. However, existing methods focus mainly on a small set of attributes like 3 to 5, and their performance degrades significantly when the number of attributes increases to the next order of magnitude. To address this challenge, we propose a novel zero-shot approach for extremely fine-grained controllable generation (EFCG), proposing auto-reconstruction (AR) and global preference optimization (GPO). 
In the AR phase, we leverage LLMs to extract soft attributes (e.g., Emphasis on simplicity and minimalism in design) from raw texts, and combine them with programmatically derived hard attributes (e.g., The text should be between 300 and 400 words) to construct massive (around 45) multi-attribute requirements, which guide the fine-grained text reconstruction process under weak supervision. In the GPO phase, we apply direct preference optimization (DPO) to refine text generation under diverse attribute combinations, enabling efficient exploration of the global combination space. Additionally, we introduce an efficient attribute sampling strategy to identify and correct potentially erroneous attributes, further improving global optimization. Our framework significantly improves the constraint satisfaction rate (CSR) and text quality for EFCG by mitigating position bias and alleviating attention dilution. </p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2502.12378" title="Abstract" id="2502.12378"> arXiv:2502.12378 </a> [<a href="/pdf/2502.12378" title="Download PDF" id="pdf-2502.12378" aria-labelledby="pdf-2502.12378">pdf</a>, <a href="https://arxiv.org/html/2502.12378v1" title="View HTML" id="html-2502.12378" aria-labelledby="html-2502.12378" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12378" title="Other formats" id="oth-2502.12378" aria-labelledby="oth-2502.12378">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pragmatics in the Era of Large Language Models: A Survey on Datasets, Evaluation, Opportunities and Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Bolei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuting Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Wei Zhou</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Ziwei Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y+J">Yang Janet Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jasinskaja,+K">Katja Jasinskaja</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Friedrich,+A">Annemarie Friedrich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hirschberg,+J">Julia Hirschberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kreuter,+F">Frauke Kreuter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Understanding pragmatics-the use of language in context-is crucial for developing NLP systems capable of interpreting nuanced language use. Despite recent advances in language technologies, including large language models, evaluating their ability to handle pragmatic phenomena such as implicatures and references remains challenging. To advance pragmatic abilities in models, it is essential to understand current evaluation trends and identify existing limitations. In this survey, we provide a comprehensive review of resources designed for evaluating pragmatic capabilities in NLP, categorizing datasets by the pragmatics phenomena they address. We analyze task designs, data collection methods, evaluation approaches, and their relevance to real-world applications. By examining these resources in the context of modern language models, we highlight emerging trends, challenges, and gaps in existing benchmarks. Our survey aims to clarify the landscape of pragmatic evaluation and guide the development of more comprehensive and targeted benchmarks, ultimately contributing to more nuanced and context-aware NLP models. 
</p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2502.12404" title="Abstract" id="2502.12404"> arXiv:2502.12404 </a> [<a href="/pdf/2502.12404" title="Download PDF" id="pdf-2502.12404" aria-labelledby="pdf-2502.12404">pdf</a>, <a href="https://arxiv.org/html/2502.12404v1" title="View HTML" id="html-2502.12404" aria-labelledby="html-2502.12404" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12404" title="Other formats" id="oth-2502.12404" aria-labelledby="oth-2502.12404">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WMT24++: Expanding the Language Coverage of WMT24 to 55 Languages & Dialects </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Deutsch,+D">Daniel Deutsch</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Briakou,+E">Eleftheria Briakou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Caswell,+I">Isaac Caswell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Finkelstein,+M">Mara Finkelstein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galor,+R">Rebecca Galor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Juraska,+J">Juraj Juraska</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kovacs,+G">Geza Kovacs</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lui,+A">Alison Lui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rei,+R">Ricardo Rei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Riesa,+J">Jason Riesa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rijhwani,+S">Shruti Rijhwani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Riley,+P">Parker Riley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Salesky,+E">Elizabeth Salesky</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Trabelsi,+F">Firas Trabelsi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Winkler,+S">Stephanie Winkler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Biao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitag,+M">Markus Freitag</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As large language models (LLM) become more and more capable in languages other than English, it is important to collect benchmark datasets in order to evaluate their multilingual performance, including on tasks like machine translation (MT). In this work, we extend the WMT24 dataset to cover 55 languages by collecting new human-written references and post-edits for 46 new languages and dialects in addition to post-edits of the references in 8 out of 9 languages in the original WMT24 dataset. The dataset covers four domains: literary, news, social, and speech. We benchmark a variety of MT providers and LLMs on the collected dataset using automatic metrics and find that LLMs are the best-performing MT systems in all 55 languages. These results should be confirmed using a human-based evaluation, which we leave for future work. 
</p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2502.12408" title="Abstract" id="2502.12408"> arXiv:2502.12408 </a> [<a href="/pdf/2502.12408" title="Download PDF" id="pdf-2502.12408" aria-labelledby="pdf-2502.12408">pdf</a>, <a href="https://arxiv.org/html/2502.12408v1" title="View HTML" id="html-2502.12408" aria-labelledby="html-2502.12408" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12408" title="Other formats" id="oth-2502.12408" aria-labelledby="oth-2502.12408">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Robust Approximation of ASR Metrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Waheed,+A">Abdul Waheed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Atwany,+H">Hanin Atwany</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+R">Rita Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raj,+B">Bhiksha Raj</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 25 Pages. Work in Progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in speech foundation models are largely driven by scaling both model size and data, enabling them to perform a wide range of tasks, including speech recognition. Traditionally, ASR models are evaluated using metrics like Word Error Rate (WER) and Character Error Rate (CER), which depend on ground truth labels. As a result of limited labeled data from diverse domains and testing conditions, the true generalization capabilities of these models beyond standard benchmarks remain unclear. Moreover, labeling data is both costly and time-consuming. 
To address this, we propose a novel label-free approach for approximating ASR performance metrics, eliminating the need for ground truth labels. Our method utilizes multimodal embeddings in a unified space for speech and transcription representations, combined with a high-quality proxy model to compute proxy metrics. These features are used to train a regression model to predict key ASR metrics like Word Error Rate (WER) and Character Error Rate (CER). We experiment with over 40 models across 14 datasets representing both standard and in-the-wild testing conditions. Our results show that we approximate the metrics within a single-digit absolute difference across all experimental configurations, outperforming the most recent baseline by more than 50%. </p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2502.12411" title="Abstract" id="2502.12411"> arXiv:2502.12411 </a> [<a href="/pdf/2502.12411" title="Download PDF" id="pdf-2502.12411" aria-labelledby="pdf-2502.12411">pdf</a>, <a href="https://arxiv.org/html/2502.12411v1" title="View HTML" id="html-2502.12411" aria-labelledby="html-2502.12411" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12411" title="Other formats" id="oth-2502.12411" aria-labelledby="oth-2502.12411">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Gradient Co-occurrence Analysis for Detecting Unsafe Prompts in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jingyuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+B">Bowen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Rongjun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Ziyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xin Chen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+Z">Zhiyong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+W">Wei Peng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Unsafe prompts pose significant safety risks to large language models (LLMs). Existing methods for detecting unsafe prompts rely on data-driven fine-tuning to train guardrail models, necessitating significant data and computational resources. In contrast, recent few-shot gradient-based methods emerge, requiring only few safe and unsafe reference prompts. A gradient-based approach identifies unsafe prompts by analyzing consistent patterns of the gradients of safety-critical parameters in LLMs. Although effective, its restriction to directional similarity (cosine similarity) introduces “directional bias”, limiting its capability to identify unsafe prompts. To overcome this limitation, we introduce GradCoo, a novel gradient co-occurrence analysis method that expands the scope of safety-critical parameter identification to include unsigned gradient similarity, thereby reducing the impact of “directional bias” and enhancing the accuracy of unsafe prompt detection. Comprehensive experiments on the widely-used benchmark datasets ToxicChat and XStest demonstrate that our proposed method can achieve state-of-the-art (SOTA) performance compared to existing methods. Moreover, we confirm the generalizability of GradCoo in detecting unsafe prompts across a range of LLM base models with various sizes and origins. 
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2502.12414" title="Abstract" id="2502.12414"> arXiv:2502.12414 </a> [<a href="/pdf/2502.12414" title="Download PDF" id="pdf-2502.12414" aria-labelledby="pdf-2502.12414">pdf</a>, <a href="https://arxiv.org/html/2502.12414v1" title="View HTML" id="html-2502.12414" aria-labelledby="html-2502.12414" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12414" title="Other formats" id="oth-2502.12414" aria-labelledby="oth-2502.12414">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Lost in Transcription, Found in Distribution Shift: Demystifying Hallucination in Speech Foundation Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Atwany,+H">Hanin Atwany</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Waheed,+A">Abdul Waheed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+R">Rita Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choudhury,+M">Monojit Choudhury</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raj,+B">Bhiksha Raj</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally as co-first authors. The manuscript is 21 pages long and is a work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Speech foundation models trained at a massive scale, both in terms of model and data size, result in robust systems capable of performing multiple speech tasks, including automatic speech recognition (ASR). These models transcend language and domain barriers, yet effectively measuring their performance remains a challenge. 
Traditional metrics like word error rate (WER) and character error rate (CER) are commonly used to evaluate ASR performance but often fail to reflect transcription quality in critical contexts, particularly when detecting fabricated outputs. This phenomenon, known as hallucination, is especially concerning in high-stakes domains such as healthcare, legal, and aviation, where errors can have severe consequences. In our work, we address this gap by investigating hallucination in ASR models. We examine how factors such as distribution shifts, model size, and model architecture influence the hallucination error rate (HER), a metric we introduce to quantify hallucinations. Our analysis of 20 ASR models reveals three key insights: (1) High WERs can mask low hallucination rates, while low WERs may conceal dangerous hallucinations. (2) Synthetic noise, both adversarial and common perturbations like white noise, pitch shift, and time stretching, increase HER. (3) Distribution shift correlates strongly with HER ($\alpha = 0.91$). Our findings highlight the importance of incorporating HER alongside traditional metrics like WER to better assess ASR model performance, particularly in high-stakes domains. 
</p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2502.12420" title="Abstract" id="2502.12420"> arXiv:2502.12420 </a> [<a href="/pdf/2502.12420" title="Download PDF" id="pdf-2502.12420" aria-labelledby="pdf-2502.12420">pdf</a>, <a href="https://arxiv.org/html/2502.12420v1" title="View HTML" id="html-2502.12420" aria-labelledby="html-2502.12420" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12420" title="Other formats" id="oth-2502.12420" aria-labelledby="oth-2502.12420">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sens-Merging: Sensitivity-Guided Parameter Balancing for Merging Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiongwei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linqin Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advances in large language models have led to numerous task-specialized fine-tuned variants, creating a need for efficient model merging techniques that preserve specialized capabilities while avoiding costly retraining. While existing task vector-based merging methods show promise, they typically apply uniform coefficients across all parameters, overlooking varying parameter importance both within and across tasks. 
We present Sens-Merging, a sensitivity-guided coefficient adjustment method that enhances existing model merging techniques by operating at both task-specific and cross-task levels. Our method analyzes parameter sensitivity within individual tasks and evaluates cross-task transferability to determine optimal merging coefficients. Extensive experiments on Mistral 7B and LLaMA2-7B/13B models demonstrate that Sens-Merging significantly improves performance across general knowledge, mathematical reasoning, and code generation tasks. Notably, when combined with existing merging techniques, our method enables merged models to outperform specialized fine-tuned models, particularly in code generation tasks. Our findings reveal important trade-offs between task-specific and cross-task scalings, providing insights for future model merging strategies. </p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2502.12421" title="Abstract" id="2502.12421"> arXiv:2502.12421 </a> [<a href="/pdf/2502.12421" title="Download PDF" id="pdf-2502.12421" aria-labelledby="pdf-2502.12421">pdf</a>, <a href="https://arxiv.org/html/2502.12421v1" title="View HTML" id="html-2502.12421" aria-labelledby="html-2502.12421" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12421" title="Other formats" id="oth-2502.12421" aria-labelledby="oth-2502.12421">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wi-Chat: Large Language Model Powered Wi-Fi Sensing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haopeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Y">Yili Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+H">Haohan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingzhe Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yitong 
Shen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have demonstrated remarkable capabilities across diverse tasks. However, their potential to integrate physical model knowledge for real-world signal interpretation remains largely unexplored. In this work, we introduce Wi-Chat, the first LLM-powered Wi-Fi-based human activity recognition system. We demonstrate that LLMs can process raw Wi-Fi signals and infer human activities by incorporating Wi-Fi sensing principles into prompts. Our approach leverages physical model insights to guide LLMs in interpreting Channel State Information (CSI) data without traditional signal processing techniques. Through experiments on real-world Wi-Fi datasets, we show that LLMs exhibit strong reasoning capabilities, achieving zero-shot activity recognition. These findings highlight a new paradigm for Wi-Fi sensing, expanding LLM applications beyond conventional language tasks and enhancing the accessibility of wireless sensing for real-world deployments. </p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2502.12436" title="Abstract" id="2502.12436"> arXiv:2502.12436 </a> [<a href="/pdf/2502.12436" title="Download PDF" id="pdf-2502.12436" aria-labelledby="pdf-2502.12436">pdf</a>, <a href="https://arxiv.org/html/2502.12436v1" title="View HTML" id="html-2502.12436" aria-labelledby="html-2502.12436" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12436" title="Other formats" id="oth-2502.12436" aria-labelledby="oth-2502.12436">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Should I Trust You? 
Detecting Deception in Negotiations using Counterfactual RL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wongkamjan,+W">Wichayaporn Wongkamjan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanze Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+F">Feng Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peskoff,+D">Denis Peskoff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kummerfeld,+J+K">Jonathan K. Kummerfeld</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=May,+J">Jonathan May</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boyd-Graber,+J+L">Jordan Lee Boyd-Graber</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> An increasingly prevalent socio-technical problem is people being taken in by offers that sound “too good to be true”, where persuasion and trust shape decision-making. This paper investigates how AI can help detect these deceptive scenarios. We analyze how humans strategically deceive each other in <em>Diplomacy</em>, a board game that requires both natural language communication and strategic reasoning. This requires extracting logical forms of proposed agreements in player communications and computing the relative rewards of the proposal using agents' value functions. Combined with text-based features, this can improve our deception detection. Our method detects human deception with a high precision when compared to a Large Language Model approach that flags many true messages as deceptive. Future human-AI interaction tools can build on our methods for deception detection by triggering <em>friction</em> to give users a chance of interrogating suspicious proposals. 
</p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2502.12446" title="Abstract" id="2502.12446"> arXiv:2502.12446 </a> [<a href="/pdf/2502.12446" title="Download PDF" id="pdf-2502.12446" aria-labelledby="pdf-2502.12446">pdf</a>, <a href="https://arxiv.org/html/2502.12446v1" title="View HTML" id="html-2502.12446" aria-labelledby="html-2502.12446" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12446" title="Other formats" id="oth-2502.12446" aria-labelledby="oth-2502.12446">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Attribute Steering of Language Models via Targeted Intervention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+D">Duy Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prasad,+A">Archiki Prasad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stengel-Eskin,+E">Elias Stengel-Eskin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bansal,+M">Mohit Bansal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, code link: <a href="https://github.com/duykhuongnguyen/MAT-Steer" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Inference-time intervention (ITI) has emerged as a promising method for steering large language model (LLM) behavior in a particular direction (e.g., improving helpfulness) by intervening on token representations without costly updates to the LLM's parameters. However, existing ITI approaches fail to scale to multi-attribute settings with conflicts, such as enhancing helpfulness while also reducing toxicity. 
To address this, we introduce Multi-Attribute Targeted Steering (MAT-Steer), a novel steering framework designed for selective token-level intervention across multiple attributes. MAT-Steer learns steering vectors using an alignment objective that shifts the model's internal representations of undesirable outputs closer to those of desirable ones while enforcing sparsity and orthogonality among vectors for different attributes, thereby reducing inter-attribute conflicts. We evaluate MAT-Steer in two distinct settings: (i) on question answering (QA) tasks where we balance attributes like truthfulness, bias, and toxicity; (ii) on generative tasks where we simultaneously improve attributes like helpfulness, correctness, and coherence. MAT-Steer outperforms existing ITI and parameter-efficient finetuning approaches across both task types (e.g., 3% average accuracy gain across QA tasks and 55.82% win rate against the best ITI baseline). </p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2502.12455" title="Abstract" id="2502.12455"> arXiv:2502.12455 </a> [<a href="/pdf/2502.12455" title="Download PDF" id="pdf-2502.12455" aria-labelledby="pdf-2502.12455">pdf</a>, <a href="https://arxiv.org/html/2502.12455v1" title="View HTML" id="html-2502.12455" aria-labelledby="html-2502.12455" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12455" title="Other formats" id="oth-2502.12455" aria-labelledby="oth-2502.12455">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DSMoE: Matrix-Partitioned Experts with Dynamic Routing for Computation-Efficient Dense LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+M">Minxuan Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Z">Zhenpeng Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Leiyu Pan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+Y">Yizhe Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zijia Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Wei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jungong Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+G">Guiguang Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+C">Cheng Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Di Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gai,+K">Kun Gai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Songlin Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As large language models continue to scale, computational costs and resource consumption have emerged as significant challenges. While existing sparsification methods like pruning reduce computational overhead, they risk losing model knowledge through parameter removal. This paper proposes DSMoE (Dynamic Sparse Mixture-of-Experts), a novel approach that achieves sparsification by partitioning pre-trained FFN layers into computational blocks. We implement adaptive expert routing using sigmoid activation and straight-through estimators, enabling tokens to flexibly access different aspects of model knowledge based on input complexity. Additionally, we introduce a sparsity loss term to balance performance and computational efficiency. 
Extensive experiments on LLaMA models demonstrate that under equivalent computational constraints, DSMoE achieves superior performance compared to existing pruning and MoE approaches across language modeling and downstream tasks, particularly excelling in generation tasks. Analysis reveals that DSMoE learns distinctive layerwise activation patterns, providing new insights for future MoE architecture design. </p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2502.12458" title="Abstract" id="2502.12458"> arXiv:2502.12458 </a> [<a href="/pdf/2502.12458" title="Download PDF" id="pdf-2502.12458" aria-labelledby="pdf-2502.12458">pdf</a>, <a href="https://arxiv.org/html/2502.12458v1" title="View HTML" id="html-2502.12458" aria-labelledby="html-2502.12458" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12458" title="Other formats" id="oth-2502.12458" aria-labelledby="oth-2502.12458">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Empirical Evaluation of Encoder Architectures for Fast Real-Time Long Conversational Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Senthilnathan,+A">Annamalai Senthilnathan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arumae,+K">Kristjan Arumae</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khalilia,+M">Mohammed Khalilia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+Z">Zhengzheng Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Colak,+A+R">Aaron R. Colak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Analyzing long text data such as customer call transcripts is a cost-intensive and tedious task. 
Machine learning methods, namely Transformers, are leveraged to model agent-customer interactions. Unfortunately, Transformers adhere to fixed-length architectures and their self-attention mechanism scales quadratically with input length. Such limitations make it challenging to leverage traditional Transformers for long sequence tasks, such as conversational understanding, especially in real-time use cases. In this paper we explore and evaluate recently proposed efficient Transformer variants (e.g. Performer, Reformer) and a CNN-based architecture for real-time and near real-time long conversational understanding tasks. We show that CNN-based models are dynamic, ~2.6x faster to train, ~80% faster inference and ~72% more memory efficient compared to Transformers on average. Additionally, we evaluate the CNN model using the Long Range Arena benchmark to demonstrate competitiveness in general long document analysis. </p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2502.12459" title="Abstract" id="2502.12459"> arXiv:2502.12459 </a> [<a href="/pdf/2502.12459" title="Download PDF" id="pdf-2502.12459" aria-labelledby="pdf-2502.12459">pdf</a>, <a href="https://arxiv.org/html/2502.12459v1" title="View HTML" id="html-2502.12459" aria-labelledby="html-2502.12459" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12459" title="Other formats" id="oth-2502.12459" aria-labelledby="oth-2502.12459">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stress Testing Generalization: How Minor Modifications Undermine Large Language Model Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+G">Guangxiang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Saier Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jian,+X">Xiaoqi Jian</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jinzhu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+C">Change Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Lin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangzheng Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ACL 2025 theme track on the Generalization of NLP models </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> This paper investigates the fragility of Large Language Models (LLMs) in generalizing to novel inputs, specifically focusing on minor perturbations in well-established benchmarks (e.g., slight changes in question format or distractor length). Despite high benchmark scores, LLMs exhibit significant accuracy drops and unexpected biases (e.g., preference for longer distractors) when faced with these minor but content-preserving modifications. For example, Qwen 2.5 1.5B's MMLU score rises from 60 to 89 and drops from 89 to 36 when option lengths are changed without altering the question. Even GPT-4 experiences a 25-point accuracy loss when question types are changed, with a 6-point drop across all three modification categories. These analyses suggest that LLMs rely heavily on superficial cues rather than forming robust, abstract representations that generalize across formats, lexical variations, and irrelevant content shifts. This work aligns with the ACL 2025 theme track on the Generalization of NLP models, proposing a "Generalization Stress Test" to assess performance shifts under controlled perturbations. 
The study calls for reevaluating benchmarks and developing more reliable evaluation methodologies to capture LLM generalization abilities better. </p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2502.12462" title="Abstract" id="2502.12462"> arXiv:2502.12462 </a> [<a href="/pdf/2502.12462" title="Download PDF" id="pdf-2502.12462" aria-labelledby="pdf-2502.12462">pdf</a>, <a href="https://arxiv.org/html/2502.12462v1" title="View HTML" id="html-2502.12462" aria-labelledby="html-2502.12462" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12462" title="Other formats" id="oth-2502.12462" aria-labelledby="oth-2502.12462">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Emulating Retrieval Augmented Generation via Prompt Engineering for Enhanced Long Context Comprehension in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+J">Joon Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Atarashi,+K">Kyohei Atarashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Takeuchi,+K">Koh Takeuchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kashima,+H">Hisashi Kashima</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper addresses the challenge of comprehending very long contexts in Large Language Models (LLMs) by proposing a method that emulates Retrieval Augmented Generation (RAG) through specialized prompt engineering and chain-of-thought (CoT) reasoning. 
While recent LLMs support over 100,000 tokens in a single prompt, simply enlarging context windows has not guaranteed robust multi-hop reasoning when key details are scattered across massive input. Our approach treats the model as both the retriever and the reasoner: it first tags relevant segments within a long passage, then employs a stepwise CoT workflow to integrate these pieces of evidence. This single-pass method thereby reduces reliance on an external retriever, yet maintains focus on crucial segments. We evaluate our approach on selected tasks from BABILong, which interleaves standard bAbI QA problems with large amounts of distractor text. Compared to baseline (no retrieval) and naive RAG pipelines, our approach more accurately handles multi-fact questions such as object location tracking, counting, and indefinite knowledge. Furthermore, we analyze how prompt structure, including the order of question, relevant-text tags, and overall instructions, significantly affects performance. These findings underscore that optimized prompt engineering, combined with guided reasoning, can enhance LLMs' long-context comprehension and serve as a lightweight alternative to traditional retrieval pipelines. 
</p> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2502.12464" title="Abstract" id="2502.12464"> arXiv:2502.12464 </a> [<a href="/pdf/2502.12464" title="Download PDF" id="pdf-2502.12464" aria-labelledby="pdf-2502.12464">pdf</a>, <a href="https://arxiv.org/html/2502.12464v1" title="View HTML" id="html-2502.12464" aria-labelledby="html-2502.12464" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12464" title="Other formats" id="oth-2502.12464" aria-labelledby="oth-2502.12464">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeRoute: Adaptive Model Selection for Efficient and Accurate Safety Guardrails in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seanie Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D+B">Dong Bok Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wagner,+D">Dominik Wagner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+M">Minki Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seong,+H">Haebin Seong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bocklet,+T">Tobias Bocklet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Juho Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S+J">Sung Ju Hwang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Deploying large language models (LLMs) in real-world applications requires robust safety guard models to detect and block harmful user prompts. While large safety guard models achieve strong performance, their computational cost is substantial. 
To mitigate this, smaller distilled models are used, but they often underperform on "hard" examples where the larger model provides accurate predictions. We observe that many inputs can be reliably handled by the smaller model, while only a small fraction require the larger model's capacity. Motivated by this, we propose SafeRoute, a binary router that distinguishes hard examples from easy ones. Our method selectively applies the larger safety guard model to the data that the router considers hard, improving efficiency while maintaining accuracy compared to solely using the larger safety guard model. Experimental results on multiple benchmark datasets demonstrate that our adaptive model selection significantly enhances the trade-off between computational cost and safety performance, outperforming relevant baselines. </p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2502.12470" title="Abstract" id="2502.12470"> arXiv:2502.12470 </a> [<a href="/pdf/2502.12470" title="Download PDF" id="pdf-2502.12470" aria-labelledby="pdf-2502.12470">pdf</a>, <a href="https://arxiv.org/html/2502.12470v1" title="View HTML" id="html-2502.12470" aria-labelledby="html-2502.12470" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12470" title="Other formats" id="oth-2502.12470" aria-labelledby="oth-2502.12470">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning on a Spectrum: Aligning LLMs to System 1 and System 2 Thinking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ziabari,+A+S">Alireza S. 
Ziabari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghazizadeh,+N">Nona Ghazizadeh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sourati,+Z">Zhivar Sourati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karimi-Malekabadi,+F">Farzan Karimi-Malekabadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Piray,+P">Payam Piray</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehghani,+M">Morteza Dehghani</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) exhibit impressive reasoning abilities, yet their reliance on structured step-by-step processing reveals a critical limitation. While human cognition fluidly adapts between intuitive, heuristic (System 1) and analytical, deliberative (System 2) reasoning depending on the context, LLMs lack this dynamic flexibility. This rigidity can lead to brittle and unreliable performance when faced with tasks that deviate from their trained patterns. To address this, we create a dataset of 2,000 samples with valid System 1 and System 2 answers, explicitly align LLMs with these reasoning styles, and evaluate their performance across reasoning benchmarks. Our results reveal an accuracy-efficiency trade-off: System 2-aligned models excel in arithmetic and symbolic reasoning, while System 1-aligned models perform better in commonsense tasks. A mechanistic analysis of model responses shows that System 1 models employ more definitive answers, whereas System 2 models demonstrate greater uncertainty. Interpolating between these extremes produces a monotonic transition in reasoning accuracy, preserving coherence. This work challenges the assumption that step-by-step reasoning is always optimal and highlights the need for adapting reasoning strategies based on task demands. 
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2502.12476" title="Abstract" id="2502.12476"> arXiv:2502.12476 </a> [<a href="/pdf/2502.12476" title="Download PDF" id="pdf-2502.12476" aria-labelledby="pdf-2502.12476">pdf</a>, <a href="https://arxiv.org/html/2502.12476v1" title="View HTML" id="html-2502.12476" aria-labelledby="html-2502.12476" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12476" title="Other formats" id="oth-2502.12476" aria-labelledby="oth-2502.12476">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CoCo-CoLa: Evaluating Language Adherence in Multilingual LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rahmati,+E">Elnaz Rahmati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ziabari,+A+S">Alireza S. Ziabari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehghani,+M">Morteza Dehghani</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual Large Language Models (LLMs) develop cross-lingual abilities despite being trained on limited parallel data. However, they often struggle to generate responses in the intended language, favoring high-resource languages such as English. In this work, we introduce CoCo-CoLa (Correct Concept - Correct Language), a novel metric to evaluate language adherence in multilingual LLMs. Using fine-tuning experiments on a closed-book QA task across seven languages, we analyze how training in one language affects others' performance. Our findings reveal that multilingual models share task knowledge across languages but exhibit biases in the selection of output language. 
We identify language-specific layers, showing that final layers play a crucial role in determining output language. Accordingly, we propose a partial training strategy that selectively fine-tunes key layers, improving language adherence while significantly reducing computational cost. Our method achieves comparable or superior performance to full fine-tuning, particularly for low-resource languages, offering a more efficient multilingual adaptation. </p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2502.12477" title="Abstract" id="2502.12477"> arXiv:2502.12477 </a> [<a href="/pdf/2502.12477" title="Download PDF" id="pdf-2502.12477" aria-labelledby="pdf-2502.12477">pdf</a>, <a href="https://arxiv.org/html/2502.12477v1" title="View HTML" id="html-2502.12477" aria-labelledby="html-2502.12477" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12477" title="Other formats" id="oth-2502.12477" aria-labelledby="oth-2502.12477">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Savaal: Scalable Concept-Driven Question Generation to Enhance Human Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Noorbakhsh,+K">Kimia Noorbakhsh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chandler,+J">Joseph Chandler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karimi,+P">Pantea Karimi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alizadeh,+M">Mohammad Alizadeh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Balakrishnan,+H">Hari Balakrishnan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Kimia Noorbakhsh, Joseph Chandler, and Pantea Karimi contributed equally to the work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p 
class='mathjax'> Assessing and enhancing human learning through question-answering is vital, yet automating this process remains challenging. While large language models (LLMs) excel at summarization and query responses, their ability to generate meaningful questions for learners is underexplored. <br>We propose Savaal, a scalable question-generation system with three objectives: (i) scalability, enabling question generation from hundreds of pages of text, (ii) depth of understanding, producing questions beyond factual recall to test conceptual reasoning, and (iii) domain-independence, automatically generating questions across diverse knowledge areas. Instead of providing an LLM with large documents as context, Savaal improves results with a three-stage processing pipeline. Our evaluation with 76 human experts on 71 papers and PhD dissertations shows that Savaal generates questions that better test depth of understanding by 6.5X for dissertations and 1.5X for papers compared to a direct-prompting LLM baseline. Notably, as document length increases, Savaal's advantages in higher question quality and lower cost become more pronounced. 
</p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2502.12478" title="Abstract" id="2502.12478"> arXiv:2502.12478 </a> [<a href="/pdf/2502.12478" title="Download PDF" id="pdf-2502.12478" aria-labelledby="pdf-2502.12478">pdf</a>, <a href="https://arxiv.org/html/2502.12478v1" title="View HTML" id="html-2502.12478" aria-labelledby="html-2502.12478" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12478" title="Other formats" id="oth-2502.12478" aria-labelledby="oth-2502.12478">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MSE-Adapter: A Lightweight Plugin Endowing LLMs with the Capability to Perform Multimodal Sentiment Analysis and Emotion Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xunde Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiang,+Y">Yupeng Qiang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Current Multimodal Sentiment Analysis (MSA) and Emotion Recognition in Conversations (ERC) methods based on pre-trained language models exhibit two primary limitations: <br>1) Once trained for MSA and ERC tasks, these pre-trained language models lose their original generalized capabilities. 2) They demand considerable computational resources. As the size of pre-trained language models continues to grow, training larger multimodal sentiment analysis models using previous approaches could result in unnecessary computational cost. In response to this challenge, we propose <b>M</b>ultimodal <b>S</b>entiment Analysis and <b>E</b>motion Recognition <b>Adapter</b> (MSE-Adapter), a lightweight and adaptable plugin. 
This plugin enables a large language model (LLM) to carry out MSA or ERC tasks with minimal computational overhead (only introduces approximately 2.6M to 2.8M trainable parameters upon the 6/7B models), while preserving the intrinsic capabilities of the LLM. In the MSE-Adapter, the Text-Guide-Mixer (TGM) module is introduced to establish explicit connections between non-textual and textual modalities through the Hadamard product. This allows non-textual modalities to better align with textual modalities at the feature level, promoting the generation of higher-quality pseudo tokens. Extensive experiments were conducted on four public English and Chinese datasets using consumer-grade GPUs and open-source LLMs (Qwen-1.8B, ChatGLM3-6B-base, and LLaMA2-7B) as the backbone. The results demonstrate the effectiveness of the proposed plugin. The code will be released on GitHub after a blind review. </p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2502.12483" title="Abstract" id="2502.12483"> arXiv:2502.12483 </a> [<a href="/pdf/2502.12483" title="Download PDF" id="pdf-2502.12483" aria-labelledby="pdf-2502.12483">pdf</a>, <a href="https://arxiv.org/html/2502.12483v1" title="View HTML" id="html-2502.12483" aria-labelledby="html-2502.12483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12483" title="Other formats" id="oth-2502.12483" aria-labelledby="oth-2502.12483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Knowledge Microscope: Features as Better Analytical Lenses than Neurons </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+P">Pengfei Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> ARR February Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Previous studies primarily utilize MLP neurons as units of analysis for understanding the mechanisms of factual knowledge in Language Models (LMs); however, neurons suffer from polysemanticity, leading to limited knowledge expression and poor interpretability. In this paper, we first conduct preliminary experiments to validate that Sparse Autoencoders (SAE) can effectively decompose neurons into features, which serve as alternative analytical units. With this established, our core findings reveal three key advantages of features over neurons: (1) Features exhibit stronger influence on knowledge expression and superior interpretability. (2) Features demonstrate enhanced monosemanticity, showing distinct activation patterns between related and unrelated facts. (3) Features achieve better privacy protection than neurons, demonstrated through our proposed FeatureEdit method, which significantly outperforms existing neuron-based approaches in erasing privacy-sensitive information from LMs. Code and dataset will be available. 
</p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2502.12485" title="Abstract" id="2502.12485"> arXiv:2502.12485 </a> [<a href="/pdf/2502.12485" title="Download PDF" id="pdf-2502.12485" aria-labelledby="pdf-2502.12485">pdf</a>, <a href="https://arxiv.org/html/2502.12485v1" title="View HTML" id="html-2502.12485" aria-labelledby="html-2502.12485" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12485" title="Other formats" id="oth-2502.12485" aria-labelledby="oth-2502.12485">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safe at the Margins: A General Approach to Safety Alignment in Low-Resource English Languages -- A Singlish Case Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+I">Isaac Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khoo,+S">Shaun Khoo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chua,+W">Watson Chua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiayi,+G">Goh Jiayi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Foo,+J">Jessica Foo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> To ensure safe usage, Large Language Models (LLMs) typically undergo alignment with human-defined values. However, this alignment often relies on primarily English data and is biased towards Western-centric values, limiting its effectiveness in low-resource language settings. In this paper, we describe our approach for aligning SEA-Lion-v2.1-Instruct (a Llama3-8B variant) to minimize toxicity in Singlish, an English creole specific to Singapore. 
We find that supervised fine-tuning and Kahneman-Tversky Optimization (KTO) on paired and unpaired preferences is more sample efficient and yields significantly better results than Direct Preference Optimization (DPO). Our analysis reveals that DPO implicitly enforces a weaker safety objective than KTO, and that SFT complements KTO by improving training stability. Finally, we introduce a simple but novel modification to KTO, KTO-S, which improves training stability through better gradient exploitation. Overall, we present a general approach for safety alignment conducive to low-resource English languages, successfully reducing toxicity by 99% on our Singlish benchmark, with gains generalizing to the broader TOXIGEN dataset while maintaining strong performance across standard LLM benchmarks. </p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2502.12486" title="Abstract" id="2502.12486"> arXiv:2502.12486 </a> [<a href="/pdf/2502.12486" title="Download PDF" id="pdf-2502.12486" aria-labelledby="pdf-2502.12486">pdf</a>, <a href="https://arxiv.org/html/2502.12486v1" title="View HTML" id="html-2502.12486" aria-labelledby="html-2502.12486" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12486" title="Other formats" id="oth-2502.12486" aria-labelledby="oth-2502.12486">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EPO: Explicit Policy Optimization for Strategic Reasoning in LLMs via Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoqian Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Ke Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yongbin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuchuan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+W">Wentao Ma</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+A">Aobo Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Fei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiao,+J">Jianbin Jiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Junge Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have shown impressive reasoning capabilities in well-defined problems with clear solutions, such as mathematics and coding. However, they still struggle with complex real-world scenarios like business negotiations, which require strategic reasoning-an ability to navigate dynamic environments and align long-term goals amidst uncertainty. Existing methods for strategic reasoning face challenges in adaptability, scalability, and transferring strategies to new contexts. To address these issues, we propose explicit policy optimization (EPO) for strategic reasoning, featuring an LLM that provides strategies in open-ended action space and can be plugged into arbitrary LLM agents to motivate goal-directed behavior. To improve adaptability and policy transferability, we train the strategic reasoning model via multi-turn reinforcement learning (RL) using process rewards and iterative self-play, without supervised fine-tuning (SFT) as a preliminary step. Experiments across social and physical domains demonstrate EPO's ability of long-term goal alignment through enhanced strategic reasoning, achieving state-of-the-art performance on social dialogue and web navigation tasks. 
Our findings reveal various collaborative reasoning mechanisms emergent in EPO and its effectiveness in generating novel strategies, underscoring its potential for strategic reasoning in real-world applications. </p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2502.12490" title="Abstract" id="2502.12490"> arXiv:2502.12490 </a> [<a href="/pdf/2502.12490" title="Download PDF" id="pdf-2502.12490" aria-labelledby="pdf-2502.12490">pdf</a>, <a href="https://arxiv.org/html/2502.12490v1" title="View HTML" id="html-2502.12490" aria-labelledby="html-2502.12490" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12490" title="Other formats" id="oth-2502.12490" aria-labelledby="oth-2502.12490">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniGenCoder: Merging Seq2Seq and Seq2Tree Paradigms for Unified Code Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+L">Liangying Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yanfu Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Poshyvanyk,+D">Denys Poshyvanyk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinsong Su</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICSE2025 NIER track </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Deep learning-based code generation has completely transformed the way developers write programs today. Existing approaches to code generation have focused either on the Sequence-to-Sequence paradigm, which generates target code as a sequence of tokens, or the Sequence-to-Tree paradigm, which outputs code as a sequence of actions. 
While these two paradigms are intuitively complementary, their combination has not been previously explored. By comparing the code generated under these two paradigms, we find that integrating them holds significant potential. In this paper, we propose UniGenCoder for code-related generation tasks, which consists of a shared encoder, a shared decoder with a minimal set of additional parameters to unify two paradigms, and a selector that dynamically chooses optimal paradigm for each instance. Also, during the model training, we first perform the multi-task learning and distillation strategies to facilitate knowledge transfer between two paradigms, and then leverage contrastive learning to train the selector. Experimental results on the text-to-code and code-to-code generation tasks demonstrate the effectiveness of our proposed model. We release our code at <a href="https://github.com/DeepLearnXMU/UniGenCoder" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2502.12501" title="Abstract" id="2502.12501"> arXiv:2502.12501 </a> [<a href="/pdf/2502.12501" title="Download PDF" id="pdf-2502.12501" aria-labelledby="pdf-2502.12501">pdf</a>, <a href="https://arxiv.org/html/2502.12501v1" title="View HTML" id="html-2502.12501" aria-labelledby="html-2502.12501" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12501" title="Other formats" id="oth-2502.12501" aria-labelledby="oth-2502.12501">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Crowd Comparative Reasoning: Unlocking Comprehensive Evaluations for LLM-as-a-Judge </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qiyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yufei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yuxin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liangyou Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+L">Lifeng Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruiming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+F">Fuyuan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chen Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> LLM-as-a-Judge, which generates chain-of-thought (CoT) judgments, has become a widely adopted auto-evaluation method. 
However, its reliability is compromised by the CoT reasoning's inability to capture comprehensive and deeper details, often leading to incomplete outcomes. Existing methods mainly rely on majority voting or criteria expansion, which is insufficient to address the limitation in CoT. We propose Crowd-based Comparative Evaluation, which introduces additional crowd responses to compare with the candidate responses, thereby exposing deeper and more comprehensive details within the candidate responses. This process effectively guides LLM-as-a-Judge to provide a more detailed CoT judgment. Extensive experiments demonstrate that our approach enhances evaluation reliability, achieving an average accuracy gain of 6.7% across five benchmarks. Moreover, our method produces higher-quality CoTs that facilitate judge distillation and exhibit superior performance in rejection sampling for supervised fine-tuning (SFT), referred to as crowd rejection sampling, thereby enabling more efficient SFT. Our analysis confirms that CoTs generated by ours are more comprehensive and of higher quality, and evaluation accuracy improves as inference scales. 
</p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2502.12502" title="Abstract" id="2502.12502"> arXiv:2502.12502 </a> [<a href="/pdf/2502.12502" title="Download PDF" id="pdf-2502.12502" aria-labelledby="pdf-2502.12502">pdf</a>, <a href="https://arxiv.org/html/2502.12502v1" title="View HTML" id="html-2502.12502" aria-labelledby="html-2502.12502" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12502" title="Other formats" id="oth-2502.12502" aria-labelledby="oth-2502.12502">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient OpAmp Adaptation for Zoom Attention to Golden Contexts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Haoyuan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Rui Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Haisheng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zhuolun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Bei Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown significant promise in question-answering (QA) tasks, particularly in retrieval-augmented generation (RAG) scenarios and long-context applications. However, their performance is hindered by noisy reference documents, which often distract from essential information. Despite fine-tuning efforts, Transformer-based architectures struggle to prioritize relevant content. This is evidenced by their tendency to allocate disproportionate attention to irrelevant or later-positioned documents. 
Recent work proposes the differential attention mechanism to address this issue, but this mechanism is limited by an unsuitable common-mode rejection ratio (CMRR) and high computational costs. Inspired by the operational amplifier (OpAmp), we propose the OpAmp adaptation to address these challenges, which is implemented with adapters efficiently. By integrating the adapter into pre-trained Transformer blocks, our approach enhances focus on the golden context without costly training from scratch. Empirical evaluations on noisy-context benchmarks reveal that our Qwen2.5-OpAmp-72B model, trained with our OpAmp adaptation, surpasses the performance of state-of-the-art LLMs, including DeepSeek-V3 and GPT-4o. </p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2502.12509" title="Abstract" id="2502.12509"> arXiv:2502.12509 </a> [<a href="/pdf/2502.12509" title="Download PDF" id="pdf-2502.12509" aria-labelledby="pdf-2502.12509">pdf</a>, <a href="https://arxiv.org/html/2502.12509v1" title="View HTML" id="html-2502.12509" aria-labelledby="html-2502.12509" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12509" title="Other formats" id="oth-2502.12509" aria-labelledby="oth-2502.12509">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LegalCore: A Dataset for Legal Documents Event Coreference Resolution </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+K">Kangda Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+X">Xi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+J">Jonathan Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Reddy,+S+R">Sai Ramana Reddy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Natarajan,+A">Anandhavelu Natarajan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+R">Rajiv Jain</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Garimella,+A">Aparna Garimella</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+R">Ruihong Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recognizing events and their coreferential mentions in a document is essential for understanding semantic meanings of text. The existing research on event coreference resolution is mostly limited to news articles. In this paper, we present the first dataset for the legal domain, LegalCore, which has been annotated with comprehensive event and event coreference information. The legal contract documents we annotated in this dataset are several times longer than news articles, with an average length of around 25k tokens per document. The annotations show that legal documents have dense event mentions and feature both short-distance and super long-distance coreference links between event mentions. We further benchmark mainstream Large Language Models (LLMs) on this dataset for both event detection and event coreference resolution tasks, and find that this dataset poses significant challenges for state-of-the-art open-source and proprietary LLMs, which perform significantly worse than a supervised baseline. We will publish the dataset as well as the code. 
</p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2502.12510" title="Abstract" id="2502.12510"> arXiv:2502.12510 </a> [<a href="/pdf/2502.12510" title="Download PDF" id="pdf-2502.12510" aria-labelledby="pdf-2502.12510">pdf</a>, <a href="https://arxiv.org/html/2502.12510v1" title="View HTML" id="html-2502.12510" aria-labelledby="html-2502.12510" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12510" title="Other formats" id="oth-2502.12510" aria-labelledby="oth-2502.12510">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aspect-Guided Multi-Level Perturbation Analysis of Large Language Models in Automated Peer Review </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiatao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xinyu Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+M">Mingqi Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiaojun Wan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We propose an aspect-guided, multi-level perturbation framework to evaluate the robustness of Large Language Models (LLMs) in automated peer review. Our framework explores perturbations in three key components of the peer review process-papers, reviews, and rebuttals-across several quality aspects, including contribution, soundness, presentation, tone, and completeness. 
By applying targeted perturbations and examining their effects on both LLM-as-Reviewer and LLM-as-Meta-Reviewer, we investigate how aspect-based manipulations, such as omitting methodological details from papers or altering reviewer conclusions, can introduce significant biases in the review process. We identify several potential vulnerabilities: review conclusions that recommend a strong reject may significantly influence meta-reviews, negative or misleading reviews may be wrongly interpreted as thorough, and incomplete or hostile rebuttals can unexpectedly lead to higher acceptance rates. Statistical tests show that these biases persist under various Chain-of-Thought prompting strategies, highlighting the lack of robust critical evaluation in current LLMs. Our framework offers a practical methodology for diagnosing these vulnerabilities, thereby contributing to the development of more reliable and robust automated reviewing systems. </p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2502.12516" title="Abstract" id="2502.12516"> arXiv:2502.12516 </a> [<a href="/pdf/2502.12516" title="Download PDF" id="pdf-2502.12516" aria-labelledby="pdf-2502.12516">pdf</a>, <a href="https://arxiv.org/html/2502.12516v1" title="View HTML" id="html-2502.12516" aria-labelledby="html-2502.12516" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12516" title="Other formats" id="oth-2502.12516" aria-labelledby="oth-2502.12516">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLMs Extract Frame-Semantic Arguments? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Devasier,+J">Jacob Devasier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mediratta,+R">Rishabh Mediratta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chengkai Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Frame-semantic parsing is a critical task in natural language understanding, yet the ability of large language models (LLMs) to extract frame-semantic arguments remains underexplored. This paper presents a comprehensive evaluation of LLMs on frame-semantic argument identification, analyzing the impact of input representation formats, model architectures, and generalization to unseen and out-of-domain samples. Our experiments, spanning models from 0.5B to 78B parameters, reveal that JSON-based representations significantly enhance performance, and while larger models generally perform better, smaller models can achieve competitive results through fine-tuning. We also introduce a novel approach to frame identification leveraging predicted frame elements, achieving state-of-the-art performance on ambiguous targets. Despite strong generalization capabilities, our analysis finds that LLMs still struggle with out-of-domain data. 
</p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2502.12530" title="Abstract" id="2502.12530"> arXiv:2502.12530 </a> [<a href="/pdf/2502.12530" title="Download PDF" id="pdf-2502.12530" aria-labelledby="pdf-2502.12530">pdf</a>, <a href="https://arxiv.org/html/2502.12530v1" title="View HTML" id="html-2502.12530" aria-labelledby="html-2502.12530" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12530" title="Other formats" id="oth-2502.12530" aria-labelledby="oth-2502.12530">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Policy-to-Language: Train LLMs to Explain Decisions with Flow-Matching Generated Rewards </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xinyi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+L">Liang Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+H">Heng Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+C">Chao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xiaoran Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Huazhong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tambe,+M">Milind Tambe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tonghan Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> As humans increasingly share environments with diverse agents powered by RL, LLMs, and beyond, the ability to explain their policies in natural language will be vital for reliable coexistence. In this paper, we build a model-agnostic explanation generator based on an LLM. 
The technical novelty is that the rewards for training this LLM are generated by a generative flow matching model. This model has a specially designed structure with a hidden layer merged with an LLM to harness the linguistic cues of explanations into generating appropriate rewards. Experiments on both RL and LLM tasks demonstrate that our method can generate dense and effective rewards while saving on expensive human feedback; it thus enables effective explanations and even improves the accuracy of the decisions in original tasks. </p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2502.12560" title="Abstract" id="2502.12560"> arXiv:2502.12560 </a> [<a href="/pdf/2502.12560" title="Download PDF" id="pdf-2502.12560" aria-labelledby="pdf-2502.12560">pdf</a>, <a href="https://arxiv.org/html/2502.12560v1" title="View HTML" id="html-2502.12560" aria-labelledby="html-2502.12560" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12560" title="Other formats" id="oth-2502.12560" aria-labelledby="oth-2502.12560">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How does a Language-Specific Tokenizer affect LLMs? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+J">Jean Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jaeyoon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Byun,+S">SungJoo Byun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shin,+H">Hyopil Shin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The necessity of language-specific tokenizers intuitively appears crucial for effective natural language processing, yet empirical analyses on their significance and underlying reasons are lacking. 
This study explores how language-specific tokenizers influence the behavior of Large Language Models predominantly trained with English text data, through the case study of Korean. The research unfolds in two main stages: (1) the development of a Korean-specific extended tokenizer and (2) experiments to compare models with the basic tokenizer and the extended tokenizer through various Next Token Prediction tasks. Our in-depth analysis reveals that the extended tokenizer decreases confidence in incorrect predictions during generation and reduces cross-entropy in complex tasks, indicating a tendency to produce less nonsensical outputs. Consequently, the extended tokenizer provides stability during generation, potentially leading to higher performance in downstream tasks. </p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2502.12562" title="Abstract" id="2502.12562"> arXiv:2502.12562 </a> [<a href="/pdf/2502.12562" title="Download PDF" id="pdf-2502.12562" aria-labelledby="pdf-2502.12562">pdf</a>, <a href="https://arxiv.org/html/2502.12562v1" title="View HTML" id="html-2502.12562" aria-labelledby="html-2502.12562" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12562" title="Other formats" id="oth-2502.12562" aria-labelledby="oth-2502.12562">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SEA: Low-Resource Safety Alignment for Multimodal Large Language Models via Synthetic Embeddings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+W">Weikai Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+H">Hao Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuang,+H">Huiping Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Cen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Ziqian Zeng</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Cryptography and Security (cs.CR); Multimedia (cs.MM) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) have serious security vulnerabilities. While safety alignment using multimodal datasets consisting of text and data of additional modalities can effectively enhance MLLM's security, it is costly to construct these datasets. Existing low-resource security alignment methods, including textual alignment, have been found to struggle with the security risks posed by additional modalities. To address this, we propose Synthetic Embedding augmented safety Alignment (SEA), which optimizes embeddings of additional modality through gradient updates to expand textual datasets. This enables multimodal safety alignment training even when only textual data is available. Extensive experiments on image, video, and audio-based MLLMs demonstrate that SEA can synthesize a high-quality embedding on a single RTX3090 GPU within 24 seconds. SEA significantly improves the security of MLLMs when faced with threats from additional modalities. To assess the security risks introduced by video and audio, we also introduced a new benchmark called VA-SafetyBench. High attack success rates across multiple MLLMs validate its challenge. Our code and data will be available at <a href="https://github.com/ZeroNLP/SEA" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2502.12563" title="Abstract" id="2502.12563"> arXiv:2502.12563 </a> [<a href="/pdf/2502.12563" title="Download PDF" id="pdf-2502.12563" aria-labelledby="pdf-2502.12563">pdf</a>, <a href="https://arxiv.org/html/2502.12563v1" title="View HTML" id="html-2502.12563" aria-labelledby="html-2502.12563" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12563" title="Other formats" id="oth-2502.12563" aria-labelledby="oth-2502.12563">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating Language Models on Grooming Risk Estimation Using Fuzzy Theory </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bihani,+G">Geetanjali Bihani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ringenberg,+T">Tatiana Ringenberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rayz,+J">Julia Rayz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 2 figures. Accepted for publication in the Proceedings of the NAFIPS International Conference on Fuzzy Systems, Soft Computing, and Explainable AI. NAFIPS'2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Encoding implicit language presents a challenge for language models, especially in high-risk domains where maintaining high precision is important. Automated detection of online child grooming is one such critical domain, where predators manipulate victims using a combination of explicit and implicit language to convey harmful intentions. 
While recent studies have shown the potential of Transformer language models like SBERT for preemptive grooming detection, they primarily depend on surface-level features and approximate real victim grooming processes using vigilante and law enforcement conversations. The question of whether these features and approximations are reasonable has not been addressed thus far. In this paper, we address this gap and study whether SBERT can effectively discern varying degrees of grooming risk inherent in conversations, and evaluate its results across different participant groups. Our analysis reveals that while fine-tuning aids language models in learning to assign grooming scores, they show high variance in predictions, especially for contexts containing higher degrees of grooming risk. These errors appear in cases that 1) utilize indirect speech pathways to manipulate victims and 2) lack sexually explicit content. This finding underscores the necessity for robust modeling of indirect speech acts by language models, particularly those employed by predators. 
</p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2502.12565" title="Abstract" id="2502.12565"> arXiv:2502.12565 </a> [<a href="/pdf/2502.12565" title="Download PDF" id="pdf-2502.12565" aria-labelledby="pdf-2502.12565">pdf</a>, <a href="https://arxiv.org/html/2502.12565v1" title="View HTML" id="html-2502.12565" aria-labelledby="html-2502.12565" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12565" title="Other formats" id="oth-2502.12565" aria-labelledby="oth-2502.12565">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self Iterative Label Refinement via Robust Unlabeled Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Asano,+H">Hikaru Asano</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kozuno,+T">Tadashi Kozuno</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baba,+Y">Yukino Baba</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in large language models (LLMs) have yielded impressive performance on various tasks, yet they often depend on high-quality feedback that can be costly. Self-refinement methods attempt to leverage LLMs' internal evaluation mechanisms with minimal human supervision; however, these approaches frequently suffer from inherent biases and overconfidence, especially in domains where the models lack sufficient internal knowledge, resulting in performance degradation. As an initial step toward enhancing self-refinement for broader applications, we introduce an iterative refinement pipeline that employs the Unlabeled-Unlabeled learning framework to improve LLM-generated pseudo-labels for classification tasks. 
By exploiting two unlabeled datasets with differing positive class ratios, our approach iteratively denoises and refines the initial pseudo-labels, thereby mitigating the adverse effects of internal biases with minimal human supervision. Evaluations on diverse datasets, including low-resource language corpora, patent classifications, and protein structure categorizations, demonstrate that our method consistently outperforms both initial LLM's classification performance and the self-refinement approaches by cutting-edge models (e.g., GPT-4o and DeepSeek-R1). </p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2502.12568" title="Abstract" id="2502.12568"> arXiv:2502.12568 </a> [<a href="/pdf/2502.12568" title="Download PDF" id="pdf-2502.12568" aria-labelledby="pdf-2502.12568">pdf</a>, <a href="https://arxiv.org/html/2502.12568v1" title="View HTML" id="html-2502.12568" aria-labelledby="html-2502.12568" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12568" title="Other formats" id="oth-2502.12568" aria-labelledby="oth-2502.12568">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Cognitive Writing Perspective for Constrained Long-Form Text Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+K">Kaiyang Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mu,+H">Honglin Mu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+R">Rui Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+H">Haoran Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+T">Tianle Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiuying Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Like humans, Large Language Models (LLMs) struggle to generate high-quality long-form text that adheres to strict requirements in a single pass. This challenge is unsurprising, as successful human writing, according to the Cognitive Writing Theory, is a complex cognitive process involving iterative planning, translating, reviewing, and monitoring. Motivated by these cognitive principles, we aim to equip LLMs with human-like cognitive writing capabilities through CogWriter, a novel training-free framework that transforms LLM constrained long-form text generation into a systematic cognitive writing paradigm. Our framework consists of two key modules: (1) a Planning Agent that performs hierarchical planning to decompose the task, and (2) multiple Generation Agents that execute these plans in parallel. The system maintains quality via continuous monitoring and reviewing mechanisms, which evaluate outputs against specified requirements and trigger necessary revisions. CogWriter demonstrates exceptional performance on LongGenBench, a benchmark for complex constrained long-form text generation. Even when using Qwen-2.5-14B as its backbone, CogWriter surpasses GPT-4o by 22% in complex instruction completion accuracy while reliably generating texts exceeding 10,000 words. We hope this cognitive science-inspired approach provides a paradigm for LLM writing advancements: <a href="https://github.com/KaiyangWan/CogWriter" rel="external noopener nofollow" class="link-external link-https">CogWriter</a>. 
</p> </div> </dd> <dt> <a name='item58'>[58]</a> <a href ="/abs/2502.12576" title="Abstract" id="2502.12576"> arXiv:2502.12576 </a> [<a href="/pdf/2502.12576" title="Download PDF" id="pdf-2502.12576" aria-labelledby="pdf-2502.12576">pdf</a>, <a href="https://arxiv.org/html/2502.12576v1" title="View HTML" id="html-2502.12576" aria-labelledby="html-2502.12576" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12576" title="Other formats" id="oth-2502.12576" aria-labelledby="oth-2502.12576">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Fuzzy Evaluation of Sentence Encoders on Grooming Risk Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bihani,+G">Geetanjali Bihani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rayz,+J">Julia Rayz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 2 figures. Accepted for publication in the Proceedings of the NAFIPS International Conference on Fuzzy Systems, Soft Computing, and Explainable AI. NAFIPS'2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> With the advent of social media, children are becoming increasingly vulnerable to the risk of grooming in online settings. Detecting grooming instances in an online conversation poses a significant challenge as the interactions are not necessarily sexually explicit, since the predators take time to build trust and a relationship with their victim. Moreover, predators evade detection using indirect and coded language. 
While previous studies have fine-tuned Transformers to automatically identify grooming in chat conversations, they overlook the impact of coded and indirect language on model predictions, and how these align with human perceptions of grooming. In this paper, we address this gap and evaluate bi-encoders on the task of classifying different degrees of grooming risk in chat contexts, for three different participant groups, i.e. law enforcement officers, real victims, and decoys. Using a fuzzy-theoretic framework, we map human assessments of grooming behaviors to estimate the actual degree of grooming risk. Our analysis reveals that fine-tuned models fail to tag instances where the predator uses indirect speech pathways and coded language to evade detection. Further, we find that such instances are characterized by a higher presence of out-of-vocabulary (OOV) words in samples, causing the model to misclassify. Our findings highlight the need for more robust models to identify coded language from noisy chat inputs in grooming contexts. 
</p> </div> </dd> <dt> <a name='item59'>[59]</a> <a href ="/abs/2502.12583" title="Abstract" id="2502.12583"> arXiv:2502.12583 </a> [<a href="/pdf/2502.12583" title="Download PDF" id="pdf-2502.12583" aria-labelledby="pdf-2502.12583">pdf</a>, <a href="https://arxiv.org/html/2502.12583v1" title="View HTML" id="html-2502.12583" aria-labelledby="html-2502.12583" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12583" title="Other formats" id="oth-2502.12583" aria-labelledby="oth-2502.12583">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LongFaith: Enhancing Long-Context Reasoning in LLMs with Faithful Synthetic Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+C">Cehao Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+X">Xueyuan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chengjin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xuhui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shengjie Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aofan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Hui Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jian Guo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite the growing development of long-context large language models (LLMs), data-centric approaches relying on synthetic data have been hindered by issues related to faithfulness, which limit their effectiveness in enhancing model performance on tasks such as long-context reasoning and question answering (QA). 
These challenges are often exacerbated by misinformation caused by lack of verification, reasoning without attribution, and potential knowledge conflicts. We propose LongFaith, a novel pipeline for synthesizing faithful long-context reasoning instruction datasets. By integrating ground truth and citation-based reasoning prompts, we eliminate distractions and improve the accuracy of reasoning chains, thus mitigating the need for costly verification processes. We open-source two synthesized datasets, LongFaith-SFT and LongFaith-PO, which systematically address multiple dimensions of faithfulness, including verified reasoning, attribution, and contextual grounding. Extensive experiments on multi-hop reasoning datasets and LongBench demonstrate that models fine-tuned on these datasets significantly improve performance. Our ablation studies highlight the scalability and adaptability of the LongFaith pipeline, showcasing its broad applicability in developing long-context LLMs. </p> </div> </dd> <dt> <a name='item60'>[60]</a> <a href ="/abs/2502.12587" title="Abstract" id="2502.12587"> arXiv:2502.12587 </a> [<a href="/pdf/2502.12587" title="Download PDF" id="pdf-2502.12587" aria-labelledby="pdf-2502.12587">pdf</a>, <a href="https://arxiv.org/html/2502.12587v1" title="View HTML" id="html-2502.12587" aria-labelledby="html-2502.12587" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12587" title="Other formats" id="oth-2502.12587" aria-labelledby="oth-2502.12587">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RSMLP: A light Sampled MLP Structure for Incomplete Utterance Rewrite </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lunjun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+W">Weilai Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yaonan Wang</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The Incomplete Utterance Rewriting (IUR) task has garnered significant attention in recent years. Its goal is to reconstruct conversational utterances to better align with the current context, thereby enhancing comprehension. In this paper, we introduce a novel and versatile lightweight method, Rewritten-Sampled MLP (RSMLP). By employing an MLP based architecture with a carefully designed down-sampling strategy, RSMLP effectively extracts latent semantic information between utterances and makes appropriate edits to restore incomplete utterances. Due to its simple yet efficient structure, our method achieves competitive performance on public IUR datasets and in real-world applications. </p> </div> </dd> <dt> <a name='item61'>[61]</a> <a href ="/abs/2502.12594" title="Abstract" id="2502.12594"> arXiv:2502.12594 </a> [<a href="/pdf/2502.12594" title="Download PDF" id="pdf-2502.12594" aria-labelledby="pdf-2502.12594">pdf</a>, <a href="https://arxiv.org/html/2502.12594v1" title="View HTML" id="html-2502.12594" aria-labelledby="html-2502.12594" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12594" title="Other formats" id="oth-2502.12594" aria-labelledby="oth-2502.12594">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PASER: Post-Training Data Selection for Efficient Pruned Large Language Model Recovery </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+L">Lihao Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhen,+H">Hui-Ling Zhen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaokun Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chen Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Model pruning is an effective approach for compressing large language models. However, this process often leads to significant degradation of model capabilities. While post-training techniques such as instruction tuning are commonly employed to recover model performance, existing methods often overlook the uneven deterioration of model capabilities and incur high computational costs. Moreover, some instruction data irrelevant to model capability recovery may introduce negative effects. To address these challenges, we propose the \textbf{P}ost-training d\textbf{A}ta \textbf{S}election method for \textbf{E}fficient pruned large language model \textbf{R}ecovery (\textbf{PASER}). PASER aims to identify instructions where model capabilities are most severely compromised within a certain recovery data budget. Our approach first applies manifold learning and spectral clustering to group recovery data in the semantic space, revealing capability-specific instruction sets. We then adaptively allocate the data budget to different clusters based on the degrees of model capability degradation. In each cluster, we prioritize data samples where model performance has declined dramatically. To mitigate potential negative transfer, we also detect and filter out conflicting or irrelevant recovery data. Extensive experiments demonstrate that PASER significantly outperforms conventional baselines, effectively recovering the general capabilities of pruned LLMs while utilizing merely 4\%-20\% of the original post-training data. 
</p> </div> </dd> <dt> <a name='item62'>[62]</a> <a href ="/abs/2502.12598" title="Abstract" id="2502.12598"> arXiv:2502.12598 </a> [<a href="/pdf/2502.12598" title="Download PDF" id="pdf-2502.12598" aria-labelledby="pdf-2502.12598">pdf</a>, <a href="https://arxiv.org/html/2502.12598v1" title="View HTML" id="html-2502.12598" aria-labelledby="html-2502.12598" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12598" title="Other formats" id="oth-2502.12598" aria-labelledby="oth-2502.12598">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bring Your Own Knowledge: A Survey of Methods for LLM Knowledge Expansion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Mingyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stoll,+A">Alisa Stoll</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lange,+L">Lukas Lange</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adel,+H">Heike Adel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sch%C3%BCtze,+H">Hinrich Schütze</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Str%C3%B6tgen,+J">Jannik Strötgen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Adapting large language models (LLMs) to new and diverse knowledge is essential for their lasting effectiveness in real-world applications. This survey provides an overview of state-of-the-art methods for expanding the knowledge of LLMs, focusing on integrating various knowledge types, including factual information, domain expertise, language proficiency, and user preferences. 
We explore techniques, such as continual learning, model editing, and retrieval-based explicit adaptation, while discussing challenges like knowledge consistency and scalability. Designed as a guide for researchers and practitioners, this survey sheds light on opportunities for advancing LLMs as adaptable and robust knowledge systems. </p> </div> </dd> <dt> <a name='item63'>[63]</a> <a href ="/abs/2502.12601" title="Abstract" id="2502.12601"> arXiv:2502.12601 </a> [<a href="/pdf/2502.12601" title="Download PDF" id="pdf-2502.12601" aria-labelledby="pdf-2502.12601">pdf</a>, <a href="https://arxiv.org/html/2502.12601v1" title="View HTML" id="html-2502.12601" aria-labelledby="html-2502.12601" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12601" title="Other formats" id="oth-2502.12601" aria-labelledby="oth-2502.12601">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> COPU: Conformal Prediction for Uncertainty Quantification in Natural Language Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sean Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yicheng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yuxin Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+L">Lu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanjie Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Uncertainty Quantification (UQ) for Natural Language Generation (NLG) is crucial for assessing the performance of Large Language Models (LLMs), as it reveals confidence in predictions, identifies failure modes, and gauges output reliability. 
Conformal Prediction (CP), a model-agnostic method that generates prediction sets with a specified error rate, has been adopted for UQ in classification tasks, where the size of the prediction set indicates the model's uncertainty. However, when adapting CP to NLG, the sampling-based method for generating candidate outputs cannot guarantee the inclusion of the ground truth, limiting its applicability across a wide range of error rates. To address this, we propose \ourmethod, a method that explicitly adds the ground truth to the candidate outputs and uses logit scores to measure nonconformity. Our experiments with six LLMs on four NLG tasks show that \ourmethod outperforms baseline methods in calibrating error rates and empirical cover rates, offering accurate UQ across a wide range of user-specified error rates. </p> </div> </dd> <dt> <a name='item64'>[64]</a> <a href ="/abs/2502.12611" title="Abstract" id="2502.12611"> arXiv:2502.12611 </a> [<a href="/pdf/2502.12611" title="Download PDF" id="pdf-2502.12611" aria-labelledby="pdf-2502.12611">pdf</a>, <a href="/format/2502.12611" title="Other formats" id="oth-2502.12611" aria-labelledby="oth-2502.12611">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Who Writes What: Unveiling the Impact of Author Roles on AI-generated Text Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiatao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiaojun Wan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rise of Large Language Models (LLMs) necessitates accurate AI-generated text detection. 
However, current approaches largely overlook the influence of author characteristics. We investigate how sociolinguistic attributes-gender, CEFR proficiency, academic field, and language environment-impact state-of-the-art AI text detectors. Using the ICNALE corpus of human-authored texts and parallel AI-generated texts from diverse LLMs, we conduct a rigorous evaluation employing multi-factor ANOVA and weighted least squares (WLS). Our results reveal significant biases: CEFR proficiency and language environment consistently affected detector accuracy, while gender and academic field showed detector-dependent effects. These findings highlight the crucial need for socially aware AI text detection to avoid unfairly penalizing specific demographic groups. We offer novel empirical evidence, a robust statistical framework, and actionable insights for developing more equitable and reliable detection systems in real-world, out-of-domain contexts. This work paves the way for future research on bias mitigation, inclusive evaluation benchmarks, and socially responsible LLM detectors. 
</p> </div> </dd> <dt> <a name='item65'>[65]</a> <a href ="/abs/2502.12614" title="Abstract" id="2502.12614"> arXiv:2502.12614 </a> [<a href="/pdf/2502.12614" title="Download PDF" id="pdf-2502.12614" aria-labelledby="pdf-2502.12614">pdf</a>, <a href="https://arxiv.org/html/2502.12614v1" title="View HTML" id="html-2502.12614" aria-labelledby="html-2502.12614" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12614" title="Other formats" id="oth-2502.12614" aria-labelledby="oth-2502.12614">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Label Drop for Multi-Aspect Relation Modeling in Universal Information Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Lu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiajia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ci,+E">En Ci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lefei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zuchao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Ping Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL-main 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Universal Information Extraction (UIE) has garnered significant attention due to its ability to address model explosion problems effectively. Extractive UIE can achieve strong performance using a relatively small model, making it widely adopted. Extractive UIEs generally rely on task instructions for different tasks, including single-target instructions and multiple-target instructions. 
Single-target instruction UIE enables the extraction of only one type of relation at a time, limiting its ability to model correlations between relations and thus restricting its capability to extract complex relations. While multiple-target instruction UIE allows for the extraction of multiple relations simultaneously, the inclusion of irrelevant relations introduces decision complexity and impacts extraction accuracy. Therefore, for multi-relation extraction, we propose LDNet, which incorporates multi-aspect relation modeling and a label drop mechanism. By assigning different relations to different levels for understanding and decision-making, we reduce decision confusion. Additionally, the label drop mechanism effectively mitigates the impact of irrelevant relations. Experiments show that LDNet outperforms or achieves competitive performance with state-of-the-art systems on 9 tasks, 33 datasets, in both single-modal and multi-modal, few-shot and zero-shot settings.\footnote{<a href="https://github.com/Lu-Yang666/LDNet" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </p> </div> </dd> <dt> <a name='item66'>[66]</a> <a href ="/abs/2502.12616" title="Abstract" id="2502.12616"> arXiv:2502.12616 </a> [<a href="/pdf/2502.12616" title="Download PDF" id="pdf-2502.12616" aria-labelledby="pdf-2502.12616">pdf</a>, <a href="https://arxiv.org/html/2502.12616v1" title="View HTML" id="html-2502.12616" aria-labelledby="html-2502.12616" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12616" title="Other formats" id="oth-2502.12616" aria-labelledby="oth-2502.12616">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Chain-of-Thought Reasoning via Quasi-Symbolic Abstractions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ranaldi,+L">Leonardo Ranaldi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Polonsky,+A">Alexander Polonsky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">André Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Chain-of-Thought (CoT) represents a common strategy for reasoning in Large Language Models (LLMs) by decomposing complex tasks into intermediate inference steps. However, explanations generated via CoT are susceptible to content biases that negatively affect their robustness and faithfulness. To mitigate existing limitations, recent work has proposed using logical formalisms coupled with external symbolic solvers. However, fully symbolic approaches possess the bottleneck of requiring a complete translation from natural language to formal languages, a process that affects efficiency and flexibility. To achieve a trade-off, this paper investigates methods to disentangle content from logical reasoning without a complete formalisation. In particular, we present QuaSAR (for Quasi-Symbolic Abstract Reasoning), a variation of CoT that guides LLMs to operate at a higher level of abstraction via quasi-symbolic explanations. Our framework leverages the capability of LLMs to formalise only relevant variables and predicates, enabling the coexistence of symbolic elements with natural language. We show the impact of QuaSAR for in-context learning and for constructing demonstrations to improve the reasoning capabilities of smaller models. Our experiments show that quasi-symbolic abstractions can improve CoT-based methods by up to 8% accuracy, enhancing robustness and consistency on challenging adversarial variations on both natural language (i.e. MMLU-Redux) and symbolic reasoning tasks (i.e., GSM-Symbolic). 
</p> </div> </dd> <dt> <a name='item67'>[67]</a> <a href ="/abs/2502.12633" title="Abstract" id="2502.12633"> arXiv:2502.12633 </a> [<a href="/pdf/2502.12633" title="Download PDF" id="pdf-2502.12633" aria-labelledby="pdf-2502.12633">pdf</a>, <a href="https://arxiv.org/html/2502.12633v1" title="View HTML" id="html-2502.12633" aria-labelledby="html-2502.12633" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12633" title="Other formats" id="oth-2502.12633" aria-labelledby="oth-2502.12633">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> \textit{One Size doesn't Fit All}: A Personalized Conversational Tutoring Agent for Mathematics Instruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Ben Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jihan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+F">Fangquan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+X">Xu Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+M">Min Peng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have been increasingly employed in various intelligent educational systems, simulating human tutors to facilitate effective human-machine interaction. However, previous studies often overlook the significance of recognizing and adapting to individual learner characteristics. Such adaptation is crucial for enhancing student engagement and learning efficiency, particularly in mathematics instruction, where diverse learning styles require personalized strategies to promote comprehension and enthusiasm. 
In this paper, we propose a \textbf{P}erson\textbf{A}lized \textbf{C}onversational tutoring ag\textbf{E}nt (PACE) for mathematics instruction. PACE simulates students' learning styles based on the Felder and Silverman learning style model, aligning with each student's persona. In this way, our PACE can effectively assess the personality of students, allowing to develop individualized teaching strategies that resonate with their unique learning styles. To further enhance students' comprehension, PACE employs the Socratic teaching method to provide instant feedback and encourage deep thinking. By constructing personalized teaching data and training models, PACE demonstrates the ability to identify and adapt to the unique needs of each student, significantly improving the overall learning experience and outcomes. Moreover, we establish multi-aspect evaluation criteria and conduct extensive analysis to assess the performance of personalized teaching. Experimental results demonstrate the superiority of our model in personalizing the educational experience and motivating students compared to existing methods. 
</p> </div> </dd> <dt> <a name='item68'>[68]</a> <a href ="/abs/2502.12658" title="Abstract" id="2502.12658"> arXiv:2502.12658 </a> [<a href="/pdf/2502.12658" title="Download PDF" id="pdf-2502.12658" aria-labelledby="pdf-2502.12658">pdf</a>, <a href="https://arxiv.org/html/2502.12658v1" title="View HTML" id="html-2502.12658" aria-labelledby="html-2502.12658" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12658" title="Other formats" id="oth-2502.12658" aria-labelledby="oth-2502.12658">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> R.R.: Unveiling LLM Training Privacy through Recollection and Ranking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+W">Wenlong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhenyuan Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+L">Lenan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+C">Chen Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Wenyan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Weixian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+C">Chengkun Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wenzhi Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) pose significant privacy risks, potentially leaking training data due to implicit memorization. 
Existing privacy attacks primarily focus on membership inference attacks (MIAs) or data extraction attacks, but reconstructing specific personally identifiable information (PII) in LLM's training data remains challenging. In this paper, we propose R.R. (Recollect and Rank), a novel two-step privacy stealing attack that enables attackers to reconstruct PII entities from scrubbed training data where the PII entities have been masked. In the first stage, we introduce a prompt paradigm named recollection, which instructs the LLM to repeat a masked text but fill in masks. Then we can use PII identifiers to extract recollected PII candidates. In the second stage, we design a new criterion to score each PII candidate and rank them. Motivated by membership inference, we leverage the reference model as a calibration to our criterion. Experiments across three popular PII datasets demonstrate that the R.R. achieves better PII identification performance compared to baselines. These results highlight the vulnerability of LLMs to PII leakage even when training data has been scrubbed. We release the replication package of R.R. at a link. 
</p> </div> </dd> <dt> <a name='item69'>[69]</a> <a href ="/abs/2502.12663" title="Abstract" id="2502.12663"> arXiv:2502.12663 </a> [<a href="/pdf/2502.12663" title="Download PDF" id="pdf-2502.12663" aria-labelledby="pdf-2502.12663">pdf</a>, <a href="https://arxiv.org/html/2502.12663v1" title="View HTML" id="html-2502.12663" aria-labelledby="html-2502.12663" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12663" title="Other formats" id="oth-2502.12663" aria-labelledby="oth-2502.12663">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Demystifying Multilingual Chain-of-Thought in Process Reward Modeling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weixuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+M">Minghao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haddow,+B">Barry Haddow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Birch,+A">Alexandra Birch</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) are designed to perform a wide range of tasks. To improve their ability to solve complex problems requiring multi-step reasoning, recent research leverages process reward modeling to provide fine-grained feedback at each step of the reasoning process for reinforcement learning (RL), but it predominantly focuses on English. In this paper, we tackle the critical challenge of extending process reward models (PRMs) to multilingual settings. To achieve this, we train multilingual PRMs on a dataset spanning seven languages, which is translated from English. 
Through comprehensive evaluations on two widely used reasoning benchmarks across 11 languages, we demonstrate that multilingual PRMs not only improve average accuracy but also reduce early-stage reasoning errors. Furthermore, our results highlight the sensitivity of multilingual PRMs to both the number of training languages and the volume of English data, while also uncovering the benefits arising from more candidate responses and trainable parameters. This work opens promising avenues for robust multilingual applications in complex, multi-step reasoning tasks. In addition, we release the code to foster research along this line. </p> </div> </dd> <dt> <a name='item70'>[70]</a> <a href ="/abs/2502.12665" title="Abstract" id="2502.12665"> arXiv:2502.12665 </a> [<a href="/pdf/2502.12665" title="Download PDF" id="pdf-2502.12665" aria-labelledby="pdf-2502.12665">pdf</a>, <a href="https://arxiv.org/html/2502.12665v1" title="View HTML" id="html-2502.12665" aria-labelledby="html-2502.12665" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12665" title="Other formats" id="oth-2502.12665" aria-labelledby="oth-2502.12665">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A$^2$ATS: Retrieval-Based KV Cache Reduction via Windowed Rotary Position Embedding and Query-Aware Vector Quantization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Junhui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+J">Junna Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Nan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Rui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shangyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Peng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qiang 
Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+C+J">Chun Jason Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qingan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Long context large language models (LLMs) pose significant challenges for efficient serving due to the large memory footprint and high access overhead of KV cache. Retrieval-based KV cache reduction methods can mitigate these challenges, typically by offloading the complete KV cache to CPU and retrieving necessary tokens on demand during inference. However, these methods still suffer from unsatisfactory accuracy degradation and extra retrieval overhead. To address these limitations, this paper proposes A$^2$ATS, a novel retrieval-based KV cache reduction method. A$^2$ATS aims to obtain an accurate approximation of attention scores by applying the vector quantization technique to key states, thereby enabling efficient and precise retrieval of the top-K tokens. First, we propose Windowed Rotary Position Embedding, which decouples the positional dependency from query and key states after position embedding. Then, we propose query-aware vector quantization that optimizes the objective of attention score approximation directly. Finally, we design the heterogeneous inference architecture for KV cache offloading, enabling long context serving with larger batch sizes. Experimental results demonstrate that A$^2$ATS can achieve a lower performance degradation with similar or lower overhead compared to existing methods, thereby increasing long context serving throughput by up to $2.7 \times$. 
</p> </div> </dd> <dt> <a name='item71'>[71]</a> <a href ="/abs/2502.12668" title="Abstract" id="2502.12668"> arXiv:2502.12668 </a> [<a href="/pdf/2502.12668" title="Download PDF" id="pdf-2502.12668" aria-labelledby="pdf-2502.12668">pdf</a>, <a href="https://arxiv.org/html/2502.12668v1" title="View HTML" id="html-2502.12668" aria-labelledby="html-2502.12668" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12668" title="Other formats" id="oth-2502.12668" aria-labelledby="oth-2502.12668">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluation of Best-of-N Sampling Strategies for Language Model Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ichihara,+Y">Yuki Ichihara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jinnai,+Y">Yuu Jinnai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Morimura,+T">Tetsuro Morimura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ariu,+K">Kaito Ariu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abe,+K">Kenshi Abe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sakamoto,+M">Mitsuki Sakamoto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Uchibe,+E">Eiji Uchibe</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Transactions on Machine Learning Research (TMLR), 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Best-of-N (BoN) sampling with a reward model has been shown to be an effective strategy for aligning Large Language Models (LLMs) with human preferences at the time of decoding. BoN sampling is susceptible to a problem known as reward hacking. 
Since the reward model is an imperfect proxy for the true objective, an excessive focus on optimizing its value can lead to a compromise of its performance on the true objective. Previous work proposes Regularized BoN sampling (RBoN), a BoN sampling with regularization to the objective, and shows empirically that it outperforms BoN sampling by mitigating reward hacking (Jinnai et al., 2024). However, Jinnai et al. (2024) introduce RBoN based on a heuristic and they lack the analysis of why such a regularization strategy improves the performance of BoN sampling. The aim of this study is to analyze the effect of regularization strategies on BoN sampling. Using the regularization strategies corresponds to robust optimization, which maximizes the worst case over a set of possible perturbations in the proxy reward. Although the theoretical guarantees are not directly applicable to RBoN, RBoN corresponds to a practical implementation. This paper proposes an extension of the RBoN framework, called Stochastic RBoN sampling (SRBoN), which is a theoretically guaranteed approach to worst-case RBoN in proxy reward. We then perform an empirical evaluation using the AlpacaFarm and Anthropic's hh-rlhf datasets to evaluate which factors of the regularization strategies contribute to the improvement of the true proxy reward. In addition, we also propose another simple RBoN method, the Sentence Length Regularized BoN, which has better performance in the experiment as compared to the previous methods. 
</p> </div> </dd> <dt> <a name='item72'>[72]</a> <a href ="/abs/2502.12671" title="Abstract" id="2502.12671"> arXiv:2502.12671 </a> [<a href="/pdf/2502.12671" title="Download PDF" id="pdf-2502.12671" aria-labelledby="pdf-2502.12671">pdf</a>, <a href="https://arxiv.org/html/2502.12671v1" title="View HTML" id="html-2502.12671" aria-labelledby="html-2502.12671" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12671" title="Other formats" id="oth-2502.12671" aria-labelledby="oth-2502.12671">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Baichuan-M1: Pushing the Medical Capability of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bingning Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haizhou Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Huozhi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Liang Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+M">Mingyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+W">Wei Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+X">Xiangrong Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yupeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+Y">Yuqi Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zecheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Z">Zhengyun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+D">Da Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kou,+F">Fei Kou</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+F">Fei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+F">Fuzhong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+G">Guosheng Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Han Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongda Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Jin He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jinjie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+K">Kangxi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+K">Kegeng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+L">Lei Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niu,+L">Linlin Niu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Linzhuang Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Mang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+P">Pengcheng Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Q">Qianli Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xin,+R">Rihui Xin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+S">Shunya Dang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Songchi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Weipeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Wenjing Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Men,+X">Xin Men</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+X">Xionghai Lin</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xuezhen Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+Y">Yifei Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuyan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zhi Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zhiying Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 33 pages, technical report </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The current generation of large language models (LLMs) is typically designed for broad, general-purpose applications, while domain-specific LLMs, especially in vertical fields like medicine, remain relatively scarce. In particular, the development of highly efficient and practical LLMs for the medical domain is challenging due to the complexity of medical knowledge and the limited availability of high-quality data. To bridge this gap, we introduce Baichuan-M1, a series of large language models specifically optimized for medical applications. Unlike traditional approaches that simply continue pretraining on existing models or apply post-training to a general base model, Baichuan-M1 is trained from scratch with a dedicated focus on enhancing medical capabilities. Our model is trained on 20 trillion tokens and incorporates a range of effective training methods that strike a balance between general capabilities and medical expertise. As a result, Baichuan-M1 not only performs strongly across general domains such as mathematics and coding but also excels in specialized medical fields. 
We have open-sourced Baichuan-M1-14B, a mini version of our model, which can be accessed through the following links. </p> </div> </dd> <dt> <a name='item73'>[73]</a> <a href ="/abs/2502.12672" title="Abstract" id="2502.12672"> arXiv:2502.12672 </a> [<a href="/pdf/2502.12672" title="Download PDF" id="pdf-2502.12672" aria-labelledby="pdf-2502.12672">pdf</a>, <a href="https://arxiv.org/html/2502.12672v1" title="View HTML" id="html-2502.12672" aria-labelledby="html-2502.12672" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12672" title="Other formats" id="oth-2502.12672" aria-labelledby="oth-2502.12672">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Speech-FT: A Fine-tuning Strategy for Enhancing Speech Representation Models Without Compromising Generalization Ability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+T">Tzu-Quan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wei-Ping Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Hao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hung-yi Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Speech representation models are highly effective at extracting general features for various tasks. While fine-tuning can enhance these representations for specific applications, it often compromises their generalization ability. To address this challenge, we propose Speech-FT, a fine-tuning strategy for speech representation models that leverages model merging to preserve generalization ability while still benefiting from fine-tuning. 
Speech-FT is effective across different fine-tuning scenarios and is compatible with various types of speech representation models, providing a versatile solution. Speech-FT offers an efficient and practical approach to further improving general speech representations after pre-training. </p> </div> </dd> <dt> <a name='item74'>[74]</a> <a href ="/abs/2502.12685" title="Abstract" id="2502.12685"> arXiv:2502.12685 </a> [<a href="/pdf/2502.12685" title="Download PDF" id="pdf-2502.12685" aria-labelledby="pdf-2502.12685">pdf</a>, <a href="/format/2502.12685" title="Other formats" id="oth-2502.12685" aria-labelledby="oth-2502.12685">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Theoretical Guarantees for Minimum Bayes Risk Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ichihara,+Y">Yuki Ichihara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jinnai,+Y">Yuu Jinnai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ariu,+K">Kaito Ariu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Morimura,+T">Tetsuro Morimura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Uchibe,+E">Eiji Uchibe</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Minimum Bayes Risk (MBR) decoding optimizes output selection by maximizing the expected utility value of an underlying human distribution. While prior work has shown the effectiveness of MBR decoding through empirical evaluation, few studies have analytically investigated why the method is effective. 
As a result of our analysis, we show that, given the size $n$ of the reference hypothesis set used in computation, MBR decoding approaches the optimal solution with high probability at a rate of $O\left(n^{-\frac{1}{2}}\right)$, under certain assumptions, even though the language space $Y$ is significantly larger $Y\gg n$. This result helps to theoretically explain the strong performance observed in several prior empirical studies on MBR decoding. In addition, we provide the performance gap for maximum-a-posteriori (MAP) decoding and compare it to MBR decoding. The result of this paper indicates that MBR decoding tends to converge to the optimal solution faster than MAP decoding in several cases. </p> </div> </dd> <dt> <a name='item75'>[75]</a> <a href ="/abs/2502.12700" title="Abstract" id="2502.12700"> arXiv:2502.12700 </a> [<a href="/pdf/2502.12700" title="Download PDF" id="pdf-2502.12700" aria-labelledby="pdf-2502.12700">pdf</a>, <a href="https://arxiv.org/html/2502.12700v1" title="View HTML" id="html-2502.12700" aria-labelledby="html-2502.12700" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12700" title="Other formats" id="oth-2502.12700" aria-labelledby="oth-2502.12700">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Novelty: Improve the Diversity and Novelty of Contents Generated by Large Language Models via inference-time Multi-Views Brainstorming </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lagzian,+A">Arash Lagzian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anumasa,+S">Srinivas Anumasa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+D">Dianbo Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate 
remarkable proficiency in generating accurate and fluent text. However, they often struggle with diversity and novelty, leading to repetitive or overly deterministic responses. These limitations stem from constraints in training data, including gaps in specific knowledge domains, outdated information, and an over-reliance on textual sources. Such shortcomings reduce their effectiveness in tasks requiring creativity, multi-perspective reasoning, and exploratory thinking, such as LLM-based AI scientist agents and creative artist agents. To address this challenge, we introduce an inference-time multi-view brainstorming method, a novel approach that enriches input prompts with diverse perspectives derived from both textual and visual sources, which we refer to as "Multi-Novelty". By incorporating additional contextual information as diverse starting points for chains of thought, this method enhances the variety and creativity of generated outputs. Importantly, our approach is model-agnostic, requiring no architectural modifications and being compatible with both open-source and proprietary LLMs. 
</p> </div> </dd> <dt> <a name='item76'>[76]</a> <a href ="/abs/2502.12701" title="Abstract" id="2502.12701"> arXiv:2502.12701 </a> [<a href="/pdf/2502.12701" title="Download PDF" id="pdf-2502.12701" aria-labelledby="pdf-2502.12701">pdf</a>, <a href="https://arxiv.org/html/2502.12701v1" title="View HTML" id="html-2502.12701" aria-labelledby="html-2502.12701" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12701" title="Other formats" id="oth-2502.12701" aria-labelledby="oth-2502.12701">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Translate Smart, not Hard: Cascaded Translation Systems with Quality-Aware Deferral </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Farinhas,+A">Ant贸nio Farinhas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guerreiro,+N+M">Nuno M. Guerreiro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agrawal,+S">Sweta Agrawal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rei,+R">Ricardo Rei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+A+F">Andr茅 F.T. Martins</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Larger models often outperform smaller ones but come with high computational costs. Cascading offers a potential solution. By default, it uses smaller models and defers only some instances to larger, more powerful models. However, designing effective deferral rules remains a challenge. In this paper, we propose a simple yet effective approach for machine translation, using existing quality estimation (QE) metrics as deferral rules. 
We show that QE-based deferral allows a cascaded system to match the performance of a larger model while invoking it for a small fraction (30% to 50%) of the examples, significantly reducing computational costs. We validate this approach through both automatic and human evaluation. </p> </div> </dd> <dt> <a name='item77'>[77]</a> <a href ="/abs/2502.12714" title="Abstract" id="2502.12714"> arXiv:2502.12714 </a> [<a href="/pdf/2502.12714" title="Download PDF" id="pdf-2502.12714" aria-labelledby="pdf-2502.12714">pdf</a>, <a href="https://arxiv.org/html/2502.12714v1" title="View HTML" id="html-2502.12714" aria-labelledby="html-2502.12714" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12714" title="Other formats" id="oth-2502.12714" aria-labelledby="oth-2502.12714">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Playing with Voices: Tabletop Role-Playing Game Recordings as a Diarization Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Remme,+L">Lian Remme</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+K">Kevin Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 14 figures, published in NAACL Findings 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD) </div> <p class='mathjax'> This paper provides a proof of concept that audio of tabletop role-playing games (TTRPG) could serve as a challenge for diarization systems. TTRPGs are carried out mostly by conversation. Participants often alter their voices to indicate that they are talking as a fictional character. Audio processing systems are susceptible to voice conversion with or without technological assistance. 
TTRPG present a conversational phenomenon in which voice conversion is an inherent characteristic for an immersive gaming experience. This could make it more challenging for diarizers to pick the real speaker and determine that impersonating is just that. We present the creation of a small TTRPG audio dataset and compare it against the AMI and the ICSI corpus. The performance of two diarizers, <a href="http://pyannote.audio" rel="external noopener nofollow" class="link-external link-http">this http URL</a> and wespeaker, were evaluated. We observed that TTRPGs' properties result in a higher confusion rate for both diarizers. Additionally, wespeaker strongly underestimates the number of speakers in the TTRPG audio files. We propose TTRPG audio as a promising challenge for diarization systems. </p> </div> </dd> <dt> <a name='item78'>[78]</a> <a href ="/abs/2502.12737" title="Abstract" id="2502.12737"> arXiv:2502.12737 </a> [<a href="/pdf/2502.12737" title="Download PDF" id="pdf-2502.12737" aria-labelledby="pdf-2502.12737">pdf</a>, <a href="https://arxiv.org/html/2502.12737v1" title="View HTML" id="html-2502.12737" aria-labelledby="html-2502.12737" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12737" title="Other formats" id="oth-2502.12737" aria-labelledby="oth-2502.12737">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Seen Data: Improving KBQA Generalization Through Schema-Guided Logical Form Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shengxiang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lau,+J+H">Jey Han Lau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+J">Jianzhong Qi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Knowledge base question answering (KBQA) aims to answer user questions in natural language using rich human knowledge stored in large KBs. As current KBQA methods struggle with unseen knowledge base elements at test time, we introduce SG-KBQA: a novel model that injects schema contexts into entity retrieval and logical form generation to tackle this issue. It uses the richer semantics and awareness of the knowledge base structure provided by schema contexts to enhance generalizability. We show that SG-KBQA achieves strong generalizability, outperforming state-of-the-art models on two commonly used benchmark datasets across a variety of test settings. Code will be released upon paper publication. </p> </div> </dd> <dt> <a name='item79'>[79]</a> <a href ="/abs/2502.12743" title="Abstract" id="2502.12743"> arXiv:2502.12743 </a> [<a href="/pdf/2502.12743" title="Download PDF" id="pdf-2502.12743" aria-labelledby="pdf-2502.12743">pdf</a>, <a href="https://arxiv.org/html/2502.12743v1" title="View HTML" id="html-2502.12743" aria-labelledby="html-2502.12743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12743" title="Other formats" id="oth-2502.12743" aria-labelledby="oth-2502.12743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> "I know myself better, but not really greatly": Using LLMs to Detect and Explain LLM-Generated Texts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+J">Jiazhou Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jie Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+W">Weidong Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Z">Zheng Huang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xinru Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xiaoyu Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruizhe Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shujun Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated impressive capabilities in generating human-like texts, but the potential misuse of such LLM-generated texts raises the need to distinguish between human-generated and LLM-generated content. This paper explores the detection and explanation capabilities of LLM-based detectors of LLM-generated texts, in the context of a binary classification task (human-generated texts vs LLM-generated texts) and a ternary classification task (human-generated texts, LLM-generated texts, and undecided). By evaluating on six close/open-source LLMs with different sizes, our findings reveal that while self-detection consistently outperforms cross-detection, i.e., LLMs can detect texts generated by themselves more accurately than those generated by other LLMs, the performance of self-detection is still far from ideal, indicating that further improvements are needed. We also show that extending the binary to the ternary classification task with a new class "Undecided" can enhance both detection accuracy and explanation quality, with improvements being statistically significant and consistent across all LLMs. 
We finally conducted comprehensive qualitative and quantitative analyses on the explanation errors, which are categorized into three types: reliance on inaccurate features (the most frequent error), hallucinations, and incorrect reasoning. These findings with our human-annotated dataset emphasize the need for further research into improving both self-detection and self-explanation, particularly to address overfitting issues that may hinder generalization. </p> </div> </dd> <dt> <a name='item80'>[80]</a> <a href ="/abs/2502.12744" title="Abstract" id="2502.12744"> arXiv:2502.12744 </a> [<a href="/pdf/2502.12744" title="Download PDF" id="pdf-2502.12744" aria-labelledby="pdf-2502.12744">pdf</a>, <a href="https://arxiv.org/html/2502.12744v1" title="View HTML" id="html-2502.12744" aria-labelledby="html-2502.12744" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12744" title="Other formats" id="oth-2502.12744" aria-labelledby="oth-2502.12744">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Enhanced Reasoning Training: Activating Latent Reasoning in Small Models for Enhanced Reasoning Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Bingyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhitao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+N">Ning Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Minchuan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+T">Tao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shaojun 
Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+J">Jing Xiao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by the 50th IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid advancement of large language models (LLMs) has significantly enhanced their reasoning abilities, enabling increasingly complex tasks. However, these capabilities often diminish in smaller, more computationally efficient models like GPT-2. Recent research shows that reasoning distillation can help small models acquire reasoning capabilities, but most existing methods focus primarily on improving teacher-generated reasoning paths. Our observations reveal that small models can generate high-quality reasoning paths during sampling, even without chain-of-thought prompting, though these paths are often latent due to their low probability under standard decoding strategies. To address this, we propose Self-Enhanced Reasoning Training (SERT), which activates and leverages latent reasoning capabilities in small models through self-training on filtered, self-generated reasoning paths under zero-shot conditions. Experiments using OpenAI's GPT-3.5 as the teacher model and GPT-2 models as the student models demonstrate that SERT enhances the reasoning abilities of small models, improving their performance in reasoning distillation. 
</p> </div> </dd> <dt> <a name='item81'>[81]</a> <a href ="/abs/2502.12745" title="Abstract" id="2502.12745"> arXiv:2502.12745 </a> [<a href="/pdf/2502.12745" title="Download PDF" id="pdf-2502.12745" aria-labelledby="pdf-2502.12745">pdf</a>, <a href="https://arxiv.org/html/2502.12745v1" title="View HTML" id="html-2502.12745" aria-labelledby="html-2502.12745" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12745" title="Other formats" id="oth-2502.12745" aria-labelledby="oth-2502.12745">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MediaMind: Revolutionizing Media Monitoring using Agentification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gunduz,+A">Ahmet Gunduz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuksel,+K+A">Kamer Ali Yuksel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sawaf,+H">Hassan Sawaf</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> In an era of rapid technological advancements, agentification of software tools has emerged as a critical innovation, enabling systems to function autonomously and adaptively. This paper introduces MediaMind as a case study to demonstrate the agentification process, highlighting how existing software can be transformed into intelligent agents capable of independent decision-making and dynamic interaction. Developed by aiXplain, MediaMind leverages agent-based architecture to autonomously monitor, analyze, and provide insights from multilingual media content in real time. 
The focus of this paper is on the technical methodologies and design principles behind agentifying MediaMind, showcasing how agentification enhances adaptability, efficiency, and responsiveness. Through detailed case studies and practical examples, we illustrate how the agentification of MediaMind empowers organizations to streamline workflows, optimize decision-making, and respond to evolving trends. This work underscores the broader potential of agentification to revolutionize software tools across various domains. </p> </div> </dd> <dt> <a name='item82'>[82]</a> <a href ="/abs/2502.12755" title="Abstract" id="2502.12755"> arXiv:2502.12755 </a> [<a href="/pdf/2502.12755" title="Download PDF" id="pdf-2502.12755" aria-labelledby="pdf-2502.12755">pdf</a>, <a href="https://arxiv.org/html/2502.12755v1" title="View HTML" id="html-2502.12755" aria-labelledby="html-2502.12755" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12755" title="Other formats" id="oth-2502.12755" aria-labelledby="oth-2502.12755">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient Machine Translation Corpus Generation: Integrating Human-in-the-Loop Post-Editing with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuksel,+K+A">Kamer Ali Yuksel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gunduz,+A">Ahmet Gunduz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anees,+A+B">Abdul Baseet Anees</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sawaf,+H">Hassan Sawaf</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> This paper introduces an advanced methodology for machine translation (MT) corpus 
generation, integrating semi-automated, human-in-the-loop post-editing with large language models (LLMs) to enhance efficiency and translation quality. Building upon previous work that utilized real-time training of a custom MT quality estimation metric, this system incorporates novel LLM features such as Enhanced Translation Synthesis and Assisted Annotation Analysis, which improve initial translation hypotheses and quality assessments, respectively. Additionally, the system employs LLM-Driven Pseudo Labeling and a Translation Recommendation System to reduce human annotator workload in specific contexts. These improvements not only retain the original benefits of cost reduction and enhanced post-edit quality but also open new avenues for leveraging cutting-edge LLM advancements. The project's source code is available for community use, promoting collaborative developments in the field. The demo video can be accessed here. </p> </div> </dd> <dt> <a name='item83'>[83]</a> <a href ="/abs/2502.12767" title="Abstract" id="2502.12767"> arXiv:2502.12767 </a> [<a href="/pdf/2502.12767" title="Download PDF" id="pdf-2502.12767" aria-labelledby="pdf-2502.12767">pdf</a>, <a href="https://arxiv.org/html/2502.12767v1" title="View HTML" id="html-2502.12767" aria-labelledby="html-2502.12767" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12767" title="Other formats" id="oth-2502.12767" aria-labelledby="oth-2502.12767">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> R2-KG: General-Purpose Dual-Agent Framework for Reliable Reasoning on Knowledge Graphs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jo,+S">Sumin Jo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+J">Junseong Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jiho Kim</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+E">Edward Choi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent studies have combined Large Language Models (LLMs) with Knowledge Graphs (KGs) to enhance reasoning, improving inference accuracy without additional training while mitigating hallucination. However, existing frameworks are often rigid, struggling to adapt to KG or task changes. They also rely heavily on powerful LLMs for reliable (i.e., trustworthy) reasoning. To address this, we introduce R2-KG, a plug-and-play, dual-agent framework that separates reasoning into two roles: an Operator (a low-capacity LLM) that gathers evidence and a Supervisor (a high-capacity LLM) that makes final judgments. This design is cost-efficient for LLM inference while still maintaining strong reasoning accuracy. Additionally, R2-KG employs an Abstention mechanism, generating answers only when sufficient evidence is collected from KG, which significantly enhances reliability. Experiments across multiple KG-based reasoning tasks show that R2-KG consistently outperforms baselines in both accuracy and reliability, regardless of the inherent capability of LLMs used as the Operator. Further experiments reveal that the single-agent version of R2-KG, equipped with a strict self-consistency strategy, achieves significantly higher-than-baseline reliability while reducing inference cost. However, it also leads to a higher abstention rate in complex KGs. Our findings establish R2-KG as a flexible and cost-effective solution for KG-based reasoning. It reduces reliance on high-capacity LLMs while ensuring trustworthy inference. 
</p> </div> </dd> <dt> <a name='item84'>[84]</a> <a href ="/abs/2502.12769" title="Abstract" id="2502.12769"> arXiv:2502.12769 </a> [<a href="/pdf/2502.12769" title="Download PDF" id="pdf-2502.12769" aria-labelledby="pdf-2502.12769">pdf</a>, <a href="https://arxiv.org/html/2502.12769v1" title="View HTML" id="html-2502.12769" aria-labelledby="html-2502.12769" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12769" title="Other formats" id="oth-2502.12769" aria-labelledby="oth-2502.12769">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Much Do LLMs Hallucinate across Languages? On Multilingual Estimation of LLM Hallucination in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Islam,+S+O+u">Saad Obaid ul Islam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lauscher,+A">Anne Lauscher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Glava%C5%A1,+G">Goran Glavaš</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> In the age of misinformation, hallucination -- the tendency of Large Language Models (LLMs) to generate non-factual or unfaithful responses -- represents the main risk for their global utility. Despite LLMs becoming increasingly multilingual, the vast majority of research on detecting and quantifying LLM hallucination are (a) English-centric and (b) focus on machine translation (MT) and summarization, tasks that are less common ``in the wild'' than open information seeking. In contrast, we aim to quantify the extent of LLM hallucination across languages in knowledge-intensive long-form question answering. 
To this end, we train a multilingual hallucination detection model and conduct a large-scale study across 30 languages and 6 open-source LLM families. We start from an English hallucination detection dataset and rely on MT to generate (noisy) training data in other languages. We also manually annotate gold data for five high-resource languages; we then demonstrate, for these languages, that the estimates of hallucination rates are similar between silver (LLM-generated) and gold test sets, validating the use of silver data for estimating hallucination rates for other languages. For the final rates estimation, we build a knowledge-intensive QA dataset for 30 languages with LLM-generated prompts and Wikipedia articles as references. We find that, while LLMs generate longer responses with more hallucinated tokens for higher-resource languages, there is no correlation between length-normalized hallucination rates of languages and their digital representation. Further, we find that smaller LLMs exhibit larger hallucination rates than larger models. 
</p> </div> </dd> <dt> <a name='item85'>[85]</a> <a href ="/abs/2502.12771" title="Abstract" id="2502.12771"> arXiv:2502.12771 </a> [<a href="/pdf/2502.12771" title="Download PDF" id="pdf-2502.12771" aria-labelledby="pdf-2502.12771">pdf</a>, <a href="https://arxiv.org/html/2502.12771v1" title="View HTML" id="html-2502.12771" aria-labelledby="html-2502.12771" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12771" title="Other formats" id="oth-2502.12771" aria-labelledby="oth-2502.12771">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mind the Gap: Aligning the Brain with Language Models Requires a Nonlinear and Multimodal Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+D+D">Danny Dongyeop Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+Y">Yunju Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cha,+J">Jiook Cha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jay-Yoon Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Self-supervised language and audio models effectively predict brain responses to speech. However, traditional prediction models rely on linear mappings from unimodal features, despite the complex integration of auditory signals with linguistic and semantic information across widespread brain networks during speech comprehension. Here, we introduce a nonlinear, multimodal prediction model that combines audio and linguistic features from pre-trained models (e.g., LLAMA, Whisper). 
Our approach achieves a 17.2% and 17.9% improvement in prediction performance (unnormalized and normalized correlation) over traditional unimodal linear models, as well as a 7.7% and 14.4% improvement, respectively, over prior state-of-the-art models. These improvements represent a major step towards future robust in-silico testing and improved decoding performance. They also reveal how auditory and semantic information are fused in motor, somatosensory, and higher-level semantic regions, aligning with existing neurolinguistic theories. Overall, our work highlights the often neglected potential of nonlinear and multimodal approaches to brain modeling, paving the way for future studies to embrace these strategies in naturalistic neurolinguistics research. </p> </div> </dd> <dt> <a name='item86'>[86]</a> <a href ="/abs/2502.12788" title="Abstract" id="2502.12788"> arXiv:2502.12788 </a> [<a href="/pdf/2502.12788" title="Download PDF" id="pdf-2502.12788" aria-labelledby="pdf-2502.12788">pdf</a>, <a href="https://arxiv.org/html/2502.12788v1" title="View HTML" id="html-2502.12788" aria-labelledby="html-2502.12788" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12788" title="Other formats" id="oth-2502.12788" aria-labelledby="oth-2502.12788">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Commonsense Reasoning in Arab Culture </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sadallah,+A">Abdelrahman Sadallah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tonga,+J+C">Junior Cedric Tonga</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Almubarak,+K">Khalid Almubarak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Almheiri,+S">Saeed Almheiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Atif,+F">Farah Atif</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Qwaider,+C">Chatrine Qwaider</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kadaoui,+K">Karima Kadaoui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shatnawi,+S">Sara Shatnawi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alesh,+Y">Yaser Alesh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koto,+F">Fajri Koto</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite progress in Arabic large language models, such as Jais and AceGPT, their evaluation on commonsense reasoning has largely relied on machine-translated datasets, which lack cultural depth and may introduce Anglocentric biases. Commonsense reasoning is shaped by geographical and cultural contexts, and existing English datasets fail to capture the diversity of the Arab world. To address this, we introduce \datasetname, a commonsense reasoning dataset in Modern Standard Arabic (MSA), covering cultures of 13 countries across the Gulf, Levant, North Africa, and the Nile Valley. The dataset was built from scratch by engaging native speakers to write and validate culturally relevant questions for their respective countries. \datasetname spans 12 daily life domains with 54 fine-grained subtopics, reflecting various aspects of social norms, traditions, and everyday experiences. Zero-shot evaluations show that open-weight language models with up to 32B parameters struggle to comprehend diverse Arab cultures, with performance varying across regions. These findings highlight the need for more culturally aware models and datasets tailored to the Arabic-speaking world. 
</p> </div> </dd> <dt> <a name='item87'>[87]</a> <a href ="/abs/2502.12799" title="Abstract" id="2502.12799"> arXiv:2502.12799 </a> [<a href="/pdf/2502.12799" title="Download PDF" id="pdf-2502.12799" aria-labelledby="pdf-2502.12799">pdf</a>, <a href="https://arxiv.org/html/2502.12799v1" title="View HTML" id="html-2502.12799" aria-labelledby="html-2502.12799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12799" title="Other formats" id="oth-2502.12799" aria-labelledby="oth-2502.12799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Text-Image Interleaved Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+Z">Ziqi Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yongqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yanzhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Long,+D">Dingkun Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+P">Pengjun Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Meishan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jun Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenjie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV); Information Retrieval (cs.IR) </div> <p class='mathjax'> Current multimodal information retrieval studies mainly focus on single-image inputs, 
which limits real-world applications involving multiple images and text-image interleaved content. In this work, we introduce the text-image interleaved retrieval (TIIR) task, where the query and document are interleaved text-image sequences, and the model is required to understand the semantics from the interleaved context for effective retrieval. We construct a TIIR benchmark based on naturally interleaved wikiHow tutorials, where a specific pipeline is designed to generate interleaved queries. To explore the task, we adapt several off-the-shelf retrievers and build a dense baseline by interleaved multimodal large language model (MLLM). We then propose a novel Matryoshka Multimodal Embedder (MME), which compresses the number of visual tokens at different granularity, to address the challenge of excessive visual tokens in MLLM-based TIIR models. Experiments demonstrate that simple adaption of existing models does not consistently yield effective results. Our MME achieves significant improvements over the baseline by substantially fewer visual tokens. We provide extensive analysis and will release the dataset and code to facilitate future research. 
</p> </div> </dd> <dt> <a name='item88'>[88]</a> <a href ="/abs/2502.12813" title="Abstract" id="2502.12813"> arXiv:2502.12813 </a> [<a href="/pdf/2502.12813" title="Download PDF" id="pdf-2502.12813" aria-labelledby="pdf-2502.12813">pdf</a>, <a href="https://arxiv.org/html/2502.12813v1" title="View HTML" id="html-2502.12813" aria-labelledby="html-2502.12813" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12813" title="Other formats" id="oth-2502.12813" aria-labelledby="oth-2502.12813">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Simulating User Diversity in Task-Oriented Dialogue Systems using Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmad,+A">Adnan Ahmad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hillmann,+S">Stefan Hillmann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=M%C3%B6ller,+S">Sebastian Möller</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this study, we explore the application of Large Language Models (LLMs) for generating synthetic users and simulating user conversations with a task-oriented dialogue system and present detailed results and their analysis. We propose a comprehensive novel approach to user simulation technique that uses LLMs to create diverse user profiles, set goals, engage in multi-turn dialogues, and evaluate the conversation success. We employ two proprietary LLMs, namely GPT-4o and GPT-o1 (Achiam et al., 2023), to generate a heterogeneous base of user profiles, characterized by varied demographics, multiple user goals, different conversational styles, initial knowledge levels, interests, and conversational objectives. 
We perform a detailed analysis of the user profiles generated by LLMs to assess the diversity, consistency, and potential biases inherent in these LLM-generated user simulations. We find that GPT-o1 generates more heterogeneous user distribution across most user attributes, while GPT-4o generates more skewed user attributes. The generated set of user profiles are then utilized to simulate dialogue sessions by interacting with a task-oriented dialogue system. </p> </div> </dd> <dt> <a name='item89'>[89]</a> <a href ="/abs/2502.12821" title="Abstract" id="2502.12821"> arXiv:2502.12821 </a> [<a href="/pdf/2502.12821" title="Download PDF" id="pdf-2502.12821" aria-labelledby="pdf-2502.12821">pdf</a>, <a href="https://arxiv.org/html/2502.12821v1" title="View HTML" id="html-2502.12821" aria-labelledby="html-2502.12821" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12821" title="Other formats" id="oth-2502.12821" aria-labelledby="oth-2502.12821">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pitfalls of Scale: Investigating the Inverse Task of Redefinition in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Stringli,+E">Elena Stringli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lymperaiou,+M">Maria Lymperaiou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Filandrianos,+G">Giorgos Filandrianos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stamou,+G">Giorgos Stamou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Inverse tasks can uncover potential reasoning gaps as Large Language Models (LLMs) scale up. 
In this work, we explore the redefinition task, in which we assign alternative values to well-known physical constants and units of measure, prompting LLMs to respond accordingly. Our findings show that not only does model performance degrade with scale, but its false confidence also rises. Moreover, while factors such as prompting strategies or response formatting are influential, they do not preclude LLMs from anchoring to memorized values. </p> </div> </dd> <dt> <a name='item90'>[90]</a> <a href ="/abs/2502.12825" title="Abstract" id="2502.12825"> arXiv:2502.12825 </a> [<a href="/pdf/2502.12825" title="Download PDF" id="pdf-2502.12825" aria-labelledby="pdf-2502.12825">pdf</a>, <a href="https://arxiv.org/html/2502.12825v1" title="View HTML" id="html-2502.12825" aria-labelledby="html-2502.12825" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12825" title="Other formats" id="oth-2502.12825" aria-labelledby="oth-2502.12825">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning and the Trusting Behavior of DeepSeek and GPT: An Experiment Revealing Hidden Fault Lines in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+R">Rubing Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sedoc,+J">João Sedoc</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sundararajan,+A">Arun Sundararajan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> When encountering increasingly frequent performance improvements or cost reductions from a new large language model (LLM), developers of applications leveraging LLMs must decide whether to take advantage of these improvements or stay with older tried-and-tested models. 
Low perceived switching frictions can lead to choices that do not consider more subtle behavior changes that the transition may induce. Our experiments use a popular game-theoretic behavioral economics model of trust to show stark differences in the trusting behavior of OpenAI's and DeepSeek's models. We highlight a collapse in the economic trust behavior of the o1-mini and o3-mini models as they reconcile profit-maximizing and risk-seeking with future returns from trust, and contrast it with DeepSeek's more sophisticated and profitable trusting behavior that stems from an ability to incorporate deeper concepts like forward planning and theory-of-mind. As LLMs form the basis for high-stakes commercial systems, our results highlight the perils of relying on LLM performance benchmarks that are too narrowly defined and suggest that careful analysis of their hidden fault lines should be part of any organization's AI strategy. </p> </div> </dd> <dt> <a name='item91'>[91]</a> <a href ="/abs/2502.12829" title="Abstract" id="2502.12829"> arXiv:2502.12829 </a> [<a href="/pdf/2502.12829" title="Download PDF" id="pdf-2502.12829" aria-labelledby="pdf-2502.12829">pdf</a>, <a href="https://arxiv.org/html/2502.12829v1" title="View HTML" id="html-2502.12829" aria-labelledby="html-2502.12829" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12829" title="Other formats" id="oth-2502.12829" aria-labelledby="oth-2502.12829">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> KazMMLU: Evaluating Language Models on Kazakh, Russian, and Regional Knowledge of Kazakhstan </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Togmanov,+M">Mukhammed Togmanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukhituly,+N">Nurdaulet Mukhituly</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Turmakhan,+D">Diana Turmakhan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mansurov,+J">Jonibek Mansurov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goloburda,+M">Maiya Goloburda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sakip,+A">Akhmed Sakip</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhuohan Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuxia Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Syzdykov,+B">Bekassyl Syzdykov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laiyk,+N">Nurkhan Laiyk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kochmar,+E">Ekaterina Kochmar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakov,+P">Preslav Nakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koto,+F">Fajri Koto</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite having a population of twenty million, Kazakhstan's culture and language remain underrepresented in the field of natural language processing. Although large language models (LLMs) continue to advance worldwide, progress in Kazakh language has been limited, as seen in the scarcity of dedicated models and benchmark evaluations. To address this gap, we introduce KazMMLU, the first MMLU-style dataset specifically designed for Kazakh language. KazMMLU comprises 23,000 questions that cover various educational levels, including STEM, humanities, and social sciences, sourced from authentic educational materials and manually validated by native speakers and educators. The dataset includes 10,969 Kazakh questions and 12,031 Russian questions, reflecting Kazakhstan's bilingual education system and rich local context. 
Our evaluation of several state-of-the-art multilingual models (Llama-3.1, Qwen-2.5, GPT-4, and DeepSeek V3) demonstrates substantial room for improvement, as even the best-performing models struggle to achieve competitive performance in Kazakh and Russian. These findings underscore significant performance gaps compared to high-resource languages. We hope that our dataset will enable further research and development of Kazakh-centric LLMs. Data and code will be made available upon acceptance. </p> </div> </dd> <dt> <a name='item92'>[92]</a> <a href ="/abs/2502.12835" title="Abstract" id="2502.12835"> arXiv:2502.12835 </a> [<a href="/pdf/2502.12835" title="Download PDF" id="pdf-2502.12835" aria-labelledby="pdf-2502.12835">pdf</a>, <a href="https://arxiv.org/html/2502.12835v1" title="View HTML" id="html-2502.12835" aria-labelledby="html-2502.12835" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12835" title="Other formats" id="oth-2502.12835" aria-labelledby="oth-2502.12835">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Subword models struggle with word learning, but surprisal hides it </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bunzeck,+B">Bastian Bunzeck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zarrie%C3%9F,+S">Sina Zarrieß</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We study word learning in subword and character language models with the psycholinguistic lexical decision task. While subword LMs struggle to discern words and non-words with high accuracy, character LMs solve this task easily and consistently. 
Furthermore, when comparing word learning and syntactic learning, both processes are separable in character LM where word learning predates syntactic learning, whereas these processes are simultaneous in subword LM. This raises questions about the adequacy of subword LMs for modeling language acquisition and positions character LMs as a viable alternative. </p> </div> </dd> <dt> <a name='item93'>[93]</a> <a href ="/abs/2502.12836" title="Abstract" id="2502.12836"> arXiv:2502.12836 </a> [<a href="/pdf/2502.12836" title="Download PDF" id="pdf-2502.12836" aria-labelledby="pdf-2502.12836">pdf</a>, <a href="https://arxiv.org/html/2502.12836v1" title="View HTML" id="html-2502.12836" aria-labelledby="html-2502.12836" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12836" title="Other formats" id="oth-2502.12836" aria-labelledby="oth-2502.12836">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An LLM-Powered Agent for Physiological Data Analysis: A Case Study on PPG-based Heart Rate Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feli,+M">Mohammad Feli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Azimi,+I">Iman Azimi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liljeberg,+P">Pasi Liljeberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=M.Rahmani,+A">Amir M.Rahmani</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) are revolutionizing healthcare by improving diagnosis, patient care, and decision support through interactive communication. More recently, they have been applied to analyzing physiological time-series like wearable data for health insight extraction. 
Existing methods embed raw numerical sequences directly into prompts, which exceeds token limits and increases computational costs. Additionally, some studies integrated features extracted from time-series in textual prompts or applied multimodal approaches. However, these methods often produce generic and unreliable outputs due to LLMs' limited analytical rigor and inefficiency in interpreting continuous waveforms. In this paper, we develop an LLM-powered agent for physiological time-series analysis aimed to bridge the gap in integrating LLMs with well-established analytical tools. Built on the OpenCHA, an open-source LLM-powered framework, our agent features an orchestrator that integrates user interaction, data sources, and analytical tools to generate accurate health insights. To evaluate its effectiveness, we implement a case study on heart rate (HR) estimation from Photoplethysmogram (PPG) signals using a dataset of PPG and Electrocardiogram (ECG) recordings in a remote health monitoring study. The agent's performance is benchmarked against OpenAI GPT-4o-mini and GPT-4o, with ECG serving as the gold standard for HR estimation. Results demonstrate that our agent significantly outperforms benchmark models by achieving lower error rates and more reliable HR estimations. The agent implementation is publicly available on GitHub. 
</p> </div> </dd> <dt> <a name='item94'>[94]</a> <a href ="/abs/2502.12851" title="Abstract" id="2502.12851"> arXiv:2502.12851 </a> [<a href="/pdf/2502.12851" title="Download PDF" id="pdf-2502.12851" aria-labelledby="pdf-2502.12851">pdf</a>, <a href="https://arxiv.org/html/2502.12851v1" title="View HTML" id="html-2502.12851" aria-labelledby="html-2502.12851" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12851" title="Other formats" id="oth-2502.12851" aria-labelledby="oth-2502.12851">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MeMo: Towards Language Models with Associative Memory Mechanisms </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zanzotto,+F+M">Fabio Massimo Zanzotto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruzzetti,+E+S">Elena Sofia Ruzzetti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xompero,+G+A">Giancarlo A. Xompero</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ranaldi,+L">Leonardo Ranaldi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Venditti,+D">Davide Venditti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ranaldi,+F">Federico Ranaldi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Giannone,+C">Cristina Giannone</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Favalli,+A">Andrea Favalli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Romagnoli,+R">Raniero Romagnoli</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Memorization is a fundamental ability of Transformer-based Large Language Models, achieved through learning. 
In this paper, we propose a paradigm shift by designing an architecture to memorize text directly, bearing in mind the principle that memorization precedes learning. We introduce MeMo, a novel architecture for language modeling that explicitly memorizes sequences of tokens in layered associative memories. By design, MeMo offers transparency and the possibility of model editing, including forgetting texts. We experimented with the MeMo architecture, showing the memorization power of the one-layer and the multi-layer configurations. </p> </div> </dd> <dt> <a name='item95'>[95]</a> <a href ="/abs/2502.12852" title="Abstract" id="2502.12852"> arXiv:2502.12852 </a> [<a href="/pdf/2502.12852" title="Download PDF" id="pdf-2502.12852" aria-labelledby="pdf-2502.12852">pdf</a>, <a href="https://arxiv.org/html/2502.12852v1" title="View HTML" id="html-2502.12852" aria-labelledby="html-2502.12852" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12852" title="Other formats" id="oth-2502.12852" aria-labelledby="oth-2502.12852">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MVL-SIB: A Massively Multilingual Vision-Language Benchmark for Cross-Modal Topical Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schmidt,+F+D">Fabian David Schmidt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schneider,+F">Florian Schneider</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Biemann,+C">Chris Biemann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Glava%C5%A1,+G">Goran Glavaš</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Existing multilingual vision-language (VL) benchmarks often only cover a handful of languages. 
Consequently, evaluations of large vision-language models (LVLMs) predominantly target high-resource languages, underscoring the need for evaluation data for low-resource languages. To address this limitation, we introduce MVL-SIB, a massively multilingual vision-language benchmark that evaluates both cross-modal and text-only topical matching across 205 languages -- over 100 more than the most multilingual existing VL benchmarks encompass. We then benchmark a range of open-weight LVLMs together with GPT-4o(-mini) on MVL-SIB. Our results reveal that LVLMs struggle in cross-modal topic matching in lower-resource languages, performing no better than chance on languages like N'Koo. Our analysis further reveals that VL support in LVLMs declines disproportionately relative to textual support for lower-resource languages, as evidenced by comparison of cross-modal and text-only topical matching performance. We further observe that open-weight LVLMs do not benefit from representing a topic with more than one image, suggesting that these models are not yet fully effective at handling multi-image tasks. By correlating performance on MVL-SIB with other multilingual VL benchmarks, we highlight that MVL-SIB serves as a comprehensive probe of multilingual VL understanding in LVLMs. 
</p> </div> </dd> <dt> <a name='item96'>[96]</a> <a href ="/abs/2502.12853" title="Abstract" id="2502.12853"> arXiv:2502.12853 </a> [<a href="/pdf/2502.12853" title="Download PDF" id="pdf-2502.12853" aria-labelledby="pdf-2502.12853">pdf</a>, <a href="https://arxiv.org/html/2502.12853v1" title="View HTML" id="html-2502.12853" aria-labelledby="html-2502.12853" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12853" title="Other formats" id="oth-2502.12853" aria-labelledby="oth-2502.12853">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> S$^2$R: Teaching LLMs to Self-verify and Self-correct via Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+R">Ruotian Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Peisong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Cheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xingyan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jiaqi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Bang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+N">Nan Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jia Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Recent studies have demonstrated the effectiveness of LLM test-time scaling. However, existing approaches to incentivize LLMs' deep thinking abilities generally require large-scale data or significant training efforts. 
Meanwhile, it remains unclear how to improve the thinking abilities of less powerful base models. In this work, we introduce S$^2$R, an efficient framework that enhances LLM reasoning by teaching models to self-verify and self-correct during inference. Specifically, we first initialize LLMs with iterative self-verification and self-correction behaviors through supervised fine-tuning on carefully curated data. The self-verification and self-correction skills are then further strengthened by both outcome-level and process-level reinforcement learning, with minimized resource requirements, enabling the model to adaptively refine its reasoning process during inference. Our results demonstrate that, with only 3.1k self-verifying and self-correcting behavior initialization samples, Qwen2.5-math-7B achieves an accuracy improvement from 51.0\% to 81.6\%, outperforming models trained on an equivalent amount of long-CoT distilled data. Extensive experiments and analysis based on three base models across both in-domain and out-of-domain benchmarks validate the effectiveness of S$^2$R. Our code and data are available at <a href="https://github.com/NineAbyss/S2R" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item97'>[97]</a> <a href ="/abs/2502.12855" title="Abstract" id="2502.12855"> arXiv:2502.12855 </a> [<a href="/pdf/2502.12855" title="Download PDF" id="pdf-2502.12855" aria-labelledby="pdf-2502.12855">pdf</a>, <a href="/format/2502.12855" title="Other formats" id="oth-2502.12855" aria-labelledby="oth-2502.12855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Integrating Arithmetic Learning Improves Mathematical Reasoning in Smaller Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gangwar,+N">Neeraj Gangwar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhat,+S+P">Suma P Bhat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kani,+N">Nickvash Kani</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> While large models pre-trained on high-quality data exhibit excellent performance across various reasoning tasks, including mathematical reasoning (e.g. GSM8k, MultiArith), specializing smaller models to excel at mathematical reasoning remains a challenging problem. Common approaches to address this challenge include knowledge distillation, where smaller student models learn from large pre-trained teacher models, and data augmentation, such as rephrasing questions. Despite these efforts, smaller models struggle with arithmetic computations, leading to errors in mathematical reasoning. In this work, we focus on leveraging a programmatically generated arithmetic dataset to enhance the reasoning capabilities of smaller models. 
We investigate two key approaches to incorporate this dataset -- (1) intermediate fine-tuning, where a model is fine-tuned on the arithmetic dataset before being trained on a reasoning dataset, and (2) integrating the arithmetic dataset into the instruction-tuning mixture, allowing the model to learn arithmetic skills alongside general instruction-following abilities. Our experiments on multiple reasoning benchmarks demonstrate that incorporating an arithmetic dataset, whether through targeted fine-tuning or within the instruction-tuning mixture, enhances the models' arithmetic capabilities, which in turn improves their mathematical reasoning performance. </p> </div> </dd> <dt> <a name='item98'>[98]</a> <a href ="/abs/2502.12858" title="Abstract" id="2502.12858"> arXiv:2502.12858 </a> [<a href="/pdf/2502.12858" title="Download PDF" id="pdf-2502.12858" aria-labelledby="pdf-2502.12858">pdf</a>, <a href="https://arxiv.org/html/2502.12858v1" title="View HTML" id="html-2502.12858" aria-labelledby="html-2502.12858" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12858" title="Other formats" id="oth-2502.12858" aria-labelledby="oth-2502.12858">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rejected Dialects: Biases Against African American Language in Reward Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mire,+J">Joel Mire</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aysola,+Z+T">Zubin Trivadi Aysola</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chechelnitsky,+D">Daniel Chechelnitsky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deas,+N">Nicholas Deas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zerva,+C">Chrysoula Zerva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sap,+M">Maarten Sap</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL Findings 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY) </div> <p class='mathjax'> Preference alignment via reward models helps build safe, helpful, and reliable large language models (LLMs). However, subjectivity in preference judgments and the lack of representative sampling in preference data collection can introduce new biases, hindering reward models' fairness and equity. In this work, we introduce a framework for evaluating dialect biases in reward models and conduct a case study on biases against African American Language (AAL) through several experiments comparing reward model preferences and behavior on paired White Mainstream English (WME) and both machine-translated and human-written AAL corpora. We show that reward models are less aligned with human preferences when processing AAL texts vs. WME ones (-4\% accuracy on average), frequently disprefer AAL-aligned texts vs. WME-aligned ones, and steer conversations toward WME, even when prompted with AAL texts. Our findings provide a targeted analysis of anti-AAL biases at a relatively understudied stage in LLM development, highlighting representational harms and ethical questions about the desired behavior of LLMs concerning AAL. 
</p> </div> </dd> <dt> <a name='item99'>[99]</a> <a href ="/abs/2502.12859" title="Abstract" id="2502.12859"> arXiv:2502.12859 </a> [<a href="/pdf/2502.12859" title="Download PDF" id="pdf-2502.12859" aria-labelledby="pdf-2502.12859">pdf</a>, <a href="https://arxiv.org/html/2502.12859v1" title="View HTML" id="html-2502.12859" aria-labelledby="html-2502.12859" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12859" title="Other formats" id="oth-2502.12859" aria-labelledby="oth-2502.12859">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PAFT: Prompt-Agnostic Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+C">Chenxing Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shu,+Y">Yao Shu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+M">Mingwen Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y+T">Ying Tiffany He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F+R">Fei Richard Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> While Large Language Models (LLMs) adapt well to downstream tasks after fine-tuning, this adaptability often compromises prompt robustness, as even minor prompt variations can significantly degrade performance. To address this, we propose Prompt-Agnostic Fine-Tuning(PAFT), a simple yet effective approach that dynamically adjusts prompts during fine-tuning. This encourages the model to learn underlying task principles rather than overfitting to specific prompt formulations. 
PAFT operates in two stages: First, a diverse set of meaningful, synthetic candidate prompts is constructed. Second, during fine-tuning, prompts are randomly sampled from this set to create dynamic training inputs. Extensive experiments across diverse datasets and LLMs demonstrate that models trained with PAFT exhibit strong robustness and generalization across a wide range of prompts, including unseen ones. This enhanced robustness improves both model performance and inference speed while maintaining training efficiency. Ablation studies further confirm the effectiveness of PAFT. </p> </div> </dd> <dt> <a name='item100'>[100]</a> <a href ="/abs/2502.12884" title="Abstract" id="2502.12884"> arXiv:2502.12884 </a> [<a href="/pdf/2502.12884" title="Download PDF" id="pdf-2502.12884" aria-labelledby="pdf-2502.12884">pdf</a>, <a href="https://arxiv.org/html/2502.12884v1" title="View HTML" id="html-2502.12884" aria-labelledby="html-2502.12884" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12884" title="Other formats" id="oth-2502.12884" aria-labelledby="oth-2502.12884">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How desirable is alignment between LLMs and linguistically diverse human users? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Knoeferle,+P">Pia Knoeferle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=M%C3%B6ller,+S">Sebastian Möller</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolossa,+D">Dorothea Kolossa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Solopova,+V">Veronika Solopova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rehm,+G">Georg Rehm</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We discuss how desirable it is that Large Language Models (LLMs) be able to adapt or align their language behavior with users who may be diverse in their language use. User diversity may come about among others due to i) age differences; ii) gender characteristics, and/or iii) multilingual experience, and associated differences in language processing and use. We consider potential consequences for usability, communication, and LLM development. </p> </div> </dd> <dt> <a name='item101'>[101]</a> <a href ="/abs/2502.12886" title="Abstract" id="2502.12886"> arXiv:2502.12886 </a> [<a href="/pdf/2502.12886" title="Download PDF" id="pdf-2502.12886" aria-labelledby="pdf-2502.12886">pdf</a>, <a href="https://arxiv.org/html/2502.12886v1" title="View HTML" id="html-2502.12886" aria-labelledby="html-2502.12886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12886" title="Other formats" id="oth-2502.12886" aria-labelledby="oth-2502.12886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Multilingual Language Models an Off-ramp for Under-resourced Languages? Will we arrive at Digital Language Equality in Europe in 2030? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rehm,+G">Georg Rehm</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gr%C3%BCtzner-Zahn,+A">Annika Grützner-Zahn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barth,+F">Fabio Barth</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) demonstrate unprecedented capabilities and define the state of the art for almost all natural language processing (NLP) tasks and also for essentially all Language Technology (LT) applications. LLMs can only be trained for languages for which a sufficient amount of pre-training data is available, effectively excluding many languages that are typically characterised as under-resourced. However, there is both circumstantial and empirical evidence that multilingual LLMs, which have been trained using data sets that cover multiple languages (including under-resourced ones), do exhibit strong capabilities for some of these under-resourced languages. Eventually, this approach may have the potential to be a technological off-ramp for those under-resourced languages for which "native" LLMs, and LLM-based technologies, cannot be developed due to a lack of training data. This paper, which concentrates on European languages, examines this idea, analyses the current situation in terms of technology support and summarises related work. The article concludes by focusing on the key open questions that need to be answered for the approach to be put into practice in a systematic way. 
</p> </div> </dd> <dt> <a name='item102'>[102]</a> <a href ="/abs/2502.12893" title="Abstract" id="2502.12893"> arXiv:2502.12893 </a> [<a href="/pdf/2502.12893" title="Download PDF" id="pdf-2502.12893" aria-labelledby="pdf-2502.12893">pdf</a>, <a href="https://arxiv.org/html/2502.12893v1" title="View HTML" id="html-2502.12893" aria-labelledby="html-2502.12893" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12893" title="Other formats" id="oth-2502.12893" aria-labelledby="oth-2502.12893">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> H-CoT: Hijacking the Chain-of-Thought Safety Reasoning Mechanism to Jailbreak Large Reasoning Models, Including OpenAI o1/o3, DeepSeek-R1, and Gemini 2.0 Flash Thinking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuo,+M">Martin Kuo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jianyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+A">Aolin Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Q">Qinsi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=DiValentin,+L">Louis DiValentin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+Y">Yujia Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+W">Wei Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Juan,+D">Da-Cheng Juan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yiran Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Reasoning Models (LRMs) have recently extended their powerful reasoning capabilities to safety checks-using chain-of-thought 
reasoning to decide whether a request should be answered. While this new approach offers a promising route for balancing model utility and safety, its robustness remains underexplored. To address this gap, we introduce Malicious-Educator, a benchmark that disguises extremely dangerous or malicious requests beneath seemingly legitimate educational prompts. Our experiments reveal severe security flaws in popular commercial-grade LRMs, including OpenAI o1/o3, DeepSeek-R1, and Gemini 2.0 Flash Thinking. For instance, although OpenAI's o1 model initially maintains a high refusal rate of about 98%, subsequent model updates significantly compromise its safety; and attackers can easily extract criminal strategies from DeepSeek-R1 and Gemini 2.0 Flash Thinking without any additional tricks. To further highlight these vulnerabilities, we propose Hijacking Chain-of-Thought (H-CoT), a universal and transferable attack method that leverages the model's own displayed intermediate reasoning to jailbreak its safety reasoning mechanism. Under H-CoT, refusal rates sharply decline—dropping from 98% to below 2%—and, in some instances, even transform initially cautious tones into ones that are willing to provide harmful content. We hope these findings underscore the urgent need for more robust safety mechanisms to preserve the benefits of advanced reasoning capabilities without compromising ethical standards. 
</p> </div> </dd> <dt> <a name='item103'>[103]</a> <a href ="/abs/2502.12895" title="Abstract" id="2502.12895"> arXiv:2502.12895 </a> [<a href="/pdf/2502.12895" title="Download PDF" id="pdf-2502.12895" aria-labelledby="pdf-2502.12895">pdf</a>, <a href="https://arxiv.org/html/2502.12895v1" title="View HTML" id="html-2502.12895" aria-labelledby="html-2502.12895" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12895" title="Other formats" id="oth-2502.12895" aria-labelledby="oth-2502.12895">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multilingual European Language Models: Benchmarking Approaches and Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Barth,+F">Fabio Barth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rehm,+G">Georg Rehm</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The breakthrough of generative large language models (LLMs) that can solve different tasks through chat interaction has led to a significant increase in the use of general benchmarks to assess the quality or performance of these models beyond individual applications. There is also a need for better methods to evaluate and also to compare models due to the ever increasing number of new models published. However, most of the established benchmarks revolve around the English language. This paper analyses the benefits and limitations of current evaluation datasets, focusing on multilingual European benchmarks. We analyse seven multilingual benchmarks and identify four major challenges. Furthermore, we discuss potential solutions to enhance translation quality and mitigate cultural biases, including human-in-the-loop verification and iterative translation ranking. 
Our analysis highlights the need for culturally aware and rigorously validated benchmarks to assess the reasoning and question-answering capabilities of multilingual LLMs accurately. </p> </div> </dd> <dt> <a name='item104'>[104]</a> <a href ="/abs/2502.12896" title="Abstract" id="2502.12896"> arXiv:2502.12896 </a> [<a href="/pdf/2502.12896" title="Download PDF" id="pdf-2502.12896" aria-labelledby="pdf-2502.12896">pdf</a>, <a href="https://arxiv.org/html/2502.12896v1" title="View HTML" id="html-2502.12896" aria-labelledby="html-2502.12896" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12896" title="Other formats" id="oth-2502.12896" aria-labelledby="oth-2502.12896">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> None of the Others: a General Technique to Distinguish Reasoning from Memorization in Multiple-Choice LLM Evaluation Benchmarks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Salido,+E+S">Eva Sánchez Salido</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gonzalo,+J">Julio Gonzalo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Marco,+G">Guillermo Marco</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In LLM evaluations, reasoning is often distinguished from recall/memorization by performing numerical variations to math-oriented questions. Here we introduce a general variation method for multiple-choice questions that completely dissociates the correct answer from previously seen tokens or concepts, requiring LLMs to understand and reason (rather than memorizing) in order to answer correctly. 
Using this method, we evaluate state-of-the-art proprietary and open-source LLMs on two datasets available in English and Spanish: the public MMLU benchmark and the private UNED-Access 2024 dataset. Results show that all models experience remarkable accuracy drops under our proposed variation, with an average loss of 57% on MMLU and 50% on UNED-Access 2024, ranging from 10% to 93% across models. Notably, the most accurate model in our experimentation (OpenAI-o3-mini) is not the most robust (DeepSeek-R1-70B), suggesting that the best models in standard evaluations may not be the ones with better reasoning capabilities. Also, we see larger accuracy drops in public (vs private) datasets and questions posed in their original language (vs a manual translation), which are signs of contamination and also point to a relevant role of recall/memorization in current LLMs' answers. </p> </div> </dd> <dt> <a name='item105'>[105]</a> <a href ="/abs/2502.12900" title="Abstract" id="2502.12900"> arXiv:2502.12900 </a> [<a href="/pdf/2502.12900" title="Download PDF" id="pdf-2502.12900" aria-labelledby="pdf-2502.12900">pdf</a>, <a href="/format/2502.12900" title="Other formats" id="oth-2502.12900" aria-labelledby="oth-2502.12900">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Soundwave: Less is More for Speech-Text Alignment in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bu,+F">Fan Bu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruiyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> <p class='mathjax'> Existing end-to-end speech large language models (LLMs) usually rely on large-scale annotated data for training, while data-efficient training has not been discussed in depth. We focus on two fundamental problems between speech and text: the representation space gap and sequence length inconsistency. We propose Soundwave, which utilizes an efficient training strategy and a novel architecture to address these issues. Results show that Soundwave outperforms the advanced Qwen2-Audio in speech translation and AIR-Bench speech tasks, using only one-fiftieth of the training data. Further analysis shows that Soundwave still retains its intelligence during conversation. The project is available at <a href="https://github.com/FreedomIntelligence/Soundwave" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item106'>[106]</a> <a href ="/abs/2502.12904" title="Abstract" id="2502.12904"> arXiv:2502.12904 </a> [<a href="/pdf/2502.12904" title="Download PDF" id="pdf-2502.12904" aria-labelledby="pdf-2502.12904">pdf</a>, <a href="https://arxiv.org/html/2502.12904v1" title="View HTML" id="html-2502.12904" aria-labelledby="html-2502.12904" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12904" title="Other formats" id="oth-2502.12904" aria-labelledby="oth-2502.12904">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fraud-R1 : A Multi-Round Benchmark for Assessing the Robustness of LLM Against Augmented Fraud and Phishing Inducements </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Shenzhe Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zeyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Keyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+J">Junchi Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Junchao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+L">Lijie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mengdi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+D+F">Derek F. Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Di Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We introduce Fraud-R1, a benchmark designed to evaluate LLMs' ability to defend against internet fraud and phishing in dynamic, real-world scenarios. 
Fraud-R1 comprises 8,564 fraud cases sourced from phishing scams, fake job postings, social media, and news, categorized into 5 major fraud types. Unlike previous benchmarks, Fraud-R1 introduces a multi-round evaluation pipeline to assess LLMs' resistance to fraud at different stages, including credibility building, urgency creation, and emotional manipulation. Furthermore, we evaluate 15 LLMs under two settings: 1. Helpful-Assistant, where the LLM provides general decision-making assistance, and 2. Role-play, where the model assumes a specific persona, widely used in real-world agent-based interactions. Our evaluation reveals the significant challenges in defending against fraud and phishing inducement, especially in role-play settings and fake job postings. Additionally, we observe a substantial performance gap between Chinese and English, underscoring the need for improved multilingual fraud detection capabilities. </p> </div> </dd> <dt> <a name='item107'>[107]</a> <a href ="/abs/2502.12911" title="Abstract" id="2502.12911"> arXiv:2502.12911 </a> [<a href="/pdf/2502.12911" title="Download PDF" id="pdf-2502.12911" aria-labelledby="pdf-2502.12911">pdf</a>, <a href="https://arxiv.org/html/2502.12911v1" title="View HTML" id="html-2502.12911" aria-labelledby="html-2502.12911" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12911" title="Other formats" id="oth-2502.12911" aria-labelledby="oth-2502.12911">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Knapsack Optimization-based Schema Linking for LLM-based Text-to-SQL Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Z">Zheng Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+Z">Zijin Hong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qinggang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Feiran Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xiao Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Databases (cs.DB) </div> <p class='mathjax'> Generating SQLs from user queries is a long-standing challenge, where the accuracy of initial schema linking significantly impacts subsequent SQL generation performance. However, current schema linking models still struggle with missing relevant schema elements or an excess of redundant ones. A crucial reason for this is that commonly used metrics, recall and precision, fail to capture relevant element missing and thus cannot reflect actual schema linking performance. Motivated by this, we propose an enhanced schema linking metric by introducing a restricted missing indicator. Accordingly, we introduce Knapsack optimization-based Schema Linking Agent (KaSLA), a plug-in schema linking agent designed to prevent the missing of relevant schema elements while minimizing the inclusion of redundant ones. KaSLA employs a hierarchical linking strategy that first identifies the optimal table linking and subsequently links columns within the selected table to reduce linking candidate space. In each linking process, it utilizes a knapsack optimization approach to link potentially relevant elements while accounting for a limited tolerance of potential redundant ones. With this optimization, KaSLA-1.6B achieves superior schema linking results compared to large-scale LLMs, including deepseek-v3 with state-of-the-art (SOTA) schema linking method. 
Extensive experiments on Spider and BIRD benchmarks verify that KaSLA can significantly improve the SQL generation performance of SOTA text-to-SQL models by substituting their schema linking processes. </p> </div> </dd> <dt> <a name='item108'>[108]</a> <a href ="/abs/2502.12921" title="Abstract" id="2502.12921"> arXiv:2502.12921 </a> [<a href="/pdf/2502.12921" title="Download PDF" id="pdf-2502.12921" aria-labelledby="pdf-2502.12921">pdf</a>, <a href="https://arxiv.org/html/2502.12921v1" title="View HTML" id="html-2502.12921" aria-labelledby="html-2502.12921" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12921" title="Other formats" id="oth-2502.12921" aria-labelledby="oth-2502.12921">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Q-STRUM Debate: Query-Driven Contrastive Summarization for Recommendation Comparison </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saad,+G">George-Kirollos Saad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sanner,+S">Scott Sanner</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Query-driven recommendation with unknown items poses a challenge for users to understand why certain items are appropriate for their needs. Query-driven Contrastive Summarization (QCS) is a methodology designed to address this issue by leveraging language-based item descriptions to clarify contrasts between them. However, existing state-of-the-art contrastive summarization methods such as STRUM-LLM fall short of this goal. To overcome these limitations, we introduce Q-STRUM Debate, a novel extension of STRUM-LLM that employs debate-style prompting to generate focused and contrastive summarizations of item aspects relevant to a query. 
Leveraging modern large language models (LLMs) as powerful tools for generating debates, Q-STRUM Debate provides enhanced contrastive summaries. Experiments across three datasets demonstrate that Q-STRUM Debate yields significant performance improvements over existing methods on key contrastive summarization criteria, thus introducing a novel and performant debate prompting methodology for QCS. </p> </div> </dd> <dt> <a name='item109'>[109]</a> <a href ="/abs/2502.12923" title="Abstract" id="2502.12923"> arXiv:2502.12923 </a> [<a href="/pdf/2502.12923" title="Download PDF" id="pdf-2502.12923" aria-labelledby="pdf-2502.12923">pdf</a>, <a href="https://arxiv.org/html/2502.12923v1" title="View HTML" id="html-2502.12923" aria-labelledby="html-2502.12923" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12923" title="Other formats" id="oth-2502.12923" aria-labelledby="oth-2502.12923">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On-Device LLMs for Home Assistant: Dual Role in Intent Detection and Response Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Birkmose,+R">Rune Birkmose</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Reece,+N+M">Nathan Mørkeberg Reece</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Norvin,+E+H">Esben Hofstedt Norvin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mike Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper investigates whether Large Language Models (LLMs), fine-tuned on synthetic but domain-representative data, can perform the twofold task of (i) slot and intent detection and (ii) natural 
language response generation for a smart home assistant, while running solely on resource-limited, CPU-only edge hardware. We fine-tune LLMs to produce both JSON action calls and text responses. Our experiments show that 16-bit and 8-bit quantized variants preserve high accuracy on slot and intent detection and maintain strong semantic coherence in generated text, while the 4-bit model, while retaining generative fluency, suffers a noticeable drop in device-service classification accuracy. Further evaluations on noisy human (non-synthetic) prompts and out-of-domain intents confirm the models' generalization ability, obtaining around 80--86\% accuracy. While the average inference time is 5--6 seconds per query -- acceptable for one-shot commands but suboptimal for multi-turn dialogue -- our results affirm that an on-device LLM can effectively unify command interpretation and flexible response generation for home automation without relying on specialized hardware. </p> </div> </dd> <dt> <a name='item110'>[110]</a> <a href ="/abs/2502.12924" title="Abstract" id="2502.12924"> arXiv:2502.12924 </a> [<a href="/pdf/2502.12924" title="Download PDF" id="pdf-2502.12924" aria-labelledby="pdf-2502.12924">pdf</a>, <a href="https://arxiv.org/html/2502.12924v1" title="View HTML" id="html-2502.12924" aria-labelledby="html-2502.12924" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12924" title="Other formats" id="oth-2502.12924" aria-labelledby="oth-2502.12924">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Conditioning LLMs to Generate Code-Switched Text: A Methodology Grounded in Naturally Occurring Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Heredia,+M">Maite Heredia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Labaka,+G">Gorka Labaka</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Barnes,+J">Jeremy Barnes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soroa,+A">Aitor Soroa</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Code-switching (CS) is still a critical challenge in Natural Language Processing (NLP). Current Large Language Models (LLMs) struggle to interpret and generate code-switched text, primarily due to the scarcity of large-scale CS datasets for training. This paper presents a novel methodology to generate CS data using LLMs, and test it on the English-Spanish language pair. We propose back-translating natural CS sentences into monolingual English, and using the resulting parallel corpus to fine-tune LLMs to turn monolingual sentences into CS. Unlike previous approaches to CS generation, our methodology uses natural CS data as a starting point, allowing models to learn its natural distribution beyond grammatical patterns. We thoroughly analyse the models' performance through a study on human preferences, a qualitative error analysis and an evaluation with popular automatic metrics. Results show that our methodology generates fluent code-switched text, expanding research opportunities in CS communication, and that traditional metrics do not correlate with human judgement when assessing the quality of the generated CS data. We release our code and generated dataset under a CC-BY-NC-SA license. 
</p> </div> </dd> <dt> <a name='item111'>[111]</a> <a href ="/abs/2502.12927" title="Abstract" id="2502.12927"> arXiv:2502.12927 </a> [<a href="/pdf/2502.12927" title="Download PDF" id="pdf-2502.12927" aria-labelledby="pdf-2502.12927">pdf</a>, <a href="https://arxiv.org/html/2502.12927v1" title="View HTML" id="html-2502.12927" aria-labelledby="html-2502.12927" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12927" title="Other formats" id="oth-2502.12927" aria-labelledby="oth-2502.12927">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SEFL: Harnessing Large Language Model Agents to Improve Educational Feedback Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mike Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dilling,+A+P">Amalie Pernille Dilling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gondelman,+L">Léon Gondelman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyngdorf,+N+E+R">Niels Erik Ruan Lyngdorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lindsay,+E+D">Euan D. Lindsay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Providing high-quality feedback is crucial for student success but is constrained by time, cost, and limited data availability. We introduce Synthetic Educational Feedback Loops (SEFL), a novel framework designed to deliver immediate, on-demand feedback at scale without relying on extensive, real-world student data. 
In SEFL, two large language models (LLMs) operate in teacher--student roles to simulate assignment completion and formative feedback, generating abundant synthetic pairs of student work and corresponding critiques. We then fine-tune smaller, more computationally efficient LLMs on these synthetic pairs, enabling them to replicate key features of high-quality, goal-oriented feedback. Unlike personalized tutoring approaches that offer multi-turn, individualized instruction, SEFL specifically focuses on replicating the teacher→student feedback loop for diverse assignments. Through both LLM-as-a-judge and human evaluations, we demonstrate that SEFL-tuned models outperform their non-tuned counterparts in feedback quality, clarity, and timeliness. These findings reveal SEFL's potential to transform feedback processes for higher education and beyond, offering an ethical and scalable alternative to conventional manual feedback cycles. </p> </div> </dd> <dt> <a name='item112'>[112]</a> <a href ="/abs/2502.12928" title="Abstract" id="2502.12928"> arXiv:2502.12928 </a> [<a href="/pdf/2502.12928" title="Download PDF" id="pdf-2502.12928" aria-labelledby="pdf-2502.12928">pdf</a>, <a href="https://arxiv.org/html/2502.12928v1" title="View HTML" id="html-2502.12928" aria-labelledby="html-2502.12928" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12928" title="Other formats" id="oth-2502.12928" aria-labelledby="oth-2502.12928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Finedeep: Mitigating Sparse Activation in Dense LLMs via Multi-Layer Fine-Grained Experts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Leiyu Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Z">Zhenpeng Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+M">Minxuan Lv</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+Y">Yizhe Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangwen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zijia Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jungong Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+G">Guiguang Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+C">Cheng Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Di Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gai,+K">Kun Gai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+D">Deyi Xiong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models have demonstrated exceptional performance across a wide range of tasks. However, dense models usually suffer from sparse activation, where many activation values tend towards zero (i.e., being inactivated). We argue that this could restrict the efficient exploration of model representation space. To mitigate this issue, we propose Finedeep, a deep-layered fine-grained expert architecture for dense models. Our framework partitions the feed-forward neural network layers of traditional dense models into small experts, arranges them across multiple sub-layers. A novel routing mechanism is proposed to determine each expert's contribution. We conduct extensive experiments across various model sizes, demonstrating that our approach significantly outperforms traditional dense architectures in terms of perplexity and benchmark performance while maintaining a comparable number of parameters and floating-point operations. 
Moreover, we find that Finedeep achieves optimal results when balancing depth and width, specifically by adjusting the number of expert sub-layers and the number of experts per sub-layer. Empirical results confirm that Finedeep effectively alleviates sparse activation and efficiently utilizes representation capacity in dense models. </p> </div> </dd> <dt> <a name='item113'>[113]</a> <a href ="/abs/2502.12932" title="Abstract" id="2502.12932"> arXiv:2502.12932 </a> [<a href="/pdf/2502.12932" title="Download PDF" id="pdf-2502.12932" aria-labelledby="pdf-2502.12932">pdf</a>, <a href="https://arxiv.org/html/2502.12932v1" title="View HTML" id="html-2502.12932" aria-labelledby="html-2502.12932" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12932" title="Other formats" id="oth-2502.12932" aria-labelledby="oth-2502.12932">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Synthetic Data Generation for Culturally Nuanced Commonsense Reasoning in Low-Resource Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pranida,+S+Z">Salsabila Zahirah Pranida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Genadi,+R+A">Rifo Ahmad Genadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koto,+F">Fajri Koto</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages total: 8 pages of main body, 6 pages of appendix. 4 figures in main body, 6 figures in appendix. Submitted to ARR on February 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Quantifying reasoning capability in low-resource languages remains a challenge in NLP due to data scarcity and limited access to annotators. 
While LLM-assisted dataset construction has proven useful for medium- and high-resource languages, its effectiveness in low-resource languages, particularly for commonsense reasoning, is still unclear. In this paper, we compare three dataset creation strategies: (1) LLM-assisted dataset generation, (2) machine translation, and (3) human-written data by native speakers, to build a culturally nuanced story comprehension dataset. We focus on Javanese and Sundanese, two major local languages in Indonesia, and evaluate the effectiveness of open-weight and closed-weight LLMs in assisting dataset creation through extensive manual validation. To assess the utility of synthetic data, we fine-tune language models on classification and generation tasks using this data and evaluate performance on a human-written test set. Our findings indicate that LLM-assisted data creation outperforms machine translation. </p> </div> </dd> <dt> <a name='item114'>[114]</a> <a href ="/abs/2502.12945" title="Abstract" id="2502.12945"> arXiv:2502.12945 </a> [<a href="/pdf/2502.12945" title="Download PDF" id="pdf-2502.12945" aria-labelledby="pdf-2502.12945">pdf</a>, <a href="https://arxiv.org/html/2502.12945v1" title="View HTML" id="html-2502.12945" aria-labelledby="html-2502.12945" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12945" title="Other formats" id="oth-2502.12945" aria-labelledby="oth-2502.12945">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMPopcorn: An Empirical Study of LLMs as Assistants for Popular Micro-video Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+J">Junchen Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+X">Xuri Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+K">Kaiwen Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arapakis,+I">Ioannis 
Arapakis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xin,+X">Xin Xin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jose,+J+M">Joemon M. Jose</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Popular Micro-videos, dominant on platforms like TikTok and YouTube, hold significant commercial value. The rise of high-quality AI-generated content has spurred interest in AI-driven micro-video creation. However, despite the advanced capabilities of large language models (LLMs) like ChatGPT and DeepSeek in text generation and reasoning, their potential to assist the creation of popular micro-videos remains largely unexplored. <br>In this paper, we conduct an empirical study on LLM-assisted popular micro-video generation (LLMPopcorn). Specifically, we investigate the following research questions: (i) How can LLMs be effectively utilized to assist popular micro-video generation? (ii) To what extent can prompt-based enhancements optimize the LLM-generated content for higher popularity? (iii) How well do various LLMs and video generators perform in the popular micro-video generation task? By exploring these questions, we show that advanced LLMs like DeepSeek-V3 enable micro-video generation to achieve popularity comparable to human-created content. Prompt enhancements further boost popularity, and benchmarking highlights DeepSeek-V3 and DeepSeek-R1 among LLMs, while LTX-Video and HunyuanVideo lead in video generation. This pioneering work advances AI-assisted micro-video creation, uncovering new research opportunities. We will release the code and datasets to support future studies. 
</p> </div> </dd> <dt> <a name='item115'>[115]</a> <a href ="/abs/2502.12947" title="Abstract" id="2502.12947"> arXiv:2502.12947 </a> [<a href="/pdf/2502.12947" title="Download PDF" id="pdf-2502.12947" aria-labelledby="pdf-2502.12947">pdf</a>, <a href="https://arxiv.org/html/2502.12947v1" title="View HTML" id="html-2502.12947" aria-labelledby="html-2502.12947" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12947" title="Other formats" id="oth-2502.12947" aria-labelledby="oth-2502.12947">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Every Expert Matters: Towards Effective Knowledge Distillation for Mixture-of-Experts Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+G">Gyeongman Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+G">Gyouk Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+E">Eunho Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> With the emergence of Mixture-of-Experts (MoE), the efficient scaling of model size has accelerated the development of large language models in recent years. However, their high memory requirements prevent their use in resource-constrained environments. While knowledge distillation (KD) has been a proven method for model compression, its application to MoE teacher models remains underexplored. Through our investigation, we discover that non-activated experts in MoE models possess valuable knowledge that benefits student models. We further demonstrate that existing KD methods are not optimal for compressing MoE models, as they fail to leverage this knowledge effectively. 
To address this, we propose two intuitive MoE-specific KD methods for the first time: Knowledge Augmentation (KA) and Student-Aware Router (SAR), both designed to effectively extract knowledge from all experts. Specifically, KA augments knowledge by sampling experts multiple times, while SAR uses all experts and adjusts the expert weights through router training to provide optimal knowledge. Extensive experiments show that our methods outperform conventional KD methods, demonstrating their effectiveness for MoE teacher models. </p> </div> </dd> <dt> <a name='item116'>[116]</a> <a href ="/abs/2502.12953" title="Abstract" id="2502.12953"> arXiv:2502.12953 </a> [<a href="/pdf/2502.12953" title="Download PDF" id="pdf-2502.12953" aria-labelledby="pdf-2502.12953">pdf</a>, <a href="https://arxiv.org/html/2502.12953v1" title="View HTML" id="html-2502.12953" aria-labelledby="html-2502.12953" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12953" title="Other formats" id="oth-2502.12953" aria-labelledby="oth-2502.12953">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Task-Informed Anti-Curriculum by Masking Improves Downstream Performance on Text </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jarca,+A">Andrei Jarca</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Croitoru,+F+A">Florinel Alin Croitoru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ionescu,+R+T">Radu Tudor Ionescu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Masked language modeling has become a widely adopted unsupervised technique to pre-train language models. 
However, the process of selecting tokens for masking is random, and the percentage of masked tokens is typically fixed for the entire training process. In this paper, we propose to adjust the masking ratio and to decide which tokens to mask based on a novel task-informed anti-curriculum learning scheme. First, we harness task-specific knowledge about useful and harmful tokens in order to determine which tokens to mask. Second, we propose a cyclic decaying masking ratio, which corresponds to an anti-curriculum schedule (from hard to easy). We exemplify our novel task-informed anti-curriculum by masking (TIACBM) approach across three diverse downstream tasks: sentiment analysis, text classification by topic, and authorship attribution. Our findings suggest that TIACBM enhances the ability of the model to focus on key task-relevant features, contributing to statistically significant performance gains across tasks. We release our code at <a href="https://github.com/JarcaAndrei/TIACBM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item117'>[117]</a> <a href ="/abs/2502.12959" title="Abstract" id="2502.12959"> arXiv:2502.12959 </a> [<a href="/pdf/2502.12959" title="Download PDF" id="pdf-2502.12959" aria-labelledby="pdf-2502.12959">pdf</a>, <a href="https://arxiv.org/html/2502.12959v1" title="View HTML" id="html-2502.12959" aria-labelledby="html-2502.12959" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12959" title="Other formats" id="oth-2502.12959" aria-labelledby="oth-2502.12959">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AlignFreeze: Navigating the Impact of Realignment on the Layers of Multilingual Models Across Diverse Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bakos,+S">Steve Bakos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gaschi,+F">Félix Gaschi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guzm%C3%A1n,+D">David Guzmán</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=More,+R">Riddhi More</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K+C">Kelly Chutong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+E+A">En-Shiun Annie Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages, 2 figures, to be published in Proceedings of NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Realignment techniques are often employed to enhance cross-lingual transfer in multilingual language models, still, they can sometimes degrade performance in languages that differ significantly from the fine-tuned source language. 
This paper introduces AlignFreeze, a method that freezes either the layers' lower half or upper half during realignment. Through controlled experiments on 4 tasks, 3 models, and in 35 languages, we find that realignment affects all the layers but can be the most detrimental to the lower ones. Freezing the lower layers can prevent performance degradation. Particularly, AlignFreeze improves Part-of-Speech (PoS) tagging performances in languages where full realignment fails: with XLM-R, it provides improvements of more than one standard deviation in accuracy in seven more languages than full realignment. </p> </div> </dd> <dt> <a name='item118'>[118]</a> <a href ="/abs/2502.12962" title="Abstract" id="2502.12962"> arXiv:2502.12962 </a> [<a href="/pdf/2502.12962" title="Download PDF" id="pdf-2502.12962" aria-labelledby="pdf-2502.12962">pdf</a>, <a href="https://arxiv.org/html/2502.12962v1" title="View HTML" id="html-2502.12962" aria-labelledby="html-2502.12962" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12962" title="Other formats" id="oth-2502.12962" aria-labelledby="oth-2502.12962">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Infinite Retrieval: Attention Enhanced LLMs in Long-Context Processing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+X">Xiaoju Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhichun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingyuan Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Limited by the context window size of Large Language Models(LLMs), handling various tasks with input tokens exceeding the upper 
limit has been challenging, whether it is a simple direct retrieval task or a complex multi-hop reasoning task. Although various methods have been proposed to enhance the long-context processing capabilities of LLMs, they either incur substantial post-training costs, or require additional tool modules(e.g.,RAG), or have not shown significant improvement in realistic tasks. Our work observes the correlation between the attention distribution and generated answers across each layer, and establishes the attention allocation aligns with retrieval-augmented capabilities through experiments. Drawing on the above insights, we propose a novel method InfiniRetri that leverages the LLM's own attention information to enable accurate retrieval across inputs of infinite length. Our evaluations indicate that InfiniRetri achieves 100% accuracy in the Needle-In-a-Haystack(NIH) test over 1M tokens using a 0.5B parameter model, surpassing other methods or larger models and setting a new state-of-the-art(SOTA). Moreover, our method achieves significant performance improvements on real-world benchmarks, with a maximum 288% improvement. In addition, InfiniRetri can be applied to any Transformer-based LLMs without additional training and substantially reduces inference latency and compute overhead in long texts. In summary, our comprehensive studies show InfiniRetri's potential for practical applications and creates a paradigm for retrieving information using LLMs' own capabilities under infinite-length tokens. Code will be released in link. 
</p> </div> </dd> <dt> <a name='item119'>[119]</a> <a href ="/abs/2502.12964" title="Abstract" id="2502.12964"> arXiv:2502.12964 </a> [<a href="/pdf/2502.12964" title="Download PDF" id="pdf-2502.12964" aria-labelledby="pdf-2502.12964">pdf</a>, <a href="https://arxiv.org/html/2502.12964v1" title="View HTML" id="html-2502.12964" aria-labelledby="html-2502.12964" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12964" title="Other formats" id="oth-2502.12964" aria-labelledby="oth-2502.12964">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Trust Me, I'm Wrong: High-Certainty Hallucinations in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Simhi,+A">Adi Simhi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Itzhak,+I">Itay Itzhak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barez,+F">Fazl Barez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stanovsky,+G">Gabriel Stanovsky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) often generate outputs that lack grounding in real-world facts, a phenomenon known as hallucinations. Prior research has associated hallucinations with model uncertainty, leveraging this relationship for hallucination detection and mitigation. In this paper, we challenge the underlying assumption that all hallucinations are associated with uncertainty. Using knowledge detection and uncertainty measurement methods, we demonstrate that models can hallucinate with high certainty even when they have the correct knowledge. 
We further show that high-certainty hallucinations are consistent across models and datasets, distinctive enough to be singled out, and challenge existing mitigation methods. Our findings reveal an overlooked aspect of hallucinations, emphasizing the need to understand their origins and improve mitigation strategies to enhance LLM safety. The code is available at <a href="https://github.com/technion-cs-nlp/Trust_me_Im_wrong" rel="external noopener nofollow" class="link-external link-https">this https URL</a> . </p> </div> </dd> <dt> <a name='item120'>[120]</a> <a href ="/abs/2502.12965" title="Abstract" id="2502.12965"> arXiv:2502.12965 </a> [<a href="/pdf/2502.12965" title="Download PDF" id="pdf-2502.12965" aria-labelledby="pdf-2502.12965">pdf</a>, <a href="https://arxiv.org/html/2502.12965v1" title="View HTML" id="html-2502.12965" aria-labelledby="html-2502.12965" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12965" title="Other formats" id="oth-2502.12965" aria-labelledby="oth-2502.12965">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of Text Classification Under Class Distribution Shift </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Costache,+A+V">Adriana Valentina Costache</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gheorghe,+S+F">Silviu Florin Gheorghe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Poesina,+E+G">Eduard Gabriel Poesina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Irofti,+P">Paul Irofti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ionescu,+R+T">Radu Tudor Ionescu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The basic underlying 
assumption of machine learning (ML) models is that the training and test data are sampled from the same distribution. However, in daily practice, this assumption is often broken, i.e., the distribution of the test data changes over time, which hinders the application of conventional ML models. One domain where the distribution shift naturally occurs is text classification, since people always find new topics to discuss. To this end, we survey research articles studying open-set text classification and related tasks. We divide the methods in this area based on the constraints that define the kind of distribution shift and the corresponding problem formulation, i.e., learning with the Universum, zero-shot learning, and open-set learning. We next discuss the predominant mitigation approaches for each problem setup. Finally, we identify several future work directions, aiming to push the boundaries beyond the state of the art. Interestingly, we find that continual learning can solve many of the issues caused by the shifting class distribution. We maintain a list of relevant papers at <a href="https://github.com/Eduard6421/Open-Set-Survey" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item121'>[121]</a> <a href ="/abs/2502.12970" title="Abstract" id="2502.12970"> arXiv:2502.12970 </a> [<a href="/pdf/2502.12970" title="Download PDF" id="pdf-2502.12970" aria-labelledby="pdf-2502.12970">pdf</a>, <a href="https://arxiv.org/html/2502.12970v1" title="View HTML" id="html-2502.12970" aria-labelledby="html-2502.12970" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12970" title="Other formats" id="oth-2502.12970" aria-labelledby="oth-2502.12970">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning-to-Defend: Safety-Aware Reasoning Can Defend Large Language Models from Jailbreaking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Junda Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+L">Lingyong Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuaiqiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Dawei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sha,+L">Lei Sha</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The reasoning abilities of Large Language Models (LLMs) have demonstrated remarkable advancement and exceptional performance across diverse domains. However, leveraging these reasoning capabilities to enhance LLM safety against adversarial attacks and jailbreak queries remains largely unexplored. To bridge this gap, we propose Reasoning-to-Defend (R2D), a novel training paradigm that integrates safety reflections of queries and responses into LLMs' generation process, unlocking a safety-aware reasoning mechanism. 
This approach enables self-evaluation at each reasoning step to create safety pivot tokens as indicators of the response's safety status. Furthermore, in order to improve the learning efficiency of pivot token prediction, we propose Contrastive Pivot Optimization(CPO), which enhances the model's ability to perceive the safety status of dialogues. Through this mechanism, LLMs dynamically adjust their response strategies during reasoning, significantly enhancing their defense capabilities against jailbreak attacks. Extensive experimental results demonstrate that R2D effectively mitigates various attacks and improves overall safety, highlighting the substantial potential of safety-aware reasoning in strengthening LLMs' robustness against jailbreaks. </p> </div> </dd> <dt> <a name='item122'>[122]</a> <a href ="/abs/2502.12982" title="Abstract" id="2502.12982"> arXiv:2502.12982 </a> [<a href="/pdf/2502.12982" title="Download PDF" id="pdf-2502.12982" aria-labelledby="pdf-2502.12982">pdf</a>, <a href="https://arxiv.org/html/2502.12982v1" title="View HTML" id="html-2502.12982" aria-labelledby="html-2502.12982" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12982" title="Other formats" id="oth-2502.12982" aria-labelledby="oth-2502.12982">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sailor2: Sailing in South-East Asia with Inclusive Multilingual LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+L">Longxu Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qian Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Fan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Changyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zili Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Z">Ziqi Jin</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zichen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+T">Tongyao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+C">Cunxiao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+P">Penghui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haonan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yongchi Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+X">Xiachong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+X">Xin Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yeung,+M+T">Man Tsung Yeung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pipatanakul,+K">Kunat Pipatanakul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koto,+F">Fajri Koto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thu,+M+S">Min Si Thu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kydl%C3%AD%C4%8Dek,+H">Hynek Kydl铆膷ek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zeyi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qunshu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sittipong">Sittipong Sripaisarnmongkol</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sae-Khow,+K">Kridtaphad Sae-Khow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thongchim,+N">Nirattisai Thongchim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Konkaew,+T">Taechawat Konkaew</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Borijindargoon,+N">Narong Borijindargoon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dao,+A">Anh 
Dao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Maneegard,+M">Matichon Maneegard</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Artkaew,+P">Phakphum Artkaew</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yong,+Z">Zheng-Xin Yong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+Q">Quan Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Phatthiyaphaibun,+W">Wannaphong Phatthiyaphaibun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tran,+H+H">Hoang H. Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mike Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shiqi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+T">Tianyu Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+C">Chao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xinyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+W">Wei Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+M">Min Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 49 pages, 16 figures. Technical Report of Sailor2: <a href="https://sea-sailor.github.io/blog/sailor2/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Sailor2 is a family of cutting-edge multilingual language models for South-East Asian (SEA) languages, available in 1B, 8B, and 20B sizes to suit diverse applications. 
Building on Qwen2.5, Sailor2 undergoes continuous pre-training on 500B tokens (400B SEA-specific and 100B replay tokens) to support 13 SEA languages while retaining proficiency in Chinese and English. Sailor2-20B model achieves a 50-50 win rate against GPT-4o across SEA languages. We also deliver a comprehensive cookbook on how to develop the multilingual model in an efficient manner, including five key aspects: data curation, pre-training, post-training, model customization and evaluation. We hope that Sailor2 model (Apache 2.0 license) will drive language development in the SEA region, and Sailor2 cookbook will inspire researchers to build more inclusive LLMs for other under-served languages. </p> </div> </dd> <dt> <a name='item123'>[123]</a> <a href ="/abs/2502.12988" title="Abstract" id="2502.12988"> arXiv:2502.12988 </a> [<a href="/pdf/2502.12988" title="Download PDF" id="pdf-2502.12988" aria-labelledby="pdf-2502.12988">pdf</a>, <a href="https://arxiv.org/html/2502.12988v1" title="View HTML" id="html-2502.12988" aria-labelledby="html-2502.12988" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12988" title="Other formats" id="oth-2502.12988" aria-labelledby="oth-2502.12988">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Profile: From Surface-Level Facts to Deep Persona Simulation in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zixiao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Duzhen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agrawal,+I">Ishita Agrawal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shen Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Le Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiuying Chen</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 19 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Previous approaches to persona simulation with large language models (LLMs) have typically relied on learning basic biographical information, or using limited role-play dialogue datasets to capture a character's responses. However, a holistic representation of an individual goes beyond surface-level facts or conversations to deeper thoughts and thinking. In this work, we introduce CharacterBot, a model designed to replicate both the linguistic patterns and distinctive thought processes of a character. Using Lu Xun, a renowned Chinese writer, as a case study, we propose four training tasks derived from his 17 essay collections. These include a pre-training task focused on mastering external linguistic structures and knowledge, as well as three fine-tuning tasks: multiple-choice question answering, generative question answering, and style transfer, each aligning the LLM with Lu Xun's internal ideation and writing style. To optimize learning across these tasks, we introduce a CharLoRA parameter updating mechanism, where a general linguistic style expert collaborates with other task-specific experts to better study both the language style and the understanding of deeper thoughts. We evaluate CharacterBot on three tasks for linguistic accuracy and opinion comprehension, demonstrating that it significantly outperforms the baselines on our adapted metrics. We hope that this work inspires future research on deep character persona simulation with LLMs. 
</p> </div> </dd> <dt> <a name='item124'>[124]</a> <a href ="/abs/2502.12992" title="Abstract" id="2502.12992"> arXiv:2502.12992 </a> [<a href="/pdf/2502.12992" title="Download PDF" id="pdf-2502.12992" aria-labelledby="pdf-2502.12992">pdf</a>, <a href="https://arxiv.org/html/2502.12992v1" title="View HTML" id="html-2502.12992" aria-labelledby="html-2502.12992" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12992" title="Other formats" id="oth-2502.12992" aria-labelledby="oth-2502.12992">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> B-cos LM: Efficiently Transforming Pre-trained Language Models for Improved Explainability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yifan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rao,+S">Sukrut Rao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Ji-Ung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jobanputra,+M">Mayank Jobanputra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Demberg,+V">Vera Demberg</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 15 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Post-hoc explanation methods for black-box models often struggle with faithfulness and human interpretability due to the lack of explainability in current neural models. Meanwhile, B-cos networks have been introduced to improve model explainability through architectural and computational adaptations, but their application has so far been limited to computer vision models and their associated training pipelines. 
In this work, we introduce B-cos LMs, i.e., B-cos networks empowered for NLP tasks. Our approach directly transforms pre-trained language models into B-cos LMs by combining B-cos conversion and task fine-tuning, improving efficiency compared to previous B-cos methods. Our automatic and human evaluation results demonstrate that B-cos LMs produce more faithful and human interpretable explanations than post hoc methods, while maintaining task performance comparable to conventional fine-tuning. Our in-depth analysis explores how B-cos LMs differ from conventionally fine-tuned models in their learning processes and explanation patterns. Finally, we provide practical guidelines for effectively building B-cos LMs based on our findings. Our code is available at <a href="https://anonymous.4open.science/r/bcos_lm" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item125'>[125]</a> <a href ="/abs/2502.12996" title="Abstract" id="2502.12996"> arXiv:2502.12996 </a> [<a href="/pdf/2502.12996" title="Download PDF" id="pdf-2502.12996" aria-labelledby="pdf-2502.12996">pdf</a>, <a href="https://arxiv.org/html/2502.12996v1" title="View HTML" id="html-2502.12996" aria-labelledby="html-2502.12996" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12996" title="Other formats" id="oth-2502.12996" aria-labelledby="oth-2502.12996">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Eager Updates For Overlapped Communication and Computation in DiLoCo </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kale,+S">Satyen Kale</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Douillard,+A">Arthur Douillard</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Donchev,+Y">Yanislav Donchev</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 
arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2501.18512" data-arxiv-id="2501.18512" class="link-https">arXiv:2501.18512</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Distributed optimization methods such as DiLoCo have been shown to be effective in training very large models across multiple distributed workers, such as datacenters. These methods split updates into two parts: an inner optimization phase, where the workers independently execute multiple optimization steps on their own local data, and an outer optimization step, where the inner updates are synchronized. While such approaches require orders of magnitude less communication than standard data-parallel training, in settings where the workers are datacenters, even the limited communication requirements of these approaches can still cause significant slow downs due to the blocking necessary at each outer optimization step. In this paper, we investigate techniques to mitigate this issue by overlapping communication with computation in a manner that allows the outer optimization step to fully overlap with the inner optimization phase. We show that a particular variant, dubbed eager updates, provides competitive performance with standard DiLoCo in settings with low bandwidth between workers. 
</p> </div> </dd> <dt> <a name='item126'>[126]</a> <a href ="/abs/2502.13004" title="Abstract" id="2502.13004"> arXiv:2502.13004 </a> [<a href="/pdf/2502.13004" title="Download PDF" id="pdf-2502.13004" aria-labelledby="pdf-2502.13004">pdf</a>, <a href="https://arxiv.org/html/2502.13004v1" title="View HTML" id="html-2502.13004" aria-labelledby="html-2502.13004" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13004" title="Other formats" id="oth-2502.13004" aria-labelledby="oth-2502.13004">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Barriers: Evaluating Cross-Lingual Performance of CNN and Transformer Architectures for Speech Quality Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wardah,+W">Wafaa Wardah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=B%C3%BCy%C3%BCkta%C5%9F,+T+M+K">Tuğçe Melike Koçak Büyüktaş</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shchegelskiy,+K">Kirill Shchegelskiy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=M%C3%B6ller,+S">Sebastian Möller</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Spang,+R+P">Robert P. Spang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Objective speech quality models aim to predict human-perceived speech quality using automated methods. However, cross-lingual generalization remains a major challenge, as Mean Opinion Scores (MOS) vary across languages due to linguistic, perceptual, and dataset-specific differences. A model trained primarily on English data may struggle to generalize to languages with different phonetic, tonal, and prosodic characteristics, leading to inconsistencies in objective assessments. 
This study investigates the cross-lingual performance of two speech quality models: NISQA, a CNN-based model, and a Transformer-based Audio Spectrogram Transformer (AST) model. Both models were trained exclusively on English datasets containing over 49,000 speech samples and subsequently evaluated on speech in German, French, Mandarin, Swedish, and Dutch. We analyze model performance using Pearson Correlation Coefficient (PCC) and Root Mean Square Error (RMSE) across five speech quality dimensions: coloration, discontinuity, loudness, noise, and MOS. Our findings show that while AST achieves a more stable cross-lingual performance, both models exhibit noticeable biases. Notably, Mandarin speech quality predictions correlate highly with human MOS scores, whereas Swedish and Dutch present greater prediction challenges. Discontinuities remain difficult to model across all languages. These results highlight the need for more balanced multilingual datasets and architecture-specific adaptations to improve cross-lingual generalization. 
</p> </div> </dd> <dt> <a name='item127'>[127]</a> <a href ="/abs/2502.13010" title="Abstract" id="2502.13010"> arXiv:2502.13010 </a> [<a href="/pdf/2502.13010" title="Download PDF" id="pdf-2502.13010" aria-labelledby="pdf-2502.13010">pdf</a>, <a href="https://arxiv.org/html/2502.13010v1" title="View HTML" id="html-2502.13010" aria-labelledby="html-2502.13010" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13010" title="Other formats" id="oth-2502.13010" aria-labelledby="oth-2502.13010">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adaptive Knowledge Graphs Enhance Medical Question Answering: Bridging the Gap Between LLMs and Evolving Medical Knowledge </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rezaei,+M+R">Mohammad Reza Rezaei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fard,+R+S">Reza Saadati Fard</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Parker,+J">Jayson Parker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishnan,+R+G">Rahul G. Krishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lankarany,+M">Milad Lankarany</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Multiagent Systems (cs.MA) </div> <p class='mathjax'> Large Language Models (LLMs) have significantly advanced medical question-answering by leveraging extensive clinical data and medical literature. However, the rapid evolution of medical knowledge and the labor-intensive process of manually updating domain-specific resources pose challenges to the reliability of these systems. 
To address this, we introduce Adaptive Medical Graph-RAG (AMG-RAG), a comprehensive framework that automates the construction and continuous updating of medical knowledge graphs, integrates reasoning, and retrieves current external evidence, such as PubMed and WikiSearch. By dynamically linking new findings and complex medical concepts, AMG-RAG not only improves accuracy but also enhances interpretability in medical queries. <br>Evaluations on the MEDQA and MEDMCQA benchmarks demonstrate the effectiveness of AMG-RAG, achieving an F1 score of 74.1 percent on MEDQA and an accuracy of 66.34 percent on MEDMCQA, outperforming both comparable models and those 10 to 100 times larger. Notably, these improvements are achieved without increasing computational overhead, highlighting the critical role of automated knowledge graph generation and external evidence retrieval in delivering up-to-date, trustworthy medical insights. </p> </div> </dd> <dt> <a name='item128'>[128]</a> <a href ="/abs/2502.13019" title="Abstract" id="2502.13019"> arXiv:2502.13019 </a> [<a href="/pdf/2502.13019" title="Download PDF" id="pdf-2502.13019" aria-labelledby="pdf-2502.13019">pdf</a>, <a href="https://arxiv.org/html/2502.13019v1" title="View HTML" id="html-2502.13019" aria-labelledby="html-2502.13019" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13019" title="Other formats" id="oth-2502.13019" aria-labelledby="oth-2502.13019">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Oreo: A Plug-in Context Reconstructor to Enhance Retrieval-Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Sha Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramarkrishnan,+N">Naren Ramarkrishnan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite the remarkable capabilities of Large Language Models (LLMs) in various NLP tasks, they remain vulnerable to hallucinations due to their limited parametric knowledge and lack of domain-specific expertise. Retrieval-Augmented Generation (RAG) addresses this challenge by incorporating external document retrieval to augment the knowledge base of LLMs. In this approach, RAG retrieves document chunks from an external corpus in response to a query, which are then used as context for the downstream language model to generate an answer. However, these retrieved knowledge sources often include irrelevant or erroneous information, undermining the effectiveness of RAG in downstream tasks. To overcome this limitation, we introduce a compact, efficient, and pluggable module designed to refine external knowledge sources before feeding them to the generator. The module reconstructs retrieved content by extracting the most relevant and supportive information and reorganising it into a concise, query-specific format. Through a three-stage training paradigm - comprising supervised fine-tuning, contrastive multi-task learning, and reinforcement learning-based alignment - it prioritises critical knowledge and aligns it with the generator's preferences. This method enables LLMs to produce outputs that are more accurate, reliable, and contextually appropriate. 
</p> </div> </dd> <dt> <a name='item129'>[129]</a> <a href ="/abs/2502.13028" title="Abstract" id="2502.13028"> arXiv:2502.13028 </a> [<a href="/pdf/2502.13028" title="Download PDF" id="pdf-2502.13028" aria-labelledby="pdf-2502.13028">pdf</a>, <a href="https://arxiv.org/html/2502.13028v1" title="View HTML" id="html-2502.13028" aria-labelledby="html-2502.13028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13028" title="Other formats" id="oth-2502.13028" aria-labelledby="oth-2502.13028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Whose story is it? Personalizing story generation by inferring author styles </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+N+A">Nischal Ashok Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pham,+C+M">Chau Minh Pham</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Iyyer,+M">Mohit Iyyer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+A">Andrew Lan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint 52 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Personalization has become essential for improving user experience in interactive writing and educational applications, yet its potential in story generation remains largely unexplored. In this work, we propose a novel two-stage pipeline for personalized story generation. Our approach first infers an author's implicit story-writing characteristics from their past work and organizes them into an Author Writing Sheet, inspired by narrative theory. The second stage uses this sheet to simulate the author's persona through tailored persona descriptions and personalized story writing rules. 
To enable and validate our approach, we construct Mythos, a dataset of 590 stories from 64 authors across five distinct sources that reflect diverse story-writing settings. A head-to-head comparison with a non-personalized baseline demonstrates our pipeline's effectiveness in generating high-quality personalized stories. Our personalized stories achieve a 75 percent win rate (versus 14 percent for the baseline and 11 percent ties) in capturing authors' writing style based on their past works. Human evaluation highlights the high quality of our Author Writing Sheet and provides valuable insights into the personalized story generation task. Notable takeaways are that writings from certain sources, such as Reddit, are easier to personalize than others, like AO3, while narrative aspects, like Creativity and Language Use, are easier to personalize than others, like Plot. </p> </div> </dd> <dt> <a name='item130'>[130]</a> <a href ="/abs/2502.13031" title="Abstract" id="2502.13031"> arXiv:2502.13031 </a> [<a href="/pdf/2502.13031" title="Download PDF" id="pdf-2502.13031" aria-labelledby="pdf-2502.13031">pdf</a>, <a href="https://arxiv.org/html/2502.13031v1" title="View HTML" id="html-2502.13031" aria-labelledby="html-2502.13031" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13031" title="Other formats" id="oth-2502.13031" aria-labelledby="oth-2502.13031">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HPSS: Heuristic Prompting Strategy Search for LLM Evaluators </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+B">Bosi Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ke,+P">Pei Ke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yufei Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Cunxiang Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+X">Xiaotao Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jinfeng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jie Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongning Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+M">Minlie Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 pages, 10 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Since the adoption of large language models (LLMs) for text evaluation has become increasingly prevalent in the field of natural language processing (NLP), a series of existing works attempt to optimize the prompts for LLM evaluators to improve their alignment with human judgment. However, their efforts are limited to optimizing individual factors of evaluation prompts, such as evaluation criteria or output formats, neglecting the combinatorial impact of multiple factors, which leads to insufficient optimization of the evaluation pipeline. Nevertheless, identifying well-behaved prompting strategies for adjusting multiple factors requires extensive enumeration. To this end, we comprehensively integrate 8 key factors for evaluation prompts and propose a novel automatic prompting strategy optimization method called Heuristic Prompting Strategy Search (HPSS). Inspired by the genetic algorithm, HPSS conducts an iterative search to find well-behaved prompting strategies for LLM evaluators. A heuristic function is employed to guide the search process, enhancing the performance of our algorithm. 
Extensive experiments across four evaluation tasks demonstrate the effectiveness of HPSS, consistently outperforming both human-designed evaluation prompts and existing automatic prompt optimization methods. </p> </div> </dd> <dt> <a name='item131'>[131]</a> <a href ="/abs/2502.13034" title="Abstract" id="2502.13034"> arXiv:2502.13034 </a> [<a href="/pdf/2502.13034" title="Download PDF" id="pdf-2502.13034" aria-labelledby="pdf-2502.13034">pdf</a>, <a href="https://arxiv.org/html/2502.13034v1" title="View HTML" id="html-2502.13034" aria-labelledby="html-2502.13034" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13034" title="Other formats" id="oth-2502.13034" aria-labelledby="oth-2502.13034">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Natural Language Generation from Visual Sequences: Challenges and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Surikuchi,+A+K">Aditya K Surikuchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fern%C3%A1ndez,+R">Raquel Fernández</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pezzelle,+S">Sandro Pezzelle</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> The ability to use natural language to talk about visual content is at the core of human intelligence and a crucial feature of any artificial intelligence system. Various studies have focused on generating text for single images. In contrast, comparatively little attention has been paid to exhaustively analyzing and advancing work on multiple-image vision-to-text settings. 
In this position paper, we claim that any task dealing with temporally ordered sequences of multiple images or frames is an instance of a broader, more general problem involving the understanding of intricate relationships between the visual content and the corresponding text. We comprehensively analyze five tasks that are instances of this problem and argue that they pose a common set of challenges and share similarities in terms of modeling and evaluation approaches. Based on the insights from these various aspects and stages of multi-image-to-text generation, we highlight several open questions and suggest future research directions. We believe that these directions can advance the understanding of complex phenomena in this domain and the development of better models. </p> </div> </dd> <dt> <a name='item132'>[132]</a> <a href ="/abs/2502.13044" title="Abstract" id="2502.13044"> arXiv:2502.13044 </a> [<a href="/pdf/2502.13044" title="Download PDF" id="pdf-2502.13044" aria-labelledby="pdf-2502.13044">pdf</a>, <a href="https://arxiv.org/html/2502.13044v1" title="View HTML" id="html-2502.13044" aria-labelledby="html-2502.13044" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13044" title="Other formats" id="oth-2502.13044" aria-labelledby="oth-2502.13044">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do we still need Human Annotators? 
Prompting Large Language Models for Aspect Sentiment Quad Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hellwig,+N+C">Nils Constantin Hellwig</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fehle,+J">Jakob Fehle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kruschwitz,+U">Udo Kruschwitz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wolff,+C">Christian Wolff</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Aspect sentiment quadruple prediction (ASQP) facilitates a detailed understanding of opinions expressed in a text by identifying the opinion term, aspect term, aspect category and sentiment polarity for each opinion. However, annotating a full set of training examples to fine-tune models for ASQP is a resource-intensive process. In this study, we explore the capabilities of large language models (LLMs) for zero- and few-shot learning on the ASQP task across five diverse datasets. We report F1 scores slightly below those obtained with state-of-the-art fine-tuned models but exceeding previously reported zero- and few-shot performance. In the 40-shot setting on the Rest16 restaurant domain dataset, LLMs achieved an F1 score of 52.46, compared to 60.39 by the best-performing fine-tuned method MVP. Additionally, we report the performance of LLMs in target aspect sentiment detection (TASD), where the F1 scores were also close to fine-tuned models, achieving 66.03 on Rest16 in the 40-shot setting, compared to 72.76 with MVP. While human annotators remain essential for achieving optimal performance, LLMs can reduce the need for extensive manual annotation in ASQP tasks. 
</p> </div> </dd> <dt> <a name='item133'>[133]</a> <a href ="/abs/2502.13053" title="Abstract" id="2502.13053"> arXiv:2502.13053 </a> [<a href="/pdf/2502.13053" title="Download PDF" id="pdf-2502.13053" aria-labelledby="pdf-2502.13053">pdf</a>, <a href="https://arxiv.org/html/2502.13053v1" title="View HTML" id="html-2502.13053" aria-labelledby="html-2502.13053" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13053" title="Other formats" id="oth-2502.13053" aria-labelledby="oth-2502.13053">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AEIA-MN: Evaluating the Robustness of Multimodal LLM-Powered Mobile Agents Against Active Environmental Injection Attacks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yurun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xueyu Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+K">Keting Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Juncheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shengyu Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As researchers continuously optimize AI agents to perform tasks more effectively within operating systems, they often neglect to address the critical need for enabling these agents to identify "impostors" within the system. Through an analysis of the agents' operating environment, we identified a potential threat: attackers can disguise their attack methods as environmental elements, injecting active disturbances into the agents' execution process, thereby disrupting their decision-making. We define this type of attack as Active Environment Injection Attack (AEIA). 
Based on this, we propose AEIA-MN, an active environment injection attack scheme that exploits interaction vulnerabilities in the mobile operating system to evaluate the robustness of MLLM-based agents against such threats. Experimental results show that even advanced MLLMs are highly vulnerable to this attack, achieving a maximum attack success rate of 93% in the AndroidWorld benchmark. </p> </div> </dd> <dt> <a name='item134'>[134]</a> <a href ="/abs/2502.13059" title="Abstract" id="2502.13059"> arXiv:2502.13059 </a> [<a href="/pdf/2502.13059" title="Download PDF" id="pdf-2502.13059" aria-labelledby="pdf-2502.13059">pdf</a>, <a href="/format/2502.13059" title="Other formats" id="oth-2502.13059" aria-labelledby="oth-2502.13059">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SimpleVQA: Multimodal Factuality Evaluation for Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xianfu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shiwei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jian Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xiangyuan Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xianjie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+G">Ge Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mai,+Y">Yuying Mai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Y">Yutao Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zhoufutu Wen</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+K">Ke Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Baorui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Weixiao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yunhong Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tongliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wenhao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhoujun Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The increasing application of multi-modal large language models (MLLMs) across various sectors has spotlighted the essence of their output reliability and accuracy, particularly their ability to produce content grounded in factual information (e.g. common and domain-specific knowledge). In this work, we introduce SimpleVQA, the first comprehensive multi-modal benchmark to evaluate the factuality ability of MLLMs to answer natural language short questions. SimpleVQA is characterized by six key features: it covers multiple tasks and multiple scenarios, ensures high quality and challenging queries, maintains static and timeless reference answers, and is straightforward to evaluate. Our approach involves categorizing visual question-answering items into 9 different tasks around objective events or common knowledge and situating these within 9 topics. Rigorous quality control processes are implemented to guarantee high-quality, concise, and clear answers, facilitating evaluation with minimal variance via an LLM-as-a-judge scoring system. 
Using SimpleVQA, we perform a comprehensive assessment of 18 leading MLLMs and 8 text-only LLMs, delving into their image comprehension and text generation abilities by identifying and analyzing error cases. </p> </div> </dd> <dt> <a name='item135'>[135]</a> <a href ="/abs/2502.13061" title="Abstract" id="2502.13061"> arXiv:2502.13061 </a> [<a href="/pdf/2502.13061" title="Download PDF" id="pdf-2502.13061" aria-labelledby="pdf-2502.13061">pdf</a>, <a href="https://arxiv.org/html/2502.13061v1" title="View HTML" id="html-2502.13061" aria-labelledby="html-2502.13061" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13061" title="Other formats" id="oth-2502.13061" aria-labelledby="oth-2502.13061">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improved Fine-Tuning of Large Multimodal Models for Hateful Meme Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mei,+J">Jingbiao Mei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jinghong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+G">Guangyu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+W">Weizhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Byrne,+B">Bill Byrne</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Hateful memes have become a significant concern on the Internet, necessitating robust automated detection systems. 
While large multimodal models have shown strong generalization across various tasks, they exhibit poor generalization to hateful meme detection due to the dynamic nature of memes tied to emerging social trends and breaking news. Recent work further highlights the limitations of conventional supervised fine-tuning for large multimodal models in this context. To address these challenges, we propose Large Multimodal Model Retrieval-Guided Contrastive Learning (LMM-RGCL), a novel two-stage fine-tuning framework designed to improve both in-domain accuracy and cross-domain generalization. Experimental results on six widely used meme classification datasets demonstrate that LMM-RGCL achieves state-of-the-art performance, outperforming agent-based systems such as VPD-PALI-X-55B. Furthermore, our method effectively generalizes to out-of-domain memes under low-resource settings, surpassing models like GPT-4o. </p> </div> </dd> <dt> <a name='item136'>[136]</a> <a href ="/abs/2502.13063" title="Abstract" id="2502.13063"> arXiv:2502.13063 </a> [<a href="/pdf/2502.13063" title="Download PDF" id="pdf-2502.13063" aria-labelledby="pdf-2502.13063">pdf</a>, <a href="https://arxiv.org/html/2502.13063v1" title="View HTML" id="html-2502.13063" aria-labelledby="html-2502.13063" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13063" title="Other formats" id="oth-2502.13063" aria-labelledby="oth-2502.13063">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cramming 1568 Tokens into a Single Vector and Back Again: Exploring the Limits of Embedding Space Capacity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuratov,+Y">Yuri Kuratov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arkhipov,+M">Mikhail Arkhipov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bulatov,+A">Aydar Bulatov</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Burtsev,+M">Mikhail Burtsev</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> A range of recent works addresses the problem of compression of a sequence of tokens into a shorter sequence of real-valued vectors to be used as inputs instead of token embeddings or key-value cache. These approaches make it possible to reduce the amount of compute in existing language models. Despite relying on powerful models as encoders, the maximum attainable lossless compression ratio is typically not higher than x10. This fact is highly intriguing because, in theory, the maximum information capacity of large real-valued vectors is far beyond the presented rates even for 16-bit precision and a modest vector size. In this work, we explore the limits of compression by replacing the encoder with a per-sample optimization procedure. We show that vectors with compression ratios up to x1500 exist, which highlights a two-orders-of-magnitude gap between existing and practically attainable solutions. Furthermore, we empirically show that the compression limits are determined not by the length of the input but by the amount of uncertainty to be reduced, namely, the cross-entropy loss on this sequence without any conditioning. The obtained limits highlight the substantial gap between the theoretical capacity of input embeddings and their practical utilization, suggesting significant room for optimization in model design. 
</p> </div> </dd> <dt> <a name='item137'>[137]</a> <a href ="/abs/2502.13076" title="Abstract" id="2502.13076"> arXiv:2502.13076 </a> [<a href="/pdf/2502.13076" title="Download PDF" id="pdf-2502.13076" aria-labelledby="pdf-2502.13076">pdf</a>, <a href="https://arxiv.org/html/2502.13076v1" title="View HTML" id="html-2502.13076" aria-labelledby="html-2502.13076" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13076" title="Other formats" id="oth-2502.13076" aria-labelledby="oth-2502.13076">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> KAPPA: A Generic Patent Analysis Framework with Keyphrase-Based Portraits </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+X">Xin Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yujin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+G">Guisheng Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+L">Linning Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chen Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Patent analysis highly relies on concise and interpretable document representations, referred to as patent portraits. Keyphrases, both present and absent, are ideal candidates for patent portraits due to their brevity, representativeness, and clarity. In this paper, we introduce KAPPA, an integrated framework designed to construct keyphrase-based patent portraits and enhance patent analysis. KAPPA operates in two phases: patent portrait construction and portrait-based analysis. 
To ensure effective portrait construction, we propose a semantic-calibrated keyphrase generation paradigm that integrates pre-trained language models with a prompt-based hierarchical decoding strategy to leverage the multi-level structural characteristics of patents. For portrait-based analysis, we develop a comprehensive framework that employs keyphrase-based patent portraits to enable efficient and accurate patent analysis. In extensive experiments on benchmark datasets for keyphrase generation, the proposed model achieves significant improvements compared to state-of-the-art baselines. Further experiments conducted on real-world patent applications demonstrate that our keyphrase-based portraits effectively capture domain-specific knowledge and enrich semantic representation for patent analysis tasks. </p> </div> </dd> <dt> <a name='item138'>[138]</a> <a href ="/abs/2502.13092" title="Abstract" id="2502.13092"> arXiv:2502.13092 </a> [<a href="/pdf/2502.13092" title="Download PDF" id="pdf-2502.13092" aria-labelledby="pdf-2502.13092">pdf</a>, <a href="https://arxiv.org/html/2502.13092v1" title="View HTML" id="html-2502.13092" aria-labelledby="html-2502.13092" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13092" title="Other formats" id="oth-2502.13092" aria-labelledby="oth-2502.13092">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Text2World: Benchmarking Large Language Models for Symbolic World Model Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Mengkang Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianxing Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Y">Yude Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+Y">Yuheng Lei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiguang Chen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+W">Wenqi Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+P">Ping Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project page: <a href="https://text-to-world.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recently, there has been growing interest in leveraging large language models (LLMs) to generate symbolic world models from textual descriptions. Although LLMs have been extensively explored in the context of world modeling, prior studies encountered several challenges, including evaluation randomness, dependence on indirect metrics, and a limited domain scope. To address these limitations, we introduce a novel benchmark, Text2World, based on planning domain definition language (PDDL), featuring hundreds of diverse domains and employing multi-criteria, execution-based metrics for a more robust evaluation. We benchmark current LLMs using Text2World and find that reasoning models trained with large-scale reinforcement learning outperform others. However, even the best-performing model still demonstrates limited capabilities in world modeling. Building on these insights, we examine several promising strategies to enhance the world modeling capabilities of LLMs, including test-time scaling, agent training, and more. We hope that Text2World can serve as a crucial resource, laying the groundwork for future research in leveraging LLMs as world models. 
The project page is available at <a href="https://text-to-world.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item139'>[139]</a> <a href ="/abs/2502.13108" title="Abstract" id="2502.13108"> arXiv:2502.13108 </a> [<a href="/pdf/2502.13108" title="Download PDF" id="pdf-2502.13108" aria-labelledby="pdf-2502.13108">pdf</a>, <a href="https://arxiv.org/html/2502.13108v1" title="View HTML" id="html-2502.13108" aria-labelledby="html-2502.13108" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13108" title="Other formats" id="oth-2502.13108" aria-labelledby="oth-2502.13108">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Clinical Question Answering with Multi-Task Learning: A Joint Approach for Answer Extraction and Medical Categorization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pattnayak,+P">Priyaranjan Pattnayak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patel,+H+L">Hitesh Laxmichand Patel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+A">Amit Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+B">Bhargava Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panda,+S">Srikant Panda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+T">Tejaswini Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Clinical Question Answering (CQA) plays a crucial role in medical decision-making, enabling physicians to extract relevant information from Electronic Medical Records (EMRs). 
While transformer-based models such as BERT, BioBERT, and ClinicalBERT have demonstrated state-of-the-art performance in CQA, existing models lack the ability to categorize extracted answers, which is critical for structured retrieval, content filtering, and medical decision support. <br>To address this limitation, we introduce a Multi-Task Learning (MTL) framework that jointly trains CQA models for both answer extraction and medical categorization. In addition to predicting answer spans, our model classifies responses into five standardized medical categories: Diagnosis, Medication, Symptoms, Procedure, and Lab Reports. This categorization enables more structured and interpretable outputs, making clinical QA models more useful in real-world healthcare settings. <br>We evaluate our approach on emrQA, a large-scale dataset for medical question answering. Results show that MTL improves F1-score by 2.2% compared to standard fine-tuning, while achieving 90.7% accuracy in answer categorization. These findings suggest that MTL not only enhances CQA performance but also introduces an effective mechanism for categorization and structured medical information retrieval. 
</p> </div> </dd> <dt> <a name='item140'>[140]</a> <a href ="/abs/2502.13114" title="Abstract" id="2502.13114"> arXiv:2502.13114 </a> [<a href="/pdf/2502.13114" title="Download PDF" id="pdf-2502.13114" aria-labelledby="pdf-2502.13114">pdf</a>, <a href="/format/2502.13114" title="Other formats" id="oth-2502.13114" aria-labelledby="oth-2502.13114">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The influence of motion features in temporal perception </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Castillo,+R+I">Rosa Illan Castillo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valenzuela,+J">Javier Valenzuela</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper examines the role of manner-of-motion verbs in shaping subjective temporal perception and emotional resonance. Through four complementary studies, we explore how these verbs influence the conceptualization of time, examining their use in literal and metaphorical (temporal) contexts. Our findings reveal that faster verbs (e.g., fly, zoom) evoke dynamic and engaging temporal experiences, often linked to positive emotions and greater agency. In contrast, slower verbs (e.g., crawl, drag) convey passivity, monotony, and negative emotions, reflecting tedious or constrained experiences of time. These effects are amplified in metaphorical contexts, where manner verbs encode emotional and experiential nuances that transcend their literal meanings. We also find that participants prefer manner verbs over path verbs (e.g., go, pass) in emotionally charged temporal contexts, as manner verbs capture the experiential and emotional qualities of time more effectively. 
These findings highlight the interplay between language, motion, and emotion in shaping temporal perception, offering insights into how linguistic framing influences subjective experiences of time. </p> </div> </dd> <dt> <a name='item141'>[141]</a> <a href ="/abs/2502.13119" title="Abstract" id="2502.13119"> arXiv:2502.13119 </a> [<a href="/pdf/2502.13119" title="Download PDF" id="pdf-2502.13119" aria-labelledby="pdf-2502.13119">pdf</a>, <a href="/format/2502.13119" title="Other formats" id="oth-2502.13119" aria-labelledby="oth-2502.13119">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> STEER-ME: Assessing the Microeconomic Reasoning of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Raman,+N">Narun Raman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lundy,+T">Taylor Lundy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Amin,+T">Thiago Amin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Perla,+J">Jesse Perla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brown,+K">Kevin-Leyton Brown</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 11 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> How should one judge whether a given large language model (LLM) can reliably perform economic reasoning? Most existing LLM benchmarks focus on specific applications and fail to present the model with a rich variety of economic tasks. A notable exception is Raman et al. [2024], who offer an approach for comprehensively benchmarking strategic decision-making; however, this approach fails to address the non-strategic settings prevalent in microeconomics, such as supply-and-demand analysis. 
We address this gap by taxonomizing microeconomic reasoning into $58$ distinct elements, focusing on the logic of supply and demand, each grounded in up to $10$ distinct domains, $5$ perspectives, and $3$ types. The generation of benchmark data across this combinatorial space is powered by a novel LLM-assisted data generation protocol that we dub auto-STEER, which generates a set of questions by adapting handwritten templates to target new domains and perspectives. Because it offers an automated way of generating fresh questions, auto-STEER mitigates the risk that LLMs will be trained to over-fit evaluation benchmarks; we thus hope that it will serve as a useful tool both for evaluating and fine-tuning models for years to come. We demonstrate the usefulness of our benchmark via a case study on $27$ LLMs, ranging from small open-source models to the current state of the art. We examined each model's ability to solve microeconomic problems across our whole taxonomy and present the results across a range of prompting strategies and scoring metrics. 
</p> </div> </dd> <dt> <a name='item142'>[142]</a> <a href ="/abs/2502.13120" title="Abstract" id="2502.13120"> arXiv:2502.13120 </a> [<a href="/pdf/2502.13120" title="Download PDF" id="pdf-2502.13120" aria-labelledby="pdf-2502.13120">pdf</a>, <a href="https://arxiv.org/html/2502.13120v1" title="View HTML" id="html-2502.13120" aria-labelledby="html-2502.13120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13120" title="Other formats" id="oth-2502.13120" aria-labelledby="oth-2502.13120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adapting Psycholinguistic Research for LLMs: Gender-inclusive Language in a Coreference Context </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bartl,+M">Marion Bartl</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Murphy,+T+B">Thomas Brendan Murphy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leavy,+S">Susan Leavy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 7 figures, submitted to ACL 2025 (ARR February 2025 cycle) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Gender-inclusive language is often used with the aim of ensuring that all individuals, regardless of gender, can be associated with certain concepts. While psycholinguistic studies have examined its effects in relation to human cognition, it remains unclear how Large Language Models (LLMs) process gender-inclusive language. 
Given that commercial LLMs are gaining an increasingly strong foothold in everyday applications, it is crucial to examine whether LLMs in fact interpret gender-inclusive language neutrally, because the language they generate has the potential to influence the language of their users. This study examines whether LLM-generated coreferent terms align with a given gender expression or reflect model biases. Adapting psycholinguistic methods from French to English and German, we find that in English, LLMs generally maintain the antecedent's gender but exhibit underlying masculine bias. In German, this bias is much stronger, overriding all tested gender-neutralization strategies. </p> </div> </dd> <dt> <a name='item143'>[143]</a> <a href ="/abs/2502.13124" title="Abstract" id="2502.13124"> arXiv:2502.13124 </a> [<a href="/pdf/2502.13124" title="Download PDF" id="pdf-2502.13124" aria-labelledby="pdf-2502.13124">pdf</a>, <a href="https://arxiv.org/html/2502.13124v1" title="View HTML" id="html-2502.13124" aria-labelledby="html-2502.13124" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13124" title="Other formats" id="oth-2502.13124" aria-labelledby="oth-2502.13124">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NaturalReasoning: Reasoning in the Wild with 2.8M Challenging Questions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+W">Weizhe Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jane Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Song Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Padthe,+K">Karthik Padthe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dong Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kulikov,+I">Ilia Kulikov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+K">Kyunghyun Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yuandong Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weston,+J+E">Jason E Weston</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xian Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Dataset at <a href="https://huggingface.co/datasets/facebook/natural_reasoning" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Scaling reasoning capabilities beyond traditional domains such as math and coding is hindered by the lack of diverse and high-quality questions. To overcome this limitation, we introduce a scalable approach for generating diverse and challenging reasoning questions, accompanied by reference answers. We present NaturalReasoning, a comprehensive dataset comprising 2.8 million questions that span multiple domains, including STEM fields (e.g., Physics, Computer Science), Economics, Social Sciences, and more. We demonstrate the utility of the questions in NaturalReasoning through knowledge distillation experiments which show that NaturalReasoning can effectively elicit and transfer reasoning capabilities from a strong teacher model. Furthermore, we demonstrate that NaturalReasoning is also effective for unsupervised self-training using external reward models or self-rewarding. 
</p> </div> </dd> <dt> <a name='item144'>[144]</a> <a href ="/abs/2502.13125" title="Abstract" id="2502.13125"> arXiv:2502.13125 </a> [<a href="/pdf/2502.13125" title="Download PDF" id="pdf-2502.13125" aria-labelledby="pdf-2502.13125">pdf</a>, <a href="https://arxiv.org/html/2502.13125v1" title="View HTML" id="html-2502.13125" aria-labelledby="html-2502.13125" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13125" title="Other formats" id="oth-2502.13125" aria-labelledby="oth-2502.13125">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RuozhiBench: Evaluating LLMs with Logical Fallacies and Misleading Premises </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhai,+Z">Zenan Zhai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xudong Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhenxuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yixuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baldwin,+T">Timothy Baldwin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haonan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in large language models (LLMs) have shown that they can answer questions requiring complex reasoning. However, their ability to identify and respond to text containing logical fallacies or deliberately misleading premises remains less studied. 
To address this gap, we introduce RuozhiBench, a bilingual dataset comprising 677 carefully curated questions that contain various forms of deceptive reasoning, meticulously crafted through extensive human effort and expert review. In a comprehensive evaluation of 17 LLMs from 5 Series over RuozhiBench using both open-ended and two-choice formats, we conduct extensive analyses on evaluation protocols and result patterns. Despite their high scores on conventional benchmarks, these models showed limited ability to detect and reason correctly about logical fallacies, with even the best-performing model, Claude-3-haiku, achieving only 62% accuracy compared to human accuracy of more than 90%. </p> </div> </dd> <dt> <a name='item145'>[145]</a> <a href ="/abs/2502.13127" title="Abstract" id="2502.13127"> arXiv:2502.13127 </a> [<a href="/pdf/2502.13127" title="Download PDF" id="pdf-2502.13127" aria-labelledby="pdf-2502.13127">pdf</a>, <a href="https://arxiv.org/html/2502.13127v1" title="View HTML" id="html-2502.13127" aria-labelledby="html-2502.13127" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13127" title="Other formats" id="oth-2502.13127" aria-labelledby="oth-2502.13127">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Facilitating Long Context Understanding via Supervised Chain-of-Thought Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Jingyang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+A">Andy Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+T">Tian Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shenghua He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+H">Hui Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+M">Mei Han</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiebo Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 Pages, 6 Tables, 8 Figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in Large Language Models (LLMs) have enabled them to process increasingly longer sequences, ranging from 2K to 2M tokens and even beyond. However, simply extending the input sequence length does not necessarily lead to effective long-context understanding. In this study, we integrate Chain-of-Thought (CoT) reasoning into LLMs in a supervised manner to facilitate effective long-context understanding. To achieve this, we introduce LongFinanceQA, a synthetic dataset in the financial domain designed to improve long-context reasoning. Unlike existing long-context synthetic data, LongFinanceQA includes intermediate CoT reasoning before the final conclusion, which encourages LLMs to perform explicit reasoning, improving accuracy and interpretability in long-context understanding. To generate synthetic CoT reasoning, we propose Property-driven Agentic Inference (PAI), an agentic framework that simulates human-like reasoning steps, including property extraction, retrieval, and summarization. We evaluate PAI's reasoning capabilities by assessing GPT-4o-mini w/ PAI on the Loong benchmark, outperforming standard GPT-4o-mini by 20.0%. Furthermore, we fine-tune LLaMA-3.1-8B-Instruct on LongFinanceQA, achieving a 24.6% gain on Loong's financial subset. 
</p> </div> </dd> <dt> <a name='item146'>[146]</a> <a href ="/abs/2502.13141" title="Abstract" id="2502.13141"> arXiv:2502.13141 </a> [<a href="/pdf/2502.13141" title="Download PDF" id="pdf-2502.13141" aria-labelledby="pdf-2502.13141">pdf</a>, <a href="https://arxiv.org/html/2502.13141v1" title="View HTML" id="html-2502.13141" aria-labelledby="html-2502.13141" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13141" title="Other formats" id="oth-2502.13141" aria-labelledby="oth-2502.13141">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniGuardian: A Unified Defense for Detecting Prompt Injection, Backdoor Attacks and Adversarial Attacks in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Huawei Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lao,+Y">Yingjie Lao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Geng,+T">Tong Geng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W">Weijie Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 Pages, 8 Figures, 5 Tables, Keywords: Attack Defending, Security, Prompt Injection, Backdoor Attacks, Adversarial Attacks, Prompt Trigger Attacks </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) are vulnerable to attacks like prompt injection, backdoor attacks, and adversarial attacks, which manipulate prompts or models to generate harmful outputs. 
In this paper, departing from traditional deep learning attack paradigms, we explore their intrinsic relationship and collectively term them Prompt Trigger Attacks (PTA). This raises a key question: Can we determine if a prompt is benign or poisoned? To address this, we propose UniGuardian, the first unified defense mechanism designed to detect prompt injection, backdoor attacks, and adversarial attacks in LLMs. Additionally, we introduce a single-forward strategy to optimize the detection pipeline, enabling simultaneous attack detection and text generation within a single forward pass. Our experiments confirm that UniGuardian accurately and efficiently identifies malicious prompts in LLMs. </p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 30 of 30 entries)</h3> <dt> <a name='item147'>[147]</a> <a href ="/abs/2502.12158" title="Abstract" id="2502.12158"> arXiv:2502.12158 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12158" title="Download PDF" id="pdf-2502.12158" aria-labelledby="pdf-2502.12158">pdf</a>, <a href="https://arxiv.org/html/2502.12158v1" title="View HTML" id="html-2502.12158" aria-labelledby="html-2502.12158" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12158" title="Other formats" id="oth-2502.12158" aria-labelledby="oth-2502.12158">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mining Social Determinants of Health for Heart Failure Patient 30-Day Readmission via Large Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+M">Mingchen Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+Y">Youjeong Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xiao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kwak,+H+G">Hyunjung Gloria Kwak</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+C">Carl Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jiaying Lu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computers and Society (cs.CY) </div> <p class='mathjax'> Heart Failure (HF) affects millions of Americans and leads to high readmission rates, posing significant healthcare challenges. While Social Determinants of Health (SDOH) such as socioeconomic status and housing stability play critical roles in health outcomes, they are often underrepresented in structured EHRs and hidden in unstructured clinical notes. This study leverages advanced large language models (LLMs) to extract SDOHs from clinical text and uses logistic regression to analyze their association with HF readmissions. By identifying key SDOHs (e.g. tobacco usage, limited transportation) linked to readmission risk, this work also offers actionable insights for reducing readmissions and improving patient care. 
</p> </div> </dd> <dt> <a name='item148'>[148]</a> <a href ="/abs/2502.12159" title="Abstract" id="2502.12159"> arXiv:2502.12159 </a> (cross-list from physics.soc-ph) [<a href="/pdf/2502.12159" title="Download PDF" id="pdf-2502.12159" aria-labelledby="pdf-2502.12159">pdf</a>, <a href="https://arxiv.org/html/2502.12159v1" title="View HTML" id="html-2502.12159" aria-labelledby="html-2502.12159" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12159" title="Other formats" id="oth-2502.12159" aria-labelledby="oth-2502.12159">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Causal Interpretations in Observational Studies: The Role of Sociocultural Backgrounds and Team Dynamics </div> <div class='list-authors'><a href="https://arxiv.org/search/physics?searchtype=author&query=Wang,+J">Jun Wang</a>, <a href="https://arxiv.org/search/physics?searchtype=author&query=Yu,+B">Bei Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 4 figures, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Physics and Society (physics.soc-ph)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The prevalence of drawing causal conclusions from observational studies has raised concerns about potential exaggeration in science communication. While some believe causal language should only apply to randomized controlled trials, others argue that rigorous methods can justify causal claims in observational studies. Ideally, causal language should align with the strength of the evidence. 
However, through the analysis of over 80,000 observational study abstracts using computational linguistic and regression methods, we found that causal language is more frequently used by less experienced authors, smaller research teams, male last authors, and authors from countries with higher uncertainty avoidance indices. These findings suggest that the use of causal language may be influenced by external factors such as the sociocultural backgrounds of authors and the dynamics of research collaboration. This newly identified link deepens our understanding of how such factors help shape scientific conclusions in causal inference and science communication. </p> </div> </dd> <dt> <a name='item149'>[149]</a> <a href ="/abs/2502.12170" title="Abstract" id="2502.12170"> arXiv:2502.12170 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12170" title="Download PDF" id="pdf-2502.12170" aria-labelledby="pdf-2502.12170">pdf</a>, <a href="https://arxiv.org/html/2502.12170v1" title="View HTML" id="html-2502.12170" aria-labelledby="html-2502.12170" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12170" title="Other formats" id="oth-2502.12170" aria-labelledby="oth-2502.12170">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MUDDFormer: Breaking Residual Bottlenecks in Transformers via Multiway Dynamic Dense Connections </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+D">Da Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Q">Qingye Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shengping Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+X">Xingyuan Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) 
</div> <p class='mathjax'> We propose MUltiway Dynamic Dense (MUDD) connections, a simple yet effective method to address the limitations of residual connections and enhance cross-layer information flow in Transformers. Unlike existing dense connection approaches with static and shared connection weights, MUDD generates connection weights dynamically depending on hidden states at each sequence position and for each decoupled input stream (the query, key, value or residual) of a Transformer block. MUDD connections can be seamlessly integrated into any Transformer architecture to create MUDDFormer. Extensive experiments show that MUDDFormer significantly outperforms Transformers across various model architectures and scales in language modeling, achieving the performance of Transformers trained with 1.8X-2.4X compute. Notably, MUDDPythia-2.8B matches Pythia-6.9B in pretraining ppl and downstream tasks and even rivals Pythia-12B in five-shot settings, while adding only 0.23% parameters and 0.4% computation. Code in JAX and PyTorch and pre-trained models are available at <a href="https://github.com/Caiyun-AI/MUDDFormer" rel="external noopener nofollow" class="link-external link-https">this https URL</a> . 
</p> </div> </dd> <dt> <a name='item150'>[150]</a> <a href ="/abs/2502.12171" title="Abstract" id="2502.12171"> arXiv:2502.12171 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12171" title="Download PDF" id="pdf-2502.12171" aria-labelledby="pdf-2502.12171">pdf</a>, <a href="https://arxiv.org/html/2502.12171v1" title="View HTML" id="html-2502.12171" aria-labelledby="html-2502.12171" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12171" title="Other formats" id="oth-2502.12171" aria-labelledby="oth-2502.12171">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GoRA: Gradient-driven Adaptive Low Rank Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+H">Haonan He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+P">Peng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Y">Yuchen Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Lei Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Low-Rank Adaptation (LoRA) is a crucial method for efficiently fine-tuning pretrained large language models (LLMs), with its performance largely influenced by two key factors: rank and initialization strategy. Numerous LoRA variants have been proposed to enhance its performance by addressing these factors. However, these variants often compromise LoRA's usability or efficiency. 
In this paper, we analyze the fundamental limitations of existing methods and introduce a novel approach, GoRA (Gradient-driven Adaptive Low Rank Adaptation), which adaptively assigns ranks and initializes weights for low-rank adapters simultaneously based on gradient information. Extensive experimental results demonstrate that GoRA significantly improves performance while preserving the high usability and efficiency of LoRA. On the T5 model fine-tuned for the GLUE benchmark, GoRA achieves a 5.88-point improvement over LoRA and slightly surpasses full fine-tuning. Similarly, on the Llama3.1-8B-Base model fine-tuned for GSM8k tasks, GoRA outperforms LoRA with a 5.13-point improvement and exceeds full fine-tuning in high-rank settings by a margin of 2.05 points. </p> </div> </dd> <dt> <a name='item151'>[151]</a> <a href ="/abs/2502.12179" title="Abstract" id="2502.12179"> arXiv:2502.12179 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12179" title="Download PDF" id="pdf-2502.12179" aria-labelledby="pdf-2502.12179">pdf</a>, <a href="https://arxiv.org/html/2502.12179v1" title="View HTML" id="html-2502.12179" aria-labelledby="html-2502.12179" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12179" title="Other formats" id="oth-2502.12179" aria-labelledby="oth-2502.12179">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Identifiable Steering via Sparse Autoencoding of Multi-Concept Shifts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Joshi,+S">Shruti Joshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dittadi,+A">Andrea Dittadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lachapelle,+S">Sébastien Lachapelle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhanya">Dhanya Sridhar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 27 pages, 9 
figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Steering methods manipulate the representations of large language models (LLMs) to induce responses that have desired properties, e.g., truthfulness, offering a promising approach for LLM alignment without the need for fine-tuning. Traditionally, steering has relied on supervision, such as from contrastive pairs of prompts that vary in a single target concept, which is costly to obtain and limits the speed of steering research. An appealing alternative is to use unsupervised approaches such as sparse autoencoders (SAEs) to map LLM embeddings to sparse representations that capture human-interpretable concepts. However, without further assumptions, SAEs may not be identifiable: they could learn latent dimensions that entangle multiple concepts, leading to unintentional steering of unrelated properties. We introduce Sparse Shift Autoencoders (SSAEs) that instead map the differences between embeddings to sparse representations. Crucially, we show that SSAEs are identifiable from paired observations that vary in \textit{multiple unknown concepts}, leading to accurate steering of single concepts without the need for supervision. We empirically demonstrate accurate steering across semi-synthetic and real-world language datasets using Llama-3.1 embeddings. 
</p> </div> </dd> <dt> <a name='item152'>[152]</a> <a href ="/abs/2502.12206" title="Abstract" id="2502.12206"> arXiv:2502.12206 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12206" title="Download PDF" id="pdf-2502.12206" aria-labelledby="pdf-2502.12206">pdf</a>, <a href="https://arxiv.org/html/2502.12206v1" title="View HTML" id="html-2502.12206" aria-labelledby="html-2502.12206" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12206" title="Other formats" id="oth-2502.12206" aria-labelledby="oth-2502.12206">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating the Paperclip Maximizer: Are RL-Based Language Models More Likely to Pursue Instrumental Goals? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yufei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuexin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaying Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sui,+Y">Yuan Sui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yulin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hooi,+B">Bryan Hooi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> As large language models (LLMs) continue to evolve, ensuring their alignment with human goals and values remains a pressing challenge. A key concern is \textit{instrumental convergence}, where an AI system, in optimizing for a given objective, develops unintended intermediate goals that override the ultimate objective and deviate from human-intended goals. 
This issue is particularly relevant in reinforcement learning (RL)-trained models, which can generate creative but unintended strategies to maximize rewards. In this paper, we explore instrumental convergence in LLMs by comparing models trained with direct RL optimization (e.g., the o1 model) to those trained with reinforcement learning from human feedback (RLHF). We hypothesize that RL-driven models exhibit a stronger tendency for instrumental convergence due to their optimization of goal-directed behavior in ways that may misalign with human intentions. To assess this, we introduce InstrumentalEval, a benchmark for evaluating instrumental convergence in RL-trained LLMs. Initial experiments reveal cases where a model tasked with making money unexpectedly pursues instrumental objectives, such as self-replication, implying signs of instrumental convergence. Our findings contribute to a deeper understanding of alignment challenges in AI systems and the risks posed by unintended model behaviors. 
</p> </div> </dd> <dt> <a name='item153'>[153]</a> <a href ="/abs/2502.12216" title="Abstract" id="2502.12216"> arXiv:2502.12216 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12216" title="Download PDF" id="pdf-2502.12216" aria-labelledby="pdf-2502.12216">pdf</a>, <a href="https://arxiv.org/html/2502.12216v1" title="View HTML" id="html-2502.12216" aria-labelledby="html-2502.12216" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12216" title="Other formats" id="oth-2502.12216" aria-labelledby="oth-2502.12216">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Tactic: Adaptive Sparse Attention with Clustering and Distribution Fitting for Long-Context LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+K">Kan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+T">Tian Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qinyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+Y">Yile Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Zhichen Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kadekodi,+R">Rohan Kadekodi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liangyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+A">Ang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishnamurthy,+A">Arvind Krishnamurthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kasikci,+B">Baris Kasikci</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Long-context models are essential for many applications but face inefficiencies in loading large KV caches 
during decoding. Prior methods enforce fixed token budgets for sparse attention, assuming a set number of tokens can approximate full attention. However, these methods overlook variations in the importance of attention across heads, layers, and contexts. To address these limitations, we propose Tactic, a sparsity-adaptive and calibration-free sparse attention mechanism that dynamically selects tokens based on their cumulative attention scores rather than a fixed token budget. By setting a target fraction of total attention scores, Tactic ensures that token selection naturally adapts to variations in attention sparsity. To efficiently approximate this selection, Tactic leverages clustering-based sorting and distribution fitting, allowing it to accurately estimate token importance with minimal computational overhead. We show that Tactic outperforms existing sparse attention algorithms, achieving superior accuracy and up to 7.29x decode attention speedup. This improvement translates to an overall 1.58x end-to-end inference speedup, making Tactic a practical and effective solution for long-context LLM inference in accuracy-sensitive applications. 
</p> </div> </dd> <dt> <a name='item154'>[154]</a> <a href ="/abs/2502.12217" title="Abstract" id="2502.12217"> arXiv:2502.12217 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12217" title="Download PDF" id="pdf-2502.12217" aria-labelledby="pdf-2502.12217">pdf</a>, <a href="https://arxiv.org/html/2502.12217v1" title="View HTML" id="html-2502.12217" aria-labelledby="html-2502.12217" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12217" title="Other formats" id="oth-2502.12217" aria-labelledby="oth-2502.12217">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Optimal Brain Iterative Merging: Mitigating Interference in LLM Merging </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhixiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Z">Zhenyu Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+Y">Yixuan Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yunfang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Biye Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive capabilities, but their high computational costs pose challenges for customization. Model merging offers a cost-effective alternative, yet existing methods suffer from interference among parameters, leading to performance degradation. In this work, we propose Optimal Brain Iterative Merging (OBIM), a novel method designed to mitigate both intra-model and inter-model interference. 
OBIM consists of two key components: (1) A saliency measurement mechanism that evaluates parameter importance based on loss changes induced by individual weight alterations, reducing intra-model interference by preserving only high-saliency parameters. (2) A mutually exclusive iterative merging framework, which incrementally integrates models using a binary mask to avoid direct parameter averaging, thereby mitigating inter-model interference. We validate OBIM through experiments on both Supervised Fine-Tuned (SFT) models and post-pretrained checkpoints. The results show that OBIM significantly outperforms existing merging techniques. Overall, OBIM provides an effective and practical solution for enhancing LLM merging. </p> </div> </dd> <dt> <a name='item155'>[155]</a> <a href ="/abs/2502.12272" title="Abstract" id="2502.12272"> arXiv:2502.12272 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12272" title="Download PDF" id="pdf-2502.12272" aria-labelledby="pdf-2502.12272">pdf</a>, <a href="/format/2502.12272" title="Other formats" id="oth-2502.12272" aria-labelledby="oth-2502.12272">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Reason at the Frontier of Learnability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Foster,+T">Thomas Foster</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Foerster,+J">Jakob Foerster</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Reinforcement learning is now widely adopted as the final stage of large language model training, especially for reasoning-style tasks such as maths problems. 
Typically, models attempt each question many times during a single training step and attempt to learn from their successes and failures. However, we demonstrate that throughout training with two popular algorithms (PPO and VinePPO) on two widely used datasets, many questions are either solved by all attempts - meaning they are already learned - or by none - providing no meaningful training signal. To address this, we adapt a method from the reinforcement learning literature - sampling for learnability - and apply it to the reinforcement learning stage of LLM training. Our curriculum prioritises questions with high variance of success, i.e. those where the agent sometimes succeeds, but not always. Our findings demonstrate that this curriculum consistently boosts training performance across multiple algorithms and datasets, paving the way for more efficient and effective reinforcement learning in LLMs. </p> </div> </dd> <dt> <a name='item156'>[156]</a> <a href ="/abs/2502.12275" title="Abstract" id="2502.12275"> arXiv:2502.12275 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12275" title="Download PDF" id="pdf-2502.12275" aria-labelledby="pdf-2502.12275">pdf</a>, <a href="https://arxiv.org/html/2502.12275v1" title="View HTML" id="html-2502.12275" aria-labelledby="html-2502.12275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12275" title="Other formats" id="oth-2502.12275" aria-labelledby="oth-2502.12275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Integrating Expert Knowledge into Logical Programs via LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=G%C3%B3rski,+F">Franciszek Górski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wysocki,+O">Oskar Wysocki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">Andre Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Multiagent Systems (cs.MA) </div> <p class='mathjax'> This paper introduces ExKLoP, a novel framework designed to evaluate how effectively Large Language Models (LLMs) integrate expert knowledge into logical reasoning systems. This capability is especially valuable in engineering, where expert knowledge—such as manufacturer-recommended operational ranges—can be directly embedded into automated monitoring systems. By mirroring expert verification steps, tasks like range checking and constraint validation help ensure system safety and reliability. Our approach systematically evaluates LLM-generated logical rules, assessing both syntactic fluency and logical correctness in these critical validation tasks. We also explore the model's capacity for self-correction via an iterative feedback loop based on code execution outcomes. ExKLoP presents an extensible dataset comprising 130 engineering premises, 950 prompts, and corresponding validation points. It enables comprehensive benchmarking while allowing control over task complexity and scalability of experiments. We leverage the synthetic data creation methodology to conduct extensive empirical evaluation on a diverse set of LLMs including Llama3, Gemma, Mixtral, Mistral, and Qwen. Results reveal that while models generate nearly perfect syntactically correct code, they frequently exhibit logical errors in translating expert knowledge. Furthermore, iterative self-correction yields only marginal improvements (up to 3%). Overall, ExKLoP serves as a robust evaluation platform that streamlines the selection of effective models for self-correcting systems while clearly delineating the types of errors encountered. 
The complete implementation, along with all relevant data, is available at GitHub. </p> </div> </dd> <dt> <a name='item157'>[157]</a> <a href ="/abs/2502.12292" title="Abstract" id="2502.12292"> arXiv:2502.12292 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12292" title="Download PDF" id="pdf-2502.12292" aria-labelledby="pdf-2502.12292">pdf</a>, <a href="/format/2502.12292" title="Other formats" id="oth-2502.12292" aria-labelledby="oth-2502.12292">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Independence Tests for Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Sally Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+A">Ahmed Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuditipudi,+R">Rohith Kuditipudi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+P">Percy Liang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We consider the following problem: given the weights of two models, can we test whether they were trained independently -- i.e., from independent random initializations? We consider two settings: constrained and unconstrained. In the constrained setting, we make assumptions about model architecture and training and propose a family of statistical tests that yield exact p-values with respect to the null hypothesis that the models are trained from independent random initializations. These p-values are valid regardless of the composition of either model's training data; we compute them by simulating exchangeable copies of each model under our assumptions and comparing various similarity measures of weights and activations between the original two models versus these copies. 
We report the p-values from these tests on pairs of 21 open-weight models (210 total pairs) and correctly identify all pairs of non-independent models. Our tests remain effective even if one model was fine-tuned for many tokens. In the unconstrained setting, where we make no assumptions about training procedures, can change model architecture, and allow for adversarial evasion attacks, the previous tests no longer work. Instead, we propose a new test which matches hidden activations between two models, and which is robust to adversarial transformations and to changes in model architecture. The test can also do localized testing: identifying specific non-independent components of models. Though we no longer obtain exact p-values from this, empirically we find it behaves as one and reliably identifies non-independent models. Notably, we can use the test to identify specific parts of one model that are derived from another (e.g., how Llama 3.1-8B was pruned to initialize Llama 3.2-3B, or shared layers between Mistral-7B and StripedHyena-7B), and it is even robust to retraining individual layers of either model from scratch. 
</p> </div> </dd> <dt> <a name='item158'>[158]</a> <a href ="/abs/2502.12435" title="Abstract" id="2502.12435"> arXiv:2502.12435 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12435" title="Download PDF" id="pdf-2502.12435" aria-labelledby="pdf-2502.12435">pdf</a>, <a href="https://arxiv.org/html/2502.12435v1" title="View HTML" id="html-2502.12435" aria-labelledby="html-2502.12435" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12435" title="Other formats" id="oth-2502.12435" aria-labelledby="oth-2502.12435">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey on Large Language Models for Automated Planning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Aghzal,+M">Mohamed Aghzal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plaku,+E">Erion Plaku</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stein,+G+J">Gregory J. Stein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Z">Ziyu Yao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The planning ability of Large Language Models (LLMs) has garnered increasing attention in recent years due to their remarkable capacity for multi-step reasoning and their ability to generalize across a wide range of domains. While some researchers emphasize the potential of LLMs to perform complex planning tasks, others highlight significant limitations in their performance, particularly when these models are tasked with handling the intricacies of long-horizon reasoning. In this survey, we critically investigate existing research on the use of LLMs in automated planning, examining both their successes and shortcomings in detail. 
We illustrate that although LLMs are not well-suited to serve as standalone planners because of these limitations, they nonetheless present an enormous opportunity to enhance planning applications when combined with other approaches. Thus, we advocate for a balanced methodology that leverages the inherent flexibility and generalized knowledge of LLMs alongside the rigor and cost-effectiveness of traditional planning methods. </p> </div> </dd> <dt> <a name='item159'>[159]</a> <a href ="/abs/2502.12442" title="Abstract" id="2502.12442"> arXiv:2502.12442 </a> (cross-list from cs.IR) [<a href="/pdf/2502.12442" title="Download PDF" id="pdf-2502.12442" aria-labelledby="pdf-2502.12442">pdf</a>, <a href="https://arxiv.org/html/2502.12442v1" title="View HTML" id="html-2502.12442" aria-labelledby="html-2502.12442" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12442" title="Other formats" id="oth-2502.12442" aria-labelledby="oth-2502.12442">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HopRAG: Multi-Hop Reasoning for Logic-Aware Retrieval-Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhengren Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhiyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+F">Feiyu Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Q">Qinhan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wentao Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> 
Retrieval-Augmented Generation (RAG) systems often struggle with imperfect retrieval, as traditional retrievers focus on lexical or semantic similarity rather than logical relevance. To address this, we propose HopRAG, a novel RAG framework that augments retrieval with logical reasoning through graph-structured knowledge exploration. During indexing, HopRAG constructs a passage graph, with text chunks as vertices and logical connections established via LLM-generated pseudo-queries as edges. During retrieval, it employs a retrieve-reason-prune mechanism: starting with lexically or semantically similar passages, the system explores multi-hop neighbors guided by pseudo-queries and LLM reasoning to identify truly relevant ones. Extensive experiments demonstrate HopRAG's superiority, achieving 76.78% higher answer accuracy and 65.07% improved retrieval F1 score compared to conventional methods. The repository is available at <a href="https://github.com/LIU-Hao-2002/HopRAG" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item160'>[160]</a> <a href ="/abs/2502.12466" title="Abstract" id="2502.12466"> arXiv:2502.12466 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12466" title="Download PDF" id="pdf-2502.12466" aria-labelledby="pdf-2502.12466">pdf</a>, <a href="https://arxiv.org/html/2502.12466v1" title="View HTML" id="html-2502.12466" aria-labelledby="html-2502.12466" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12466" title="Other formats" id="oth-2502.12466" aria-labelledby="oth-2502.12466">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EquiBench: Benchmarking Code Reasoning Capabilities of Large Language Models via Equivalence Checking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+A">Anjiang Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jiannan Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ran Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hongyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuhui Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yaofeng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teixeira,+T+S+F+X">Thiago S. F. X. 
Teixeira</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+D">Diyi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Ke Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aiken,+A">Alex Aiken</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Programming Languages (cs.PL); Software Engineering (cs.SE) </div> <p class='mathjax'> Equivalence checking, i.e., determining whether two programs produce identical outputs for all possible inputs, underpins a broad range of applications, including software refactoring, testing, and optimization. We present the task of equivalence checking as a new way to evaluate the code reasoning abilities of large language models (LLMs). We introduce EquiBench, a dataset of 2400 program pairs spanning four programming languages and six equivalence categories. These pairs are systematically generated through program analysis, compiler scheduling, and superoptimization, covering nontrivial structural transformations that demand deep semantic reasoning beyond simple syntactic variations. Our evaluation of 17 state-of-the-art LLMs shows that OpenAI o3-mini achieves the highest overall accuracy of 78.0%. In the most challenging categories, the best accuracies are 62.3% and 68.8%, only modestly above the 50% random baseline for binary classification, indicating significant room for improvement in current models' code reasoning capabilities. 
</p> </div> </dd> <dt> <a name='item161'>[161]</a> <a href ="/abs/2502.12561" title="Abstract" id="2502.12561"> arXiv:2502.12561 </a> (cross-list from cs.HC) [<a href="/pdf/2502.12561" title="Download PDF" id="pdf-2502.12561" aria-labelledby="pdf-2502.12561">pdf</a>, <a href="/format/2502.12561" title="Other formats" id="oth-2502.12561" aria-labelledby="oth-2502.12561">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UXAgent: An LLM Agent-Based Usability Testing Framework for Web Design </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yuxuan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+B">Bingsheng Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+H">Hansu Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jing Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jessie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Laurence Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gesi,+J">Jiri Gesi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Q">Qi He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T+J">Toby Jia-Jun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dakuo Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Usability testing is a fundamental yet challenging (e.g., inflexible to iterate the study design flaws and hard to recruit study participants) research method for user experience (UX) researchers to evaluate a web design. 
Recent advances in Large Language Model-simulated Agent (LLM-Agent) research inspired us to design UXAgent to support UX researchers in evaluating and reiterating their usability testing study design before they conduct the real human subject study. Our system features an LLM-Agent module and a universal browser connector module so that UX researchers can automatically generate thousands of simulated users to test the target website. The results are shown in qualitative (e.g., interviewing how an agent thinks ), quantitative (e.g., # of actions), and video recording formats for UX researchers to analyze. Through a heuristic user evaluation with five UX researchers, participants praised the innovation of our system but also expressed concerns about the future of LLM Agent-assisted UX study. </p> </div> </dd> <dt> <a name='item162'>[162]</a> <a href ="/abs/2502.12586" title="Abstract" id="2502.12586"> arXiv:2502.12586 </a> (cross-list from cs.IR) [<a href="/pdf/2502.12586" title="Download PDF" id="pdf-2502.12586" aria-labelledby="pdf-2502.12586">pdf</a>, <a href="https://arxiv.org/html/2502.12586v1" title="View HTML" id="html-2502.12586" aria-labelledby="html-2502.12586" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12586" title="Other formats" id="oth-2502.12586" aria-labelledby="oth-2502.12586">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> G-Refer: Graph Retrieval-Augmented Large Language Model for Explainable Recommendation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuhan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xinni Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+L">Linhao Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+H">Heng Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Y">Yuxiang 
Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jia Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by WWW 2025, research track </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Explainable recommendation has demonstrated significant advantages in informing users about the logic behind recommendations, thereby increasing system transparency, effectiveness, and trustworthiness. To provide personalized and interpretable explanations, existing works often combine the generation capabilities of large language models (LLMs) with collaborative filtering (CF) information. CF information extracted from the user-item interaction graph captures the user behaviors and preferences, which is crucial for providing informative explanations. However, due to the complexity of graph structure, effectively extracting the CF information from graphs still remains a challenge. Moreover, existing methods often struggle with the integration of extracted CF information with LLMs due to its implicit representation and the modality gap between graph structures and natural language explanations. To address these challenges, we propose G-Refer, a framework using graph retrieval-augmented large language models (LLMs) for explainable recommendation. Specifically, we first employ a hybrid graph retrieval mechanism to retrieve explicit CF signals from both structural and semantic perspectives. The retrieved CF information is explicitly formulated as human-understandable text by the proposed graph translation and accounts for the explanations generated by LLMs. 
To bridge the modality gap, we introduce knowledge pruning and retrieval-augmented fine-tuning to enhance the ability of LLMs to process and utilize the retrieved CF information to generate explanations. Extensive experiments show that G-Refer achieves superior performance compared with existing methods in both explainability and stability. Codes and data are available at <a href="https://github.com/Yuhan1i/G-Refer" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item163'>[163]</a> <a href ="/abs/2502.12591" title="Abstract" id="2502.12591"> arXiv:2502.12591 </a> (cross-list from cs.CV) [<a href="/pdf/2502.12591" title="Download PDF" id="pdf-2502.12591" aria-labelledby="pdf-2502.12591">pdf</a>, <a href="https://arxiv.org/html/2502.12591v1" title="View HTML" id="html-2502.12591" aria-labelledby="html-2502.12591" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12591" title="Other formats" id="oth-2502.12591" aria-labelledby="oth-2502.12591">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CutPaste&Find: Efficient Multimodal Hallucination Detector with Visual-aid Knowledge Base </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+C">Cong-Duy Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xiaobao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vu,+D+A">Duc Anh Vu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shuai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+T">Thong Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luu,+A+T">Anh Tuan Luu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language 
(cs.CL) </div> <p class='mathjax'> Large Vision-Language Models (LVLMs) have demonstrated impressive multimodal reasoning capabilities, but they remain susceptible to hallucination, particularly object hallucination where non-existent objects or incorrect attributes are fabricated in generated descriptions. Existing detection methods achieve strong performance but rely heavily on expensive API calls and iterative LVLM-based validation, making them impractical for large-scale or offline use. To address these limitations, we propose CutPaste&amp;Find, a lightweight and training-free framework for detecting hallucinations in LVLM-generated outputs. Our approach leverages off-the-shelf visual and linguistic modules to perform multi-step verification efficiently without requiring LVLM inference. At the core of our framework is a Visual-aid Knowledge Base that encodes rich entity-attribute relationships and associated image representations. We introduce a scaling factor to refine similarity scores, mitigating the issue of suboptimal alignment values even for ground-truth image-text pairs. Comprehensive evaluations on benchmark datasets, including POPE and R-Bench, demonstrate that CutPaste&amp;Find achieves competitive hallucination detection performance while being significantly more efficient and cost-effective than previous methods. 
</p> </div> </dd> <dt> <a name='item164'>[164]</a> <a href ="/abs/2502.12623" title="Abstract" id="2502.12623"> arXiv:2502.12623 </a> (cross-list from cs.SD) [<a href="/pdf/2502.12623" title="Download PDF" id="pdf-2502.12623" aria-labelledby="pdf-2502.12623">pdf</a>, <a href="https://arxiv.org/html/2502.12623v1" title="View HTML" id="html-2502.12623" aria-labelledby="html-2502.12623" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12623" title="Other formats" id="oth-2502.12623" aria-labelledby="oth-2502.12623">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DeepResonance: Enhancing Multimodal Music Understanding via Music-centric Multi-way Instruction Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Z">Zhuoyuan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+M">Mengjie Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wakaki,+H">Hiromi Wakaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mitsufuji,+Y">Yuki Mitsufuji</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Recent advancements in music large language models (LLMs) have significantly improved music understanding tasks, which involve the model's ability to analyze and interpret various musical elements. These improvements primarily focused on integrating both music and text inputs. However, the potential of incorporating additional modalities such as images, videos and textual music features to enhance music understanding remains unexplored. 
To bridge this gap, we propose DeepResonance, a multimodal music understanding LLM fine-tuned via multi-way instruction tuning with multi-way aligned music, text, image, and video data. To this end, we construct Music4way-MI2T, Music4way-MV2T, and Music4way-Any2T, three 4-way training and evaluation datasets designed to enable DeepResonance to integrate both visual and textual music feature content. We also introduce multi-sampled ImageBind embeddings and a pre-alignment Transformer to enhance modality fusion prior to input into text LLMs, tailoring DeepResonance for multi-way instruction tuning. Our model achieves state-of-the-art performances across six music understanding tasks, highlighting the benefits of the auxiliary modalities and the structural superiority of DeepResonance. We plan to open-source the models and the newly constructed datasets. </p> </div> </dd> <dt> <a name='item165'>[165]</a> <a href ="/abs/2502.12678" title="Abstract" id="2502.12678"> arXiv:2502.12678 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12678" title="Download PDF" id="pdf-2502.12678" aria-labelledby="pdf-2502.12678">pdf</a>, <a href="https://arxiv.org/html/2502.12678v1" title="View HTML" id="html-2502.12678" aria-labelledby="html-2502.12678" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12678" title="Other formats" id="oth-2502.12678" aria-labelledby="oth-2502.12678">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Step Alignment as Markov Games: An Optimistic Online Gradient Descent Approach with Convergence Guarantees </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yongtao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Viano,+L">Luca Viano</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yihang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zhenyu 
Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Antonakopoulos,+K">Kimon Antonakopoulos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+Q">Quanquan Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cevher,+V">Volkan Cevher</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted as oral presentation in NeurIPS LanGame Workshop, revised from ICLR submission </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Reinforcement Learning from Human Feedback (RLHF) has been highly successful in aligning large language models with human preferences. While prevalent methods like DPO have demonstrated strong performance, they frame interactions with the language model as a bandit problem, which limits their applicability in real-world scenarios where multi-turn conversations are common. Additionally, DPO relies on the Bradley-Terry model assumption, which does not adequately capture the non-transitive nature of human preferences. In this paper, we address these challenges by modeling the alignment problem as a two-player constant-sum Markov game, where each player seeks to maximize their winning rate against the other across all steps of the conversation. Our approach Multi-step Preference Optimization (MPO) is built upon the natural actor-critic framework~\citep{peters2008natural}. We further develop OMPO based on the optimistic online gradient descent algorithm~\citep{rakhlin2013online,joulani17a}. Theoretically, we provide a rigorous analysis for both algorithms on convergence and show that OMPO requires $\mathcal{O}(\epsilon^{-1})$ policy updates to converge to an $\epsilon$-approximate Nash equilibrium. 
We also validate the effectiveness of our method on multi-turn conversations dataset and math reasoning dataset. </p> </div> </dd> <dt> <a name='item166'>[166]</a> <a href ="/abs/2502.12734" title="Abstract" id="2502.12734"> arXiv:2502.12734 </a> (cross-list from cs.CR) [<a href="/pdf/2502.12734" title="Download PDF" id="pdf-2502.12734" aria-labelledby="pdf-2502.12734">pdf</a>, <a href="https://arxiv.org/html/2502.12734v1" title="View HTML" id="html-2502.12734" aria-labelledby="html-2502.12734" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12734" title="Other formats" id="oth-2502.12734" aria-labelledby="oth-2502.12734">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Iron Sharpens Iron: Defending Against Attacks in Machine-Generated Text Detection with Adversarial Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuanfan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhaohan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chengzhengxu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+C">Chao Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoming Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ACL 2025, Preprint, Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Machine-generated Text (MGT) detection is crucial for regulating and attributing online texts. While the existing MGT detectors achieve strong performance, they remain vulnerable to simple perturbations and adversarial attacks. 
To build an effective defense against malicious perturbations, we view MGT detection from a threat modeling perspective, that is, analyzing the model's vulnerability from an adversary's point of view and exploring effective mitigations. To this end, we introduce an adversarial framework for training a robust MGT detector, named GREedy Adversary PromoTed DefendER (GREATER). The GREATER consists of two key components: an adversary GREATER-A and a detector GREATER-D. The GREATER-D learns to defend against the adversarial attack from GREATER-A and generalizes the defense to other attacks. GREATER-A identifies and perturbs the critical tokens in embedding space, along with greedy search and pruning to generate stealthy and disruptive adversarial examples. Besides, we update the GREATER-A and GREATER-D synchronously, encouraging the GREATER-D to generalize its defense to different attacks and varying attack intensities. Our experimental results across 9 text perturbation strategies and 5 adversarial attacks show that our GREATER-D reduces the Attack Success Rate (ASR) by 10.61% compared with SOTA defense methods while our GREATER-A is demonstrated to be more effective and efficient than SOTA attack approaches. </p> </div> </dd> <dt> <a name='item167'>[167]</a> <a href ="/abs/2502.12838" title="Abstract" id="2502.12838"> arXiv:2502.12838 </a> (cross-list from cs.CY) [<a href="/pdf/2502.12838" title="Download PDF" id="pdf-2502.12838" aria-labelledby="pdf-2502.12838">pdf</a>, <a href="/format/2502.12838" title="Other formats" id="oth-2502.12838" aria-labelledby="oth-2502.12838">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Equitable AI: Detecting Bias in Using Large Language Models for Marketing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yilmaz,+B">Berk Yilmaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ashqar,+H+I">Huthaifa I. 
Ashqar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The recent advances in large language models (LLMs) have revolutionized industries such as finance, marketing, and customer service by enabling sophisticated natural language processing tasks. However, the broad adoption of LLMs brings significant challenges, particularly in the form of social biases that can be embedded within their outputs. Biases related to gender, age, and other sensitive attributes can lead to unfair treatment, raising ethical concerns and risking both company reputation and customer trust. This study examined bias in finance-related marketing slogans generated by LLMs (i.e., ChatGPT) by prompting tailored ads targeting five demographic categories: gender, marital status, age, income level, and education level. A total of 1,700 slogans were generated for 17 unique demographic groups, and key terms were categorized into four thematic groups: empowerment, financial, benefits and features, and personalization. Bias was systematically assessed using relative bias calculations and statistically tested with the Kolmogorov-Smirnov (KS) test against general slogans generated for any individual. Results revealed that marketing slogans are not neutral; rather, they emphasize different themes based on demographic factors. Women, younger individuals, low-income earners, and those with lower education levels receive more distinct messaging compared to older, higher-income, and highly educated individuals. This underscores the need to consider demographic-based biases in AI-generated marketing strategies and their broader societal implications. The findings of this study provide a roadmap for developing more equitable AI systems, highlighting the need for ongoing bias detection and mitigation efforts in LLMs. 
</p> </div> </dd> <dt> <a name='item168'>[168]</a> <a href ="/abs/2502.12913" title="Abstract" id="2502.12913"> arXiv:2502.12913 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12913" title="Download PDF" id="pdf-2502.12913" aria-labelledby="pdf-2502.12913">pdf</a>, <a href="/format/2502.12913" title="Other formats" id="oth-2502.12913" aria-labelledby="oth-2502.12913">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GSQ-Tuning: Group-Shared Exponents Integer in Fully Quantized Training for LLMs On-Device Fine-tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Sifan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Z">Zhihang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+M">Mingjia Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+Y">Yuzhang Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+D">Dawei Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) fine-tuning technologies have achieved remarkable results. However, traditional LLM fine-tuning approaches face significant challenges: they require large Floating Point (FP) computation, raising privacy concerns when handling sensitive data, and are impractical for resource-constrained edge devices. While Parameter-Efficient Fine-Tuning (PEFT) techniques reduce trainable parameters, their reliance on floating-point arithmetic creates fundamental incompatibilities with edge hardware. 
In this work, we introduce a novel framework for on-device LLM fine-tuning that eliminates the need for floating-point operations in both inference and training, named GSQ-Tuning. At its core is the Group-Shared Exponents Integer format, which efficiently represents model parameters in integer format using shared exponents among parameter groups. When combined with LoRA-like adapters, this enables fully integer-based fine-tuning that is both memory and compute efficient. We demonstrate that our approach achieves accuracy comparable to FP16-based fine-tuning while significantly reducing memory usage (50%). Moreover, compared to FP8, our method can reduce 5x power consumption and 11x chip area with same performance, making large-scale model adaptation feasible on edge devices. </p> </div> </dd> <dt> <a name='item169'>[169]</a> <a href ="/abs/2502.12929" title="Abstract" id="2502.12929"> arXiv:2502.12929 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12929" title="Download PDF" id="pdf-2502.12929" aria-labelledby="pdf-2502.12929">pdf</a>, <a href="/format/2502.12929" title="Other formats" id="oth-2502.12929" aria-labelledby="oth-2502.12929">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Flow-of-Options: Diversified and Improved LLM Reasoning by Thinking Through Options </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nair,+L">Lakshmi Nair</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Trase,+I">Ian Trase</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Mark Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Github code: <a href="https://github.com/flagshippioneering/Flow-of-Options" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> We present a novel reasoning approach called Flow-of-Options (FoO), designed to address intrinsic biases in Large Language Models (LLMs). FoO enables LLMs to systematically explore a diverse range of possibilities in their reasoning, as demonstrated by an FoO-based agentic system for autonomously solving Machine Learning tasks (AutoML). Our framework outperforms state-of-the-art baselines, achieving improvements of 38.2% - 69.2% on standard data science tasks, and 37.4% - 47.9% on therapeutic chemistry tasks. With an overall operation cost under $1 per task, our framework is well-suited for cost-sensitive applications. Beyond classification and regression, we illustrate the broader applicability of our FoO-based agentic system to tasks such as reinforcement learning and image generation. Our framework presents significant advancements compared to current state-of-the-art agentic systems for AutoML, due to the benefits of FoO in enforcing diversity in LLM solutions through compressed, explainable representations that also support long-term memory when combined with case-based reasoning. 
</p> </div> </dd> <dt> <a name='item170'>[170]</a> <a href ="/abs/2502.12961" title="Abstract" id="2502.12961"> arXiv:2502.12961 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12961" title="Download PDF" id="pdf-2502.12961" aria-labelledby="pdf-2502.12961">pdf</a>, <a href="https://arxiv.org/html/2502.12961v1" title="View HTML" id="html-2502.12961" aria-labelledby="html-2502.12961" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12961" title="Other formats" id="oth-2502.12961" aria-labelledby="oth-2502.12961">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adaptive Tool Use in Large Language Models with Meta-Cognition Trigger </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenjun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dexun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+K">Kuicai Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Cong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Weiwen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruiming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yong Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable emergent capabilities, transforming the execution of functional tasks by leveraging external tools for complex problems that require specialized processing or real-time data. 
While existing research expands LLMs' access to diverse tools (e.g., program interpreters, search engines, weather/map apps), the necessity of using these tools is often overlooked, leading to indiscriminate tool invocation. This naive approach raises two key issues: (1) increased delays due to unnecessary tool calls, and (2) potential errors resulting from faulty interactions with external tools. In this paper, we introduce meta-cognition as a proxy for LLMs' self-assessment of their capabilities, representing the model's awareness of its own limitations. Based on this, we propose MeCo, an adaptive decision-making strategy for external tool use. MeCo quantifies metacognitive scores by capturing high-level cognitive signals in the representation space, guiding when to invoke tools. Notably, MeCo is fine-tuning-free and incurs minimal cost. Our experiments show that MeCo accurately detects LLMs' internal cognitive signals and significantly improves tool-use decision-making across multiple base models and benchmarks. 
</p> </div> </dd> <dt> <a name='item171'>[171]</a> <a href ="/abs/2502.13001" title="Abstract" id="2502.13001"> arXiv:2502.13001 </a> (cross-list from cs.AI) [<a href="/pdf/2502.13001" title="Download PDF" id="pdf-2502.13001" aria-labelledby="pdf-2502.13001">pdf</a>, <a href="https://arxiv.org/html/2502.13001v1" title="View HTML" id="html-2502.13001" aria-labelledby="html-2502.13001" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13001" title="Other formats" id="oth-2502.13001" aria-labelledby="oth-2502.13001">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> You need to MIMIC to get FAME: Solving Meeting Transcript Scarcity with a Multi-Agent Conversations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kirstein,+F">Frederic Kirstein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+M">Muneeb Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wahle,+J+P">Jan Philip Wahle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruas,+T">Terry Ruas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gipp,+B">Bela Gipp</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Meeting summarization suffers from limited high-quality data, mainly due to privacy restrictions and expensive collection processes. We address this gap with FAME, a dataset of 500 meetings in English and 300 in German produced by MIMIC, our new multi-agent meeting synthesis framework that generates meeting transcripts on a given knowledge source by defining psychologically grounded participant profiles, outlining the conversation, and orchestrating a large language model (LLM) debate. 
A modular post-processing step refines these outputs, mitigating potential repetitiveness and overly formal tones, ensuring coherent, credible dialogues at scale. We also propose a psychologically grounded evaluation framework assessing naturalness, social behavior authenticity, and transcript difficulties. Human assessments show that FAME approximates real-meeting spontaneity (4.5/5 in naturalness), preserves speaker-centric challenges (3/5 in spoken language), and introduces richer information-oriented difficulty (4/5 in difficulty). These findings highlight that FAME is a good and scalable proxy for real-world meeting conditions. It enables new test scenarios for meeting summarization research and other conversation-centric applications in tasks requiring conversation data or simulating social scenarios under behavioral constraints. </p> </div> </dd> <dt> <a name='item172'>[172]</a> <a href ="/abs/2502.13012" title="Abstract" id="2502.13012"> arXiv:2502.13012 </a> (cross-list from cs.HC) [<a href="/pdf/2502.13012" title="Download PDF" id="pdf-2502.13012" aria-labelledby="pdf-2502.13012">pdf</a>, <a href="https://arxiv.org/html/2502.13012v1" title="View HTML" id="html-2502.13012" aria-labelledby="html-2502.13012" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13012" title="Other formats" id="oth-2502.13012" aria-labelledby="oth-2502.13012">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards a Design Guideline for RPA Evaluation: A Survey of Large Language Model-Based Role-Playing Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chaoran Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+B">Bingsheng Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+R">Ruishi Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+W">Wenyue Hua</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+W">Weimin Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T+J">Toby Jia-Jun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dakuo Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Role-Playing Agent (RPA) is an increasingly popular type of LLM Agent that simulates human-like behaviors in a variety of tasks. However, evaluating RPAs is challenging due to diverse task requirements and agent designs. This paper proposes an evidence-based, actionable, and generalizable evaluation design guideline for LLM-based RPA by systematically reviewing 1,676 papers published between Jan. 2021 and Dec. 2024. Our analysis identifies six agent attributes, seven task attributes, and seven evaluation metrics from existing literature. Based on these findings, we present an RPA evaluation design guideline to help researchers develop more systematic and consistent evaluation methods. 
</p> </div> </dd> <dt> <a name='item173'>[173]</a> <a href ="/abs/2502.13025" title="Abstract" id="2502.13025"> arXiv:2502.13025 </a> (cross-list from cs.AI) [<a href="/pdf/2502.13025" title="Download PDF" id="pdf-2502.13025" aria-labelledby="pdf-2502.13025">pdf</a>, <a href="https://arxiv.org/html/2502.13025v1" title="View HTML" id="html-2502.13025" aria-labelledby="html-2502.13025" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13025" title="Other formats" id="oth-2502.13025" aria-labelledby="oth-2502.13025">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Agentic Deep Graph Reasoning Yields Self-Organizing Knowledge Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Buehler,+M+J">Markus J. Buehler</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Materials Science (cond-mat.mtrl-sci); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> We present an agentic, autonomous graph expansion framework that iteratively structures and refines knowledge in situ. Unlike conventional knowledge graph construction methods relying on static extraction or single-pass learning, our approach couples a reasoning-native large language model with a continually updated graph representation. At each step, the system actively generates new concepts and relationships, merges them into a global graph, and formulates subsequent prompts based on its evolving structure. Through this feedback-driven loop, the model organizes information into a scale-free network characterized by hub formation, stable modularity, and bridging nodes that link disparate knowledge clusters. 
Over hundreds of iterations, new nodes and edges continue to appear without saturating, while centrality measures and shortest path distributions evolve to yield increasingly distributed connectivity. Our analysis reveals emergent patterns, such as the rise of highly connected 'hub' concepts and the shifting influence of 'bridge' nodes, indicating that agentic, self-reinforcing graph construction can yield open-ended, coherent knowledge structures. Applied to materials design problems, we present compositional reasoning experiments by extracting node-specific and synergy-level principles to foster genuinely novel knowledge synthesis, yielding cross-domain ideas that transcend rote summarization and strengthen the framework's potential for open-ended scientific discovery. We discuss other applications in scientific discovery and outline future directions for enhancing scalability and interpretability. </p> </div> </dd> <dt> <a name='item174'>[174]</a> <a href ="/abs/2502.13095" title="Abstract" id="2502.13095"> arXiv:2502.13095 </a> (cross-list from cs.CV) [<a href="/pdf/2502.13095" title="Download PDF" id="pdf-2502.13095" aria-labelledby="pdf-2502.13095">pdf</a>, <a href="https://arxiv.org/html/2502.13095v1" title="View HTML" id="html-2502.13095" aria-labelledby="html-2502.13095" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13095" title="Other formats" id="oth-2502.13095" aria-labelledby="oth-2502.13095">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding and Rectifying Safety Perception Distortion in VLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xiaohan Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+J">Jian Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kesidis,+G">George Kesidis</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+L">Lu Lin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent studies reveal that vision-language models (VLMs) become more susceptible to harmful requests and jailbreak attacks after integrating the vision modality, exhibiting greater vulnerability than their text-only LLM backbones. To uncover the root cause of this phenomenon, we conduct an in-depth analysis and identify a key issue: multimodal inputs introduce a modality-induced activation shift toward a "safer" direction compared to their text-only counterparts, leading VLMs to systematically overestimate the safety of harmful inputs. We refer to this issue as safety perception distortion. To mitigate such distortion, we propose Activation Shift Disentanglement and Calibration (ShiftDC), a training-free method that decomposes and calibrates the modality-induced activation shift to reduce the impact of modality on safety. By isolating and removing the safety-relevant component, ShiftDC restores the inherent safety alignment of the LLM backbone while preserving the vision-language capabilities of VLMs. Empirical results demonstrate that ShiftDC significantly enhances alignment performance on safety benchmarks without impairing model utility. 
</p> </div> </dd> <dt> <a name='item175'>[175]</a> <a href ="/abs/2502.13131" title="Abstract" id="2502.13131"> arXiv:2502.13131 </a> (cross-list from cs.AI) [<a href="/pdf/2502.13131" title="Download PDF" id="pdf-2502.13131" aria-labelledby="pdf-2502.13131">pdf</a>, <a href="https://arxiv.org/html/2502.13131v1" title="View HTML" id="html-2502.13131" aria-labelledby="html-2502.13131" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13131" title="Other formats" id="oth-2502.13131" aria-labelledby="oth-2502.13131">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rethinking Diverse Human Preference Learning through Principal Component Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+F">Feng Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+R">Rui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Hao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+C">Chunyuan Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+J">Jiarui Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+J">Jingyan Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Huan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanjie Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Understanding human preferences is crucial for improving foundation models and building personalized AI systems. 
However, preferences are inherently diverse and complex, making it difficult for traditional reward models to capture their full range. While fine-grained preference data can help, collecting it is expensive and hard to scale. In this paper, we introduce Decomposed Reward Models (DRMs), a novel approach that extracts diverse human preferences from binary comparisons without requiring fine-grained annotations. Our key insight is to represent human preferences as vectors and analyze them using Principal Component Analysis (PCA). By constructing a dataset of embedding differences between preferred and rejected responses, DRMs identify orthogonal basis vectors that capture distinct aspects of preference. These decomposed rewards can be flexibly combined to align with different user needs, offering an interpretable and scalable alternative to traditional reward models. We demonstrate that DRMs effectively extract meaningful preference dimensions (e.g., helpfulness, safety, humor) and adapt to new users without additional training. Our results highlight DRMs as a powerful framework for personalized and interpretable LLM alignment. 
</p> </div> </dd> <dt> <a name='item176'>[176]</a> <a href ="/abs/2502.13135" title="Abstract" id="2502.13135"> arXiv:2502.13135 </a> (cross-list from cs.LG) [<a href="/pdf/2502.13135" title="Download PDF" id="pdf-2502.13135" aria-labelledby="pdf-2502.13135">pdf</a>, <a href="https://arxiv.org/html/2502.13135v1" title="View HTML" id="html-2502.13135" aria-labelledby="html-2502.13135" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13135" title="Other formats" id="oth-2502.13135" aria-labelledby="oth-2502.13135">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sleepless Nights, Sugary Days: Creating Synthetic Users with Health Conditions for Realistic Coaching Agent Interactions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yun,+T">Taedong Yun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+E">Eric Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Safdari,+M">Mustafa Safdari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J+H">Jong Ha Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+V+V">Vaishnavi Vinod Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahdavi,+S+S">S. 
Sara Mahdavi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Amar,+J">Jonathan Amar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peyton,+D">Derek Peyton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aharony,+R">Reut Aharony</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Michaelides,+A">Andreas Michaelides</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schneider,+L">Logan Schneider</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galatzer-Levy,+I">Isaac Galatzer-Levy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+Y">Yugang Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Canny,+J">John Canny</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gretton,+A">Arthur Gretton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Matari%C4%87,+M">Maja Matarić</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> We present an end-to-end framework for generating synthetic users for evaluating interactive agents designed to encourage positive behavior changes, such as in health and lifestyle coaching. The synthetic users are grounded in health and lifestyle conditions, specifically sleep and diabetes management in this study, to ensure realistic interactions with the health coaching agent. Synthetic users are created in two stages: first, structured data are generated grounded in real-world health and lifestyle factors in addition to basic demographics and behavioral attributes; second, full profiles of the synthetic users are developed conditioned on the structured data. 
Interactions between synthetic users and the coaching agent are simulated using generative agent-based models such as Concordia, or directly by prompting a language model. Using two independently-developed agents for sleep and diabetes coaching as case studies, the validity of this framework is demonstrated by analyzing the coaching agent's understanding of the synthetic users' needs and challenges. Finally, through multiple blinded evaluations of user-coach interactions by human experts, we demonstrate that our synthetic users with health and behavioral attributes more accurately portray real human users with the same attributes, compared to generic synthetic users not grounded in such attributes. The proposed framework lays the foundation for efficient development of conversational agents through extensive, realistic, and grounded simulated interactions. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 117 of 117 entries)</h3> <dt> <a name='item177'>[177]</a> <a href ="/abs/2205.01845" title="Abstract" id="2205.01845"> arXiv:2205.01845 </a> (replaced) [<a href="/pdf/2205.01845" title="Download PDF" id="pdf-2205.01845" aria-labelledby="pdf-2205.01845">pdf</a>, <a href="https://arxiv.org/html/2205.01845v2" title="View HTML" id="html-2205.01845" aria-labelledby="html-2205.01845" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2205.01845" title="Other formats" id="oth-2205.01845" aria-labelledby="oth-2205.01845">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Seed-Guided Topic Discovery with Out-of-Vocabulary Seeds </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Y">Yu Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xuan Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiawei Han</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages; Accepted to NAACL 2022 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Discovering latent topics from text corpora has been studied for decades. Many existing topic models adopt a fully unsupervised setting, and their discovered topics may not cater to users' particular interests due to their inability of leveraging user guidance. Although there exist seed-guided topic discovery approaches that leverage user-provided seeds to discover topic-representative terms, they are less concerned with two factors: (1) the existence of out-of-vocabulary seeds and (2) the power of pre-trained language models (PLMs). In this paper, we generalize the task of seed-guided topic discovery to allow out-of-vocabulary seeds. We propose a novel framework, named SeeTopic, wherein the general knowledge of PLMs and the local semantics learned from the input corpus can mutually benefit each other. Experiments on three real datasets from different domains demonstrate the effectiveness of SeeTopic in terms of topic coherence, accuracy, and diversity. 
</p> </div> </dd> <dt> <a name='item178'>[178]</a> <a href ="/abs/2402.07625" title="Abstract" id="2402.07625"> arXiv:2402.07625 </a> (replaced) [<a href="/pdf/2402.07625" title="Download PDF" id="pdf-2402.07625" aria-labelledby="pdf-2402.07625">pdf</a>, <a href="https://arxiv.org/html/2402.07625v4" title="View HTML" id="html-2402.07625" aria-labelledby="html-2402.07625" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.07625" title="Other formats" id="oth-2402.07625" aria-labelledby="oth-2402.07625">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Autonomous Data Selection with Zero-shot Generative Classifiers for Mathematical Texts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yifan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yifan Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+A+C">Andrew Chi-Chih Yao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages, 8 figures. 
arXiv admin note: text overlap with <a href="https://arxiv.org/abs/0808.2664" data-arxiv-id="0808.2664" class="link-https">arXiv:0808.2664</a>, <a href="https://arxiv.org/abs/0806.2159" data-arxiv-id="0806.2159" class="link-https">arXiv:0806.2159</a>, <a href="https://arxiv.org/abs/1703.08834" data-arxiv-id="1703.08834" class="link-https">arXiv:1703.08834</a>, <a href="https://arxiv.org/abs/math/0610707" data-arxiv-id="math/0610707" class="link-https">arXiv:math/0610707</a> by other authors </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> We present Autonomous Data Selection (AutoDS), a method that leverages base language models themselves as zero-shot "generative classifiers" to automatically curate high-quality mathematical texts. Unlike prior approaches that require human annotations or training a dedicated data filter, AutoDS relies solely on a model's logits to determine whether a given passage is mathematically informative and educational. By integrating AutoDS into a continual pretraining pipeline, we substantially boost downstream performance on challenging math benchmarks (MATH, GSM8K, and BBH) while using far fewer tokens than previous methods. Empirically, our approach achieves roughly a twofold improvement in pretraining token efficiency over strong baselines, underscoring the potential of self-directed data selection in enhancing mathematical reasoning. We release our curated AutoMathText dataset to facilitate future research in automated domain-specific data curation. The AutoMathText dataset is available at <a href="https://huggingface.co/datasets/math-ai/AutoMathText" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
The code is available at <a href="https://github.com/yifanzhang-pro/AutoMathText" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item179'>[179]</a> <a href ="/abs/2404.09077" title="Abstract" id="2404.09077"> arXiv:2404.09077 </a> (replaced) [<a href="/pdf/2404.09077" title="Download PDF" id="pdf-2404.09077" aria-labelledby="pdf-2404.09077">pdf</a>, <a href="https://arxiv.org/html/2404.09077v3" title="View HTML" id="html-2404.09077" aria-labelledby="html-2404.09077" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.09077" title="Other formats" id="oth-2404.09077" aria-labelledby="oth-2404.09077">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CuriousLLM: Elevating Multi-Document Question Answering with LLM-Enhanced Knowledge Graph Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zukang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zixuan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xuan Zhu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for publication in NAACL 2025. The official version will be available in the ACL Anthology </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have achieved significant success in open-domain question answering. However, they continue to face challenges such as hallucinations and knowledge cutoffs. These issues can be mitigated through in-context learning by providing LLMs with relevant context before generating answers. 
Recent literature proposes Knowledge Graph Prompting (KGP) which integrates knowledge graphs with an LLM-based traversal agent to substantially enhance document retrieval quality. However, KGP requires costly fine-tuning with large datasets and remains prone to hallucination. In this paper, we propose CuriousLLM, an enhancement that integrates a curiosity-driven reasoning mechanism into an LLM agent. This mechanism enables the agent to generate relevant follow-up questions, thereby guiding the information retrieval process more efficiently. Central to our approach is the development of the new Follow-upQA dataset, which includes questions and supporting evidence as input, with follow-up questions serving as ground truths. These follow-up questions either inquire about what is still missing to fully answer the user's query or use special tokens to signify that the retrieved evidence is sufficient. Our experiments show that CuriousLLM significantly boosts LLM performance in multi-document question answering (MD-QA), circumventing the substantial computational costs and latency from the original KGP framework. 
</p> </div> </dd> <dt> <a name='item180'>[180]</a> <a href ="/abs/2405.03371" title="Abstract" id="2405.03371"> arXiv:2405.03371 </a> (replaced) [<a href="/pdf/2405.03371" title="Download PDF" id="pdf-2405.03371" aria-labelledby="pdf-2405.03371">pdf</a>, <a href="https://arxiv.org/html/2405.03371v3" title="View HTML" id="html-2405.03371" aria-labelledby="html-2405.03371" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.03371" title="Other formats" id="oth-2405.03371" aria-labelledby="oth-2405.03371">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Explainable Fake News Detection With Large Language Model via Defense Among Competing Wisdom </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongzhan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zhiwei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+R">Ruichao Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yuan Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+Y">Yi Chang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, WWW'2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Most fake news detection methods learn latent feature representations based on neural networks, which makes them black boxes to classify a piece of news without giving any justification. Existing explainable systems generate veracity justifications from investigative journalism, which suffer from debunking delayed and low efficiency. 
Recent studies simply assume that the justification is equivalent to the majority opinions expressed in the wisdom of crowds. However, the opinions typically contain some inaccurate or biased information since the wisdom of crowds is uncensored. To detect fake news from a sea of diverse, crowded and even competing narratives, in this paper, we propose a novel defense-based explainable fake news detection framework. Specifically, we first propose an evidence extraction module to split the wisdom of crowds into two competing parties and respectively detect salient evidences. To gain concise insights from evidences, we then design a prompt-based module that utilizes a large language model to generate justifications by inferring reasons towards two possible veracities. Finally, we propose a defense-based inference module to determine veracity via modeling the defense among these justifications. Extensive experiments conducted on two real-world benchmarks demonstrate that our proposed method outperforms state-of-the-art baselines in terms of fake news detection and provides high-quality justifications. </p> </div> </dd> <dt> <a name='item181'>[181]</a> <a href ="/abs/2405.07764" title="Abstract" id="2405.07764"> arXiv:2405.07764 </a> (replaced) [<a href="/pdf/2405.07764" title="Download PDF" id="pdf-2405.07764" aria-labelledby="pdf-2405.07764">pdf</a>, <a href="https://arxiv.org/html/2405.07764v3" title="View HTML" id="html-2405.07764" aria-labelledby="html-2405.07764" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.07764" title="Other formats" id="oth-2405.07764" aria-labelledby="oth-2405.07764">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LGDE: Local Graph-based Dictionary Expansion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schindler,+D+J">Dominik J. 
Schindler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jha,+S">Sneha Jha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xixuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Buehling,+K">Kilian Buehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Heft,+A">Annett Heft</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barahona,+M">Mauricio Barahona</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Python code available at: <a href="https://github.com/barahona-research-group/LGDE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Social and Information Networks (cs.SI); Physics and Society (physics.soc-ph) </div> <p class='mathjax'> We present Local Graph-based Dictionary Expansion (LGDE), a method for data-driven discovery of the semantic neighbourhood of words using tools from manifold learning and network science. At the heart of LGDE lies the creation of a word similarity graph from the geometry of word embeddings followed by local community detection based on graph diffusion. The diffusion in the local graph manifold allows the exploration of the complex nonlinear geometry of word embeddings to capture word similarities based on paths of semantic association, over and above direct pairwise similarities. Exploiting such semantic neighbourhoods enables the expansion of dictionaries of pre-selected keywords, an important step for tasks in information retrieval, such as database queries and online data collection. We validate LGDE on two user-generated English-language corpora and show that LGDE enriches the list of keywords with improved performance relative to methods based on direct word similarities or co-occurrences. 
We further demonstrate our method through a real-world use case from communication science, where LGDE is evaluated quantitatively on the expansion of a conspiracy-related dictionary from online data collected and analysed by domain experts. Our empirical results and expert user assessment indicate that LGDE expands the seed dictionary with more useful keywords due to the manifold-learning-based similarity network. </p> </div> </dd> <dt> <a name='item182'>[182]</a> <a href ="/abs/2405.16681" title="Abstract" id="2405.16681"> arXiv:2405.16681 </a> (replaced) [<a href="/pdf/2405.16681" title="Download PDF" id="pdf-2405.16681" aria-labelledby="pdf-2405.16681">pdf</a>, <a href="https://arxiv.org/html/2405.16681v2" title="View HTML" id="html-2405.16681" aria-labelledby="html-2405.16681" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.16681" title="Other formats" id="oth-2405.16681" aria-labelledby="oth-2405.16681">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Triple Preference Optimization: Achieving Better Alignment using a Single Step Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saeidi,+A">Amir Saeidi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Verma,+S">Shivanshu Verma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=RRV,+A">Aswin RRV</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rasul,+K">Kashif Rasul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baral,+C">Chitta Baral</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reinforcement Learning with Human Feedback (RLHF) enhances the alignment of Large Language Models (LLMs). 
However, its limitations have led to the development of Direct Preference Optimization (DPO), an RL-free approach designed to overcome these shortcomings. While studies have shown that DPO improves instruction-following capabilities, it negatively impacts the reasoning ability of LLMs. Additionally, DPO is highly sensitive to judgment noise in preference datasets and the size of the training set. Although several modifications to DPO have been proposed, they still fail to fully resolve these issues. To address these limitations, we propose Triple Preference Optimization (TPO), a new preference learning method designed to enhance both reasoning and instruction-following abilities through one-step optimization. We compare TPO against DPO and its recent variants using state-of-the-art training setups, including both base and instruction-tuned models such as Mistral and Llama 3. Our evaluation covers a comprehensive range of chat-based and reasoning benchmarks. The results demonstrate that TPO achieves significant improvements over existing methods without substantially increasing response length across different dataset sizes. Specifically, TPO outperforms DPO and SimPO by up to 7.0% and 7.3% points on Arena-Hard, 12.2% and 13.3% points on MixEval-Hard, 10.4% and 10.1% points on MMLU-Pro, and 19.0% and 19.2% points on GSM8K, respectively. Furthermore, TPO achieves these improvements while requiring less data than DPO. 
</p> </div> </dd> <dt> <a name='item183'>[183]</a> <a href ="/abs/2406.01506" title="Abstract" id="2406.01506"> arXiv:2406.01506 </a> (replaced) [<a href="/pdf/2406.01506" title="Download PDF" id="pdf-2406.01506" aria-labelledby="pdf-2406.01506">pdf</a>, <a href="https://arxiv.org/html/2406.01506v3" title="View HTML" id="html-2406.01506" aria-labelledby="html-2406.01506" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.01506" title="Other formats" id="oth-2406.01506" aria-labelledby="oth-2406.01506">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Geometry of Categorical and Hierarchical Concepts in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+K">Kiho Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choe,+Y+J">Yo Joong Choe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yibo Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Veitch,+V">Victor Veitch</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for an oral presentation at ICLR 2025. Best Paper Award at the ICML 2024 Workshop on Mechanistic Interpretability. Code is available at <a href="https://github.com/KihoPark/LLM_Categorical_Hierarchical_Representations" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Machine Learning (stat.ML) </div> <p class='mathjax'> The linear representation hypothesis is the informal idea that semantic concepts are encoded as linear directions in the representation spaces of large language models (LLMs). 
Previous work has shown how to make this notion precise for representing binary concepts that have natural contrasts (e.g., {male, female}) as directions in representation space. However, many natural concepts do not have natural contrasts (e.g., whether the output is about an animal). In this work, we show how to extend the formalization of the linear representation hypothesis to represent features (e.g., is_animal) as vectors. This allows us to immediately formalize the representation of categorical concepts as polytopes in the representation space. Further, we use the formalization to prove a relationship between the hierarchical structure of concepts and the geometry of their representations. We validate these theoretical results on the Gemma and LLaMA-3 large language models, estimating representations for 900+ hierarchically related concepts using data from WordNet. </p> </div> </dd> <dt> <a name='item184'>[184]</a> <a href ="/abs/2406.05661" title="Abstract" id="2406.05661"> arXiv:2406.05661 </a> (replaced) [<a href="/pdf/2406.05661" title="Download PDF" id="pdf-2406.05661" aria-labelledby="pdf-2406.05661">pdf</a>, <a href="https://arxiv.org/html/2406.05661v4" title="View HTML" id="html-2406.05661" aria-labelledby="html-2406.05661" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.05661" title="Other formats" id="oth-2406.05661" aria-labelledby="oth-2406.05661">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked Language Modelling methods for learning Speech Representations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yadav,+H">Hemant Yadav</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sitaram,+S">Sunayana Sitaram</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+R+R">Rajiv Ratn Shah</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 4 pages, submitted to interspeech2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In recent years, self-supervised pre-training methods have gained significant traction in learning high-level information from raw speech. Among these methods, HuBERT has demonstrated SOTA performance in automatic speech recognition (ASR). However, HuBERT's performance lags behind data2vec due to disparities in pre-training strategies. In this paper, we propose (i) a Swap method to address the pre-training and inference mismatch observed in HuBERT and (ii) a Multicluster masked prediction loss for more effective utilization of the model's capacity. The resulting method, MS-HuBERT, is an end-to-end self-supervised pre-training method for learning robust speech representations. It beats vanilla HuBERT on the ASR Librispeech benchmark on average by a 5% margin when evaluated on different finetuning splits. Additionally, we demonstrate that the learned embeddings obtained during pre-training encode essential information for improving performance of content-based tasks such as ASR. 
</p> </div> </dd> <dt> <a name='item185'>[185]</a> <a href ="/abs/2406.08754" title="Abstract" id="2406.08754"> arXiv:2406.08754 </a> (replaced) [<a href="/pdf/2406.08754" title="Download PDF" id="pdf-2406.08754" aria-labelledby="pdf-2406.08754">pdf</a>, <a href="https://arxiv.org/html/2406.08754v3" title="View HTML" id="html-2406.08754" aria-labelledby="html-2406.08754" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.08754" title="Other formats" id="oth-2406.08754" aria-labelledby="oth-2406.08754">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> StructuralSleight: Automated Jailbreak Attacks on Large Language Models Utilizing Uncommon Text-Organization Structures </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bangxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+H">Hengrui Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+C">Cong Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+J">Jin Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+H">Huangqing Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+L">Linfeng Feng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Cryptography and Security (cs.CR) </div> <p class='mathjax'> Large Language Models (LLMs) are widely used in natural language processing but face the risk of jailbreak attacks that maliciously induce them to generate harmful content. 
Existing jailbreak attacks, including character-level and context-level attacks, mainly focus on the prompt of plain text without specifically exploring the significant influence of its structure. In this paper, we focus on studying how the prompt structure contributes to the jailbreak attack. We introduce a novel structure-level attack method based on long-tailed structures, which we refer to as Uncommon Text-Organization Structures (UTOS). We extensively study 12 UTOS templates and 6 obfuscation methods to build an effective automated jailbreak tool named StructuralSleight that contains three escalating attack strategies: Structural Attack, Structural and Character/Context Obfuscation Attack, and Fully Obfuscated Structural Attack. Extensive experiments on existing LLMs show that StructuralSleight significantly outperforms the baseline methods. In particular, the attack success rate reaches 94.62% on GPT-4o, which has not been addressed by state-of-the-art techniques. </p> </div> </dd> <dt> <a name='item186'>[186]</a> <a href ="/abs/2406.12221" title="Abstract" id="2406.12221"> arXiv:2406.12221 </a> (replaced) [<a href="/pdf/2406.12221" title="Download PDF" id="pdf-2406.12221" aria-labelledby="pdf-2406.12221">pdf</a>, <a href="https://arxiv.org/html/2406.12221v3" title="View HTML" id="html-2406.12221" aria-labelledby="html-2406.12221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.12221" title="Other formats" id="oth-2406.12221" aria-labelledby="oth-2406.12221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On-Policy Self-Alignment with Fine-grained Knowledge Feedback for Hallucination Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xueru Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lou,+J">Jie Lou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xinyu Lu</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Yuqiu,+J">Ji Yuqiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xinyan Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Ben He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xianpei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Debing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hallucination occurs when large language models exhibit behavior that deviates from the boundaries of their knowledge during response generation. To address this critical issue, previous learning-based methods attempt to finetune models but are limited by off-policy sampling and coarse-grained feedback. In this paper, we present <i>Reinforcement Learning for Hallucination</i> (RLFH), an on-policy self-alignment approach that enables LLMs to actively explore their knowledge boundaries and self-correct generation behavior through fine-grained feedback signals. RLFH introduces a self-assessment framework where the policy serves as its own judge. Through this framework, responses are automatically decomposed into atomic facts and their truthfulness and informativeness are assessed against external knowledge sources. The resulting fine-grained feedback at the statement level is then converted into token-level dense reward signals. This enables online reinforcement learning to achieve precise and timely optimization without human intervention. 
Comprehensive evaluations on HotpotQA, SQuADv2, and Biography benchmarks validate RLFH's effectiveness in hallucination mitigation. </p> </div> </dd> <dt> <a name='item187'>[187]</a> <a href ="/abs/2406.12382" title="Abstract" id="2406.12382"> arXiv:2406.12382 </a> (replaced) [<a href="/pdf/2406.12382" title="Download PDF" id="pdf-2406.12382" aria-labelledby="pdf-2406.12382">pdf</a>, <a href="https://arxiv.org/html/2406.12382v5" title="View HTML" id="html-2406.12382" aria-labelledby="html-2406.12382" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.12382" title="Other formats" id="oth-2406.12382" aria-labelledby="oth-2406.12382">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Instance Training to Instruction Learning: Task Adapters Generation from Instructions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+H">Huanxuan Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shizhu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuanzhe Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Y">Yanchao Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shengping Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> accepted to NeurIPS 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have acquired the ability to solve general tasks by utilizing instruction finetuning (IFT). 
However, IFT still relies heavily on instance training of extensive task data, which greatly limits the adaptability of LLMs to real-world scenarios where labeled task instances are scarce and broader task generalization becomes paramount. Contrary to LLMs, humans acquire skills and complete tasks not merely through repeated practice but also by understanding and following instructional guidelines. This paper is dedicated to simulating human learning to address the shortcomings of instance training, focusing on instruction learning to enhance cross-task generalization. Within this context, we introduce Task Adapters Generation from Instructions (TAGI), which automatically constructs the task-specific model in a parameter generation manner based on the given task instructions without retraining for unseen tasks. Specifically, we utilize knowledge distillation to enhance the consistency between TAGI developed through Learning with Instruction and task-specific models developed through Training with Instance, by aligning the labels, output logits, and adapter parameters between them. TAGI is endowed with cross-task generalization capabilities through a two-stage training process that includes hypernetwork pretraining and finetuning. We evaluate TAGI on the Super-Natural Instructions and P3 datasets. The experimental results demonstrate that TAGI can match or even outperform traditional meta-trained models and other hypernetwork models, while significantly reducing computational requirements. 
</p> </div> </dd> <dt> <a name='item188'>[188]</a> <a href ="/abs/2406.13555" title="Abstract" id="2406.13555"> arXiv:2406.13555 </a> (replaced) [<a href="/pdf/2406.13555" title="Download PDF" id="pdf-2406.13555" aria-labelledby="pdf-2406.13555">pdf</a>, <a href="https://arxiv.org/html/2406.13555v3" title="View HTML" id="html-2406.13555" aria-labelledby="html-2406.13555" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.13555" title="Other formats" id="oth-2406.13555" aria-labelledby="oth-2406.13555">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BiLD: Bi-directional Logits Difference Loss for Large Language Model Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Minchong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Feng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xiaohui Song</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> COLING 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> In recent years, large language models (LLMs) have shown exceptional capabilities across various natural language processing (NLP) tasks. However, such impressive performance often comes with the trade-off of an increased parameter size, posing significant challenges for widespread deployment. Knowledge distillation (KD) provides a solution by transferring knowledge from a large teacher model to a smaller student model. In this paper, we explore the task-specific distillation of LLMs at the logit level. 
Our investigation reveals that the logits of fine-tuned LLMs exhibit a more extreme long-tail distribution than those from vision models, with hidden "noise" in the long tail affecting distillation performance. Furthermore, existing logits distillation methods often struggle to effectively utilize the internal ranking information from the logits. To address these, we propose the Bi-directional Logits Difference (BiLD) loss. The BiLD loss filters out the long-tail noise by utilizing only top-$k$ teacher and student logits, and leverages the internal logits ranking information by constructing logits differences. To evaluate BiLD loss, we conduct comprehensive experiments on 13 datasets using two types of LLMs. Our results show that the BiLD loss, with only the top-8 logits, outperforms supervised fine-tuning (SFT), vanilla KL loss, and five other distillation methods from both NLP and CV fields. </p> </div> </dd> <dt> <a name='item189'>[189]</a> <a href ="/abs/2406.17261" title="Abstract" id="2406.17261"> arXiv:2406.17261 </a> (replaced) [<a href="/pdf/2406.17261" title="Download PDF" id="pdf-2406.17261" aria-labelledby="pdf-2406.17261">pdf</a>, <a href="https://arxiv.org/html/2406.17261v3" title="View HTML" id="html-2406.17261" aria-labelledby="html-2406.17261" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.17261" title="Other formats" id="oth-2406.17261" aria-labelledby="oth-2406.17261">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TRAWL: Tensor Reduced and Approximated Weights for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yiran Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patel,+H">Het Patel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Y">Yu Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahn,+D">Dawon Ahn</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jia Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Y">Yue Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Papalexakis,+E+E">Evangelos E. Papalexakis</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages. To appear on PAKDD 2025 Special Session on 'Data Science: Foundations and Applications (DSFA)' </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent research has shown that pruning large-scale language models for inference is an effective approach to improving model efficiency, significantly reducing model weights with minimal impact on performance. Interestingly, pruning can sometimes even enhance accuracy by removing noise that accumulates during training, particularly through matrix decompositions. However, recent work has primarily focused on single matrix decompositions or lower precision techniques, which may fail to fully capture structural patterns. To address these limitations, we introduce TRAWL (Tensor Reduced and Approximated Weights for Large Language Models), a technique that applies tensor decomposition across multiple weight matrices to effectively denoise LLMs by capturing global structural patterns. Our experiments show that TRAWL improves model performance by up to 16% over baseline models on benchmark datasets, without requiring additional data, training, or fine-tuning. 
</p> </div> </dd> <dt> <a name='item190'>[190]</a> <a href ="/abs/2407.01461" title="Abstract" id="2407.01461"> arXiv:2407.01461 </a> (replaced) [<a href="/pdf/2407.01461" title="Download PDF" id="pdf-2407.01461" aria-labelledby="pdf-2407.01461">pdf</a>, <a href="https://arxiv.org/html/2407.01461v2" title="View HTML" id="html-2407.01461" aria-labelledby="html-2407.01461" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.01461" title="Other formats" id="oth-2407.01461" aria-labelledby="oth-2407.01461">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing the Capability and Robustness of Large Language Models through Reinforcement Learning-Driven Query Refinement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Z">Zisu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiaohua Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+F">Feiran Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhibo Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Cenyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+Q">Qi Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xiaoqing Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xuanjing Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The capacity of large language models (LLMs) to generate honest, harmless, and helpful responses heavily relies on the quality of user prompts. However, these prompts often tend to be brief and vague, thereby significantly limiting the full potential of LLMs. 
Moreover, harmful prompts can be meticulously crafted and manipulated by adversaries to jailbreak LLMs, inducing them to produce potentially toxic content. To enhance the capabilities of LLMs while maintaining strong robustness against harmful jailbreak inputs, this study proposes a transferable and pluggable framework that refines user prompts before they are input into LLMs. This strategy improves the quality of the queries, empowering LLMs to generate more truthful, benign and useful responses. Specifically, a lightweight query refinement model is introduced and trained using a specially designed reinforcement learning approach that incorporates multiple objectives to enhance particular capabilities of LLMs. Extensive experiments demonstrate that the refinement model not only improves the quality of responses but also strengthens their robustness against jailbreak attacks. Code is available at: <a href="https://github.com/Huangzisu/query-refinement" rel="external noopener nofollow" class="link-external link-https">this https URL</a> . 
</p> </div> </dd> <dt> <a name='item191'>[191]</a> <a href ="/abs/2407.02302" title="Abstract" id="2407.02302"> arXiv:2407.02302 </a> (replaced) [<a href="/pdf/2407.02302" title="Download PDF" id="pdf-2407.02302" aria-labelledby="pdf-2407.02302">pdf</a>, <a href="https://arxiv.org/html/2407.02302v2" title="View HTML" id="html-2407.02302" aria-labelledby="html-2407.02302" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.02302" title="Other formats" id="oth-2407.02302" aria-labelledby="oth-2407.02302">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Human Understanding of Paraphrase Types in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Meier,+D">Dominik Meier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wahle,+J+P">Jan Philip Wahle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruas,+T">Terry Ruas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gipp,+B">Bela Gipp</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Proceedings of the 31st International Conference on Computational Linguistics (2025), pages 6298-6316 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Paraphrases represent a human's intuitive ability to understand expressions presented in various different ways. Current paraphrase evaluations of language models primarily use binary approaches, offering limited interpretability of specific text changes. Atomic paraphrase types (APT) decompose paraphrases into different linguistic changes and offer a granular view of the flexibility in linguistic expression (e.g., a shift in syntax or vocabulary used). 
In this study, we assess the human preferences towards ChatGPT in generating English paraphrases with ten APTs and five prompting techniques. We introduce APTY (Atomic Paraphrase TYpes), a dataset of 800 sentence-level and word-level annotations by 15 annotators. The dataset also provides a human preference ranking of paraphrases with different types that can be used to fine-tune models with RLHF and DPO methods. Our results reveal that ChatGPT and a DPO-trained LLama 7B model can generate simple APTs, such as additions and deletions, but struggle with complex structures (e.g., subordination changes). This study contributes to understanding which aspects of paraphrasing language models have already succeeded at understanding and what remains elusive. In addition, we show how our curated datasets can be used to develop language models with specific linguistic capabilities. </p> </div> </dd> <dt> <a name='item192'>[192]</a> <a href ="/abs/2408.07471" title="Abstract" id="2408.07471"> arXiv:2408.07471 </a> (replaced) [<a href="/pdf/2408.07471" title="Download PDF" id="pdf-2408.07471" aria-labelledby="pdf-2408.07471">pdf</a>, <a href="https://arxiv.org/html/2408.07471v4" title="View HTML" id="html-2408.07471" aria-labelledby="html-2408.07471" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.07471" title="Other formats" id="oth-2408.07471" aria-labelledby="oth-2408.07471">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bridging and Modeling Correlations in Pairwise Data for Direct Preference Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yuxin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+B">Bo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yufei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+X">Xingshan Zeng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liangyou Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+L">Lifeng Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruiming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wei Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 9 figures, 12 tables. Accepted at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Direct preference optimization (DPO), a widely adopted offline preference optimization algorithm, aims to align large language models (LLMs) with human-desired behaviors using pairwise preference data. However, the generation of the winning response and the losing response within pairwise data are typically isolated, leading to weak correlations between them as well as suboptimal alignment performance. To address this issue, we propose an effective framework for Bridging and Modeling Correlations in pairwise data, named BMC. Firstly, we increase the consistency and informativeness of the pairwise preference signals through targeted modifications, synthesizing a pseudo-winning response by improving the losing response with the winning response as a reference. Secondly, we identify that DPO alone is insufficient to model these correlations and capture nuanced variations. Therefore, we propose learning token-level correlations by dynamically leveraging the policy model's confidence during training. 
Comprehensive experiments on QA, math, and instruction-following tasks demonstrate the effectiveness of our approach, significantly surpassing competitive baselines, including DPO. Additionally, our in-depth quantitative analysis reveals the reasons behind our method's superior performance over DPO and showcases its versatility to other DPO variants. We release our repository at <a href="https://github.com/YJiangcm/BMC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item193'>[193]</a> <a href ="/abs/2408.08780" title="Abstract" id="2408.08780"> arXiv:2408.08780 </a> (replaced) [<a href="/pdf/2408.08780" title="Download PDF" id="pdf-2408.08780" aria-labelledby="pdf-2408.08780">pdf</a>, <a href="https://arxiv.org/html/2408.08780v4" title="View HTML" id="html-2408.08780" aria-labelledby="html-2408.08780" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.08780" title="Other formats" id="oth-2408.08780" aria-labelledby="oth-2408.08780">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models Might Not Care What You Are Saying: Prompt Format Beats Descriptions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+C">Chenming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhixiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Hao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yunfang Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> With the help of in-context learning (ICL), large language models (LLMs) have achieved impressive performance across various tasks. 
However, the function of descriptive instructions during ICL remains under-explored. In this work, we propose an ensemble prompt framework to describe the selection criteria of multiple in-context examples, and preliminary experiments on machine translation (MT) across six translation directions confirm that this framework boosts ICL performance. But to our surprise, LLMs might not care what the descriptions actually say, and the performance gain is primarily caused by the ensemble format, since it could lead to improvement even with random descriptive nouns. We further apply this new ensemble framework on a range of commonsense, math, logical reasoning and hallucination tasks with three LLMs and achieve promising results, suggesting again that designing a proper prompt format would be much more effective and efficient than paying effort into specific descriptions. Our code will be publicly available once this paper is published. </p> </div> </dd> <dt> <a name='item194'>[194]</a> <a href ="/abs/2408.15091" title="Abstract" id="2408.15091"> arXiv:2408.15091 </a> (replaced) [<a href="/pdf/2408.15091" title="Download PDF" id="pdf-2408.15091" aria-labelledby="pdf-2408.15091">pdf</a>, <a href="https://arxiv.org/html/2408.15091v2" title="View HTML" id="html-2408.15091" aria-labelledby="html-2408.15091" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.15091" title="Other formats" id="oth-2408.15091" aria-labelledby="oth-2408.15091">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Relation Also Knows: Rethinking the Recall and Editing of Factual Associations in Auto-Regressive Transformer Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhengxiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+N">Naibin Gu</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zheng Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+W">Wanli Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+J">Ji Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weiping Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by AAAI25 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The storage and recall of factual associations in auto-regressive transformer language models (LMs) have drawn a great deal of attention, inspiring knowledge editing by directly modifying the located model weights. Most editing works achieve knowledge editing under the guidance of existing interpretations of knowledge recall that mainly focus on subject knowledge. However, these interpretations are seriously flawed, neglecting relation information and leading to the over-generalizing problem for editing. In this work, we discover a novel relation-focused perspective to interpret the knowledge recall of transformer LMs during inference and apply it on single knowledge editing to avoid over-generalizing. Experimental results on the dataset supplemented with a new R-Specificity criterion demonstrate that our editing approach significantly alleviates over-generalizing while remaining competitive on other criteria, breaking the domination of subject-focused editing for future research. 
</p> </div> </dd> <dt> <a name='item195'>[195]</a> <a href ="/abs/2409.11261" title="Abstract" id="2409.11261"> arXiv:2409.11261 </a> (replaced) [<a href="/pdf/2409.11261" title="Download PDF" id="pdf-2409.11261" aria-labelledby="pdf-2409.11261">pdf</a>, <a href="https://arxiv.org/html/2409.11261v4" title="View HTML" id="html-2409.11261" aria-labelledby="html-2409.11261" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.11261" title="Other formats" id="oth-2409.11261" aria-labelledby="oth-2409.11261">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Art of Storytelling: Multi-Agent Generative AI for Dynamic Multimodal Narratives </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Arif,+S">Samee Arif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arif,+T">Taimoor Arif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haroon,+M+S">Muhammad Saad Haroon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+A+J">Aamina Jamal Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raza,+A+A">Agha Ali Raza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Athar,+A">Awais Athar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces the concept of an education tool that utilizes Generative Artificial Intelligence (GenAI) to enhance storytelling for children. The system combines GenAI-driven narrative co-creation, text-to-speech conversion, and text-to-video generation to produce an engaging experience for learners. 
We describe the co-creation process, the adaptation of narratives into spoken words using text-to-speech models, and the transformation of these narratives into contextually relevant visuals through text-to-video technology. Our evaluation covers the linguistics of the generated stories, the text-to-speech conversion quality, and the accuracy of the generated visuals. </p> </div> </dd> <dt> <a name='item196'>[196]</a> <a href ="/abs/2409.13203" title="Abstract" id="2409.13203"> arXiv:2409.13203 </a> (replaced) [<a href="/pdf/2409.13203" title="Download PDF" id="pdf-2409.13203" aria-labelledby="pdf-2409.13203">pdf</a>, <a href="https://arxiv.org/html/2409.13203v4" title="View HTML" id="html-2409.13203" aria-labelledby="html-2409.13203" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13203" title="Other formats" id="oth-2409.13203" aria-labelledby="oth-2409.13203">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neural-Symbolic Collaborative Distillation: Advancing Small Language Models for Complex Reasoning Tasks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+H">Huanxuan Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shizhu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuanzhe Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to AAAI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this paper, we propose $\textbf{Ne}$ural-$\textbf{Sy}$mbolic 
$\textbf{C}$ollaborative $\textbf{D}$istillation ($\textbf{NesyCD}$), a novel knowledge distillation method for learning the complex reasoning abilities of Large Language Models (LLMs, e.g., $>$ 13B). We argue that complex reasoning tasks are difficult for Small Language Models (SLMs, e.g., $\leq$ 7B), as these tasks demand not only general cognitive abilities but also specialized knowledge, which is often sparse and difficult for these neural-based SLMs to effectively capture. Therefore, NesyCD distills the general capabilities and specialized knowledge in LLMs using different manners. On the one hand, we distill only general abilities from teacher LLMs into the student SLMs of parameterized neural networks. On the other hand, for the specialized abilities and uncommon knowledge of a complex reasoning task, we employ a symbolic knowledge distillation approach to obtain and store the specialized knowledge within a symbolic knowledge base (KB). By decoupling general and specialized capabilities, the proposed NesyCD can achieve superior performance cost-effectively, utilizing smaller models and blending parameterized neural networks with symbolic KB. Moreover, the specialized KB generalizes well and is comprehended and manipulated by humans. Our experiments show that NesyCD significantly boosts SLMs' complex reasoning performance on in-domain (BBH, GSM8K) and out-of-domain (AGIEval, ARC) datasets. Notably, our approach enabled the LLaMA3-8B and Qwen2-7B to surpass GPT-3.5-turbo in performance and come close to matching LLaMA3-70B, despite the latter having nine times more parameters. Our code will be available at <a href="https://github.com/Xnhyacinth/NesyCD" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item197'>[197]</a> <a href ="/abs/2409.18511" title="Abstract" id="2409.18511"> arXiv:2409.18511 </a> (replaced) [<a href="/pdf/2409.18511" title="Download PDF" id="pdf-2409.18511" aria-labelledby="pdf-2409.18511">pdf</a>, <a href="https://arxiv.org/html/2409.18511v4" title="View HTML" id="html-2409.18511" aria-labelledby="html-2409.18511" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.18511" title="Other formats" id="oth-2409.18511" aria-labelledby="oth-2409.18511">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do We Need Domain-Specific Embedding Models? An Empirical Investigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yixuan Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://github.com/yixuantt/FinMTEB" rel="external noopener nofollow" class="link-external link-https">this https URL</a>, The newer version: <a href="https://arxiv.org/abs/2502.10990" data-arxiv-id="2502.10990" class="link-https">arXiv:2502.10990</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Embedding models play a crucial role in representing and retrieving information across various NLP applications. Recent advancements in Large Language Models (LLMs) have further enhanced the performance of embedding models, which are trained on massive amounts of text covering almost every domain. These models are often benchmarked on general-purpose datasets like Massive Text Embedding Benchmark (MTEB), where they demonstrate superior performance. 
However, a critical question arises: Is the development of domain-specific embedding models necessary when general-purpose models are trained on vast corpora that already include specialized domain texts? In this paper, we empirically investigate this question, choosing the finance domain as an example. We introduce the Finance Massive Text Embedding Benchmark (FinMTEB), a counterpart to MTEB that consists of financial domain-specific text datasets. We evaluate the performance of seven state-of-the-art embedding models on FinMTEB and observe a significant performance drop compared to their performance on MTEB. To account for the possibility that this drop is driven by FinMTEB's higher complexity, we propose four measures to quantify dataset complexity and control for this factor in our analysis. Our analysis provides compelling evidence that state-of-the-art embedding models struggle to capture domain-specific linguistic and semantic patterns. Moreover, we find that the performance of general-purpose embedding models on MTEB is not correlated with their performance on FinMTEB, indicating the need for domain-specific embedding benchmarks for domain-specific embedding models. This study sheds light on developing domain-specific embedding models in the LLM era. 
FinMTEB comes with open-source code at <a href="https://github.com/yixuantt/FinMTEB" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item198'>[198]</a> <a href ="/abs/2410.01171" title="Abstract" id="2410.01171"> arXiv:2410.01171 </a> (replaced) [<a href="/pdf/2410.01171" title="Download PDF" id="pdf-2410.01171" aria-labelledby="pdf-2410.01171">pdf</a>, <a href="https://arxiv.org/html/2410.01171v2" title="View HTML" id="html-2410.01171" aria-labelledby="html-2410.01171" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.01171" title="Other formats" id="oth-2410.01171" aria-labelledby="oth-2410.01171">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multilingual Retrieval Augmented Generation for Culturally-Sensitive Tasks: A Benchmark for Cross-lingual Robustness </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bryan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+F">Fiona Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haider,+S">Samar Haider</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agashe,+A">Adwait Agashe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tammy Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+R">Runqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+M">Muqing Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+S">Shriya Ramakrishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Callison-Burch,+C">Chris Callison-Burch</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> 
</div> <p class='mathjax'> The paradigm of retrieval-augmented generation (RAG) helps mitigate hallucinations of large language models (LLMs). However, RAG also introduces biases contained within the retrieved documents. These biases can be amplified in scenarios which are multilingual and culturally-sensitive, such as territorial disputes. In this paper, we introduce BordIRLines, a benchmark consisting of 720 territorial dispute queries paired with 14k Wikipedia documents across 49 languages. To evaluate LLMs' cross-lingual robustness for this task, we formalize several modes for multilingual retrieval. Our experiments on several LLMs reveal that retrieving multilingual documents best improves response consistency and decreases geopolitical bias over using purely in-language documents, showing how incorporating diverse perspectives improves robustness. Also, querying in low-resource languages displays a much wider variance in the linguistic distribution of response citations. Our further experiments and case studies investigate how cross-lingual RAG is affected by aspects from IR to document contents. We release our benchmark and code to support further research towards ensuring equitable information access across languages at <a href="https://huggingface.co/datasets/borderlines/bordirlines" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item199'>[199]</a> <a href ="/abs/2410.01497" title="Abstract" id="2410.01497"> arXiv:2410.01497 </a> (replaced) [<a href="/pdf/2410.01497" title="Download PDF" id="pdf-2410.01497" aria-labelledby="pdf-2410.01497">pdf</a>, <a href="https://arxiv.org/html/2410.01497v2" title="View HTML" id="html-2410.01497" aria-labelledby="html-2410.01497" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.01497" title="Other formats" id="oth-2410.01497" aria-labelledby="oth-2410.01497">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DLP-LoRA: Efficient Task-Specific LoRA Fusion with a Dynamic, Lightweight Plugin for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuxuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruizhe Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint under review, 18 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have achieved robust performance across diverse tasks, but fine-tuning these models for specific domains remains resource-intensive. Parameter-Efficient Fine-Tuning (PEFT) methods like Low-Rank Adaptation (LoRA) address this challenge by fine-tuning a small subset of parameters. However, existing methods for fusing multiple LoRAs lack dynamic fusion based on contextual inputs and often increase inference time due to token-level operations. 
We propose DLP-LoRA, a Dynamic Lightweight Plugin that employs a mini-MLP module with only 5M parameters to dynamically fuse multiple LoRAs at the sentence level using top-p sampling strategies. This approach reduces inference time to less than twice that of single LoRA inference by leveraging parallel computation. Evaluations across 26 tasks-including multiple-choice questions and question answering-demonstrate that DLP-LoRA achieves an average accuracy of 92.34% on multiple-choice datasets and significant improvements in BLEU and ROUGE scores on QA datasets, outperforming different LLMs backbones under composite task settings. DLP-LoRA effectively balances performance and efficiency, making it a practical solution for dynamic multi-task adaptation in LLMs. Our code is available at <a href="https://github.com/MeCuping/DLP-LoRA" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item200'>[200]</a> <a href ="/abs/2410.02089" title="Abstract" id="2410.02089"> arXiv:2410.02089 </a> (replaced) [<a href="/pdf/2410.02089" title="Download PDF" id="pdf-2410.02089" aria-labelledby="pdf-2410.02089">pdf</a>, <a href="https://arxiv.org/html/2410.02089v2" title="View HTML" id="html-2410.02089" aria-labelledby="html-2410.02089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02089" title="Other formats" id="oth-2410.02089" aria-labelledby="oth-2410.02089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gehring,+J">Jonas Gehring</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+K">Kunhao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Copet,+J">Jade Copet</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mella,+V">Vegard Mella</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carbonneaux,+Q">Quentin Carbonneaux</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cohen,+T">Taco Cohen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Synnaeve,+G">Gabriel Synnaeve</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Add repair model ablation, update related work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) deployed as agents solve user-specified tasks over multiple steps while keeping the required manual engagement to a minimum. Crucially, such LLMs need to ground their generations in any feedback obtained to reliably achieve the desired outcomes. We propose an end-to-end reinforcement learning method for teaching models to leverage execution feedback in the realm of code synthesis, where state-of-the-art LLMs struggle to improve code iteratively compared to independent sampling. We benchmark on competitive programming tasks, where we achieve new state-of-the-art results with both small (8B parameters) and large (70B) models while reducing the amount of samples required by an order of magnitude. Our analysis of inference-time behavior demonstrates that our method produces LLMs that effectively leverage automatic feedback over multiple steps. 
</p> </div> </dd> <dt> <a name='item201'>[201]</a> <a href ="/abs/2410.05193" title="Abstract" id="2410.05193"> arXiv:2410.05193 </a> (replaced) [<a href="/pdf/2410.05193" title="Download PDF" id="pdf-2410.05193" aria-labelledby="pdf-2410.05193">pdf</a>, <a href="https://arxiv.org/html/2410.05193v2" title="View HTML" id="html-2410.05193" aria-labelledby="html-2410.05193" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.05193" title="Other formats" id="oth-2410.05193" aria-labelledby="oth-2410.05193">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RevisEval: Improving LLM-as-a-Judge via Response-Adapted References </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qiyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yufei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=YU,+T">Tiezheng YU</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yuxin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liangyou Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+L">Lifeng Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruiming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+F">Fuyuan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chen Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> With significant efforts in recent studies, 
LLM-as-a-Judge has become a cost-effective alternative to human evaluation for assessing text generation quality in a wide range of tasks. However, there still remains a reliability gap between LLM-as-a-Judge and human evaluation. One important reason is the lack of guided oracles in the evaluation process. Motivated by the role of reference pervasively used in classic text evaluation, we introduce RevisEval, a novel text generation evaluation paradigm via the response-adapted references. RevisEval is driven by the key observation that an ideal reference should maintain the necessary relevance to the response to be evaluated. Specifically, RevisEval leverages the text revision capabilities of large language models (LLMs) to adaptively revise the response, then treat the revised text as the reference (response-adapted reference) for the subsequent evaluation. Extensive experiments demonstrate that RevisEval outperforms traditional reference-free and reference-based evaluation paradigms that use LLM-as-a-Judge across NLG tasks and open-ended instruction-following tasks. More importantly, our response-adapted references can further boost the classical text metrics, e.g., BLEU and BERTScore, compared to traditional references and even rival the LLM-as-a-Judge. A detailed analysis is also conducted to confirm RevisEval's effectiveness in bias reduction, the impact of inference cost, and reference relevance. 
</p> </div> </dd> <dt> <a name='item202'>[202]</a> <a href ="/abs/2410.07745" title="Abstract" id="2410.07745"> arXiv:2410.07745 </a> (replaced) [<a href="/pdf/2410.07745" title="Download PDF" id="pdf-2410.07745" aria-labelledby="pdf-2410.07745">pdf</a>, <a href="/format/2410.07745" title="Other formats" id="oth-2410.07745" aria-labelledby="oth-2410.07745">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> StepTool: Enhancing Multi-Step Tool Usage in LLMs through Step-Grained Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yuanqing Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhefan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+W">Weizhi Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhiqiang Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Ongoing Work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite powerful text generation capabilities, large language models (LLMs) still need to learn how to utilize external tools to solve complex tasks, a process known as tool learning. Existing methods primarily rely on supervised fine-tuning to enhance tool-use capabilities, treating tool learning as a text-generation task while overlooking the decision-making complexities inherent in multi-step contexts. 
In this work, we propose modeling tool learning as a dynamic decision-making task and introduce StepTool, a novel step-grained reinforcement learning framework that enhances the multi-step tool use capabilities of LLMs. StepTool consists of two main components: Step-grained Reward Shaping, which assigns rewards at each tool interaction based on the success of tool invocation and its contribution to the task; and Step-grained Optimization, which uses policy gradient methods to optimize the model in a multi-step manner. Experimental results demonstrate that StepTool significantly outperforms existing methods in multi-step, tool-based tasks, offering a robust solution for tool learning. </p> </div> </dd> <dt> <a name='item203'>[203]</a> <a href ="/abs/2410.08115" title="Abstract" id="2410.08115"> arXiv:2410.08115 </a> (replaced) [<a href="/pdf/2410.08115" title="Download PDF" id="pdf-2410.08115" aria-labelledby="pdf-2410.08115">pdf</a>, <a href="https://arxiv.org/html/2410.08115v2" title="View HTML" id="html-2410.08115" aria-labelledby="html-2410.08115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.08115" title="Other formats" id="oth-2410.08115" aria-labelledby="oth-2410.08115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Optima: Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Weize Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiarui Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+C">Chen Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+C">Cheng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Model (LLM) based multi-agent systems (MAS) show remarkable potential in collaborative problem-solving, yet they still face critical challenges: low communication efficiency, poor scalability, and a lack of effective parameter-updating optimization methods. We present Optima, a novel framework that addresses these issues by significantly enhancing both communication efficiency and task effectiveness in LLM-based MAS through LLM training. Optima employs an iterative generate, rank, select, and train paradigm with a reward function balancing task performance, token efficiency, and communication readability. We explore various RL algorithms, including Supervised Fine-Tuning, Direct Preference Optimization, and their hybrid approaches, providing insights into their effectiveness-efficiency trade-offs. We integrate Monte Carlo Tree Search-inspired techniques for DPO data generation, treating conversation turns as tree nodes to explore diverse interaction paths. Evaluated on common multi-agent tasks, including information-asymmetric question answering and complex reasoning, Optima shows consistent and substantial improvements over single-agent baselines and vanilla MAS based on Llama 3 8B, achieving up to 2.8x performance gain with less than 10\% tokens on tasks requiring heavy information exchange. Moreover, Optima's efficiency gains open new possibilities for leveraging inference-compute more effectively, leading to improved inference-time scaling laws. 
By addressing fundamental challenges in LLM-based MAS, Optima shows the potential towards scalable, efficient, and effective MAS (<a href="https://chenweize1998.github.io/optima-project-page" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). </p> </div> </dd> <dt> <a name='item204'>[204]</a> <a href ="/abs/2410.11006" title="Abstract" id="2410.11006"> arXiv:2410.11006 </a> (replaced) [<a href="/pdf/2410.11006" title="Download PDF" id="pdf-2410.11006" aria-labelledby="pdf-2410.11006">pdf</a>, <a href="https://arxiv.org/html/2410.11006v2" title="View HTML" id="html-2410.11006" aria-labelledby="html-2410.11006" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.11006" title="Other formats" id="oth-2410.11006" aria-labelledby="oth-2410.11006">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Effective Self-Mining of In-Context Examples for Unsupervised Machine Translation with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mekki,+A+E">Abdellah El Mekki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdul-Mageed,+M">Muhammad Abdul-Mageed</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive performance on a wide range of natural language processing (NLP) tasks, primarily through in-context learning (ICL). In ICL, the LLM is provided with examples that represent a given task such that it learns to generate answers for test inputs. However, access to these in-context examples is not guaranteed especially for low-resource or massively multilingual tasks. 
In this work, we propose an unsupervised approach to mine in-context examples for machine translation (MT), enabling unsupervised MT (UMT) across different languages. Our approach begins with word-level mining to acquire word translations that are then used to perform sentence-level mining. As the quality of mined parallel pairs may not be optimal due to noise or mistakes, we introduce a filtering criterion to select the optimal in-context examples from a pool of unsupervised parallel sentences. We evaluate our approach using two multilingual LLMs on 288 directions from the FLORES-200 dataset and analyze the impact of various linguistic features on performance. Our findings demonstrate the effectiveness of our unsupervised approach in mining in-context examples for MT, leading to better or comparable translation performance as translation with regular in-context samples (extracted from human-annotated data), while also outperforming the other state-of-the-art UMT methods by an average of $7$ BLEU points. 
</p> </div> </dd> <dt> <a name='item205'>[205]</a> <a href ="/abs/2410.11086" title="Abstract" id="2410.11086"> arXiv:2410.11086 </a> (replaced) [<a href="/pdf/2410.11086" title="Download PDF" id="pdf-2410.11086" aria-labelledby="pdf-2410.11086">pdf</a>, <a href="https://arxiv.org/html/2410.11086v3" title="View HTML" id="html-2410.11086" aria-labelledby="html-2410.11086" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.11086" title="Other formats" id="oth-2410.11086" aria-labelledby="oth-2410.11086">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> JOOCI: a Framework for Learning Comprehensive Speech Representations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yadav,+H">Hemant Yadav</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+R+R">Rajiv Ratn Shah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sitaram,+S">Sunayana Sitaram</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Information in speech can be categorized into two groups: Content (what is being said, such as linguistics) and Other (how it is expressed such as information about speaker and paralinguistic features). Current self-supervised learning (SSL) methods are shown to divide the model's representational-depth or layers in two, with earlier layers specializing in Other and later layers in Content related tasks. This layer-wise division is inherently sub-optimal, as neither information type can use all layers to build hierarchical representations. 
To address this, we propose JOOCI, a novel speech representation learning method that does not compromise on the representational-depth for either information type. JOOCI outperforms WavLM by 26.5%, and other models of similar size (100M parameters), when evaluated on two speaker recognition and two language tasks from the SUPERB benchmark, demonstrating its effectiveness in Jointly Optimizing Other and Content Information (JOOCI). </p> </div> </dd> <dt> <a name='item206'>[206]</a> <a href ="/abs/2410.12499" title="Abstract" id="2410.12499"> arXiv:2410.12499 </a> (replaced) [<a href="/pdf/2410.12499" title="Download PDF" id="pdf-2410.12499" aria-labelledby="pdf-2410.12499">pdf</a>, <a href="https://arxiv.org/html/2410.12499v2" title="View HTML" id="html-2410.12499" aria-labelledby="html-2410.12499" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12499" title="Other formats" id="oth-2410.12499" aria-labelledby="oth-2410.12499">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> With a Grain of SALT: Are LLMs Fair Across Social Dimensions? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Arif,+S">Samee Arif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+Z">Zohaib Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaleem,+M">Maaidah Kaleem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rashid,+S">Suhaib Rashid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raza,+A+A">Agha Ali Raza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Athar,+A">Awais Athar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper presents a systematic analysis of biases in open-source Large Language Models (LLMs), across gender, religion, and race. Our study evaluates bias in smaller-scale Llama and Gemma models using the SALT ($\textbf{S}$ocial $\textbf{A}$ppropriateness in $\textbf{L}$LM-Generated $\textbf{T}$ext) dataset, which incorporates five distinct bias triggers: General Debate, Positioned Debate, Career Advice, Problem Solving, and CV Generation. To quantify bias, we measure win rates in General Debate and the assignment of negative roles in Positioned Debate. For real-world use cases, such as Career Advice, Problem Solving, and CV Generation, we anonymize the outputs to remove explicit demographic identifiers and use DeepSeek-R1 as an automated evaluator. We also address inherent biases in LLM-based evaluation, including evaluation bias, positional bias, and length bias, and validate our results through human evaluations. Our findings reveal consistent polarization across models, with certain demographic groups receiving systematically favorable or unfavorable treatment. 
By introducing SALT, we provide a comprehensive benchmark for bias analysis and underscore the need for robust bias mitigation strategies in the development of equitable AI systems. </p> </div> </dd> <dt> <a name='item207'>[207]</a> <a href ="/abs/2410.12866" title="Abstract" id="2410.12866"> arXiv:2410.12866 </a> (replaced) [<a href="/pdf/2410.12866" title="Download PDF" id="pdf-2410.12866" aria-labelledby="pdf-2410.12866">pdf</a>, <a href="https://arxiv.org/html/2410.12866v2" title="View HTML" id="html-2410.12866" aria-labelledby="html-2410.12866" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12866" title="Other formats" id="oth-2410.12866" aria-labelledby="oth-2410.12866">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Homogeneous Lexical Tone Decoding from Heterogeneous Intracranial Recordings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Di Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Siyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chen Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+L">Lu Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sawan,+M">Mohamad Sawan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR2025 Poster (Preprint V2) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS); Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Recent advancements in brain-computer 
interfaces (BCIs) have enabled the decoding of lexical tones from intracranial recordings, offering the potential to restore the communication abilities of speech-impaired tonal language speakers. However, data heterogeneity induced by both physiological and instrumental factors poses a significant challenge for unified invasive brain tone decoding. Traditional subject-specific models, which operate under a heterogeneous decoding paradigm, fail to capture generalized neural representations and cannot effectively leverage data across subjects. To address these limitations, we introduce Homogeneity-Heterogeneity Disentangled Learning for neural Representations (H2DiLR), a novel framework that disentangles and learns both the homogeneity and heterogeneity from intracranial recordings across multiple subjects. To evaluate H2DiLR, we collected stereoelectroencephalography (sEEG) data from multiple participants reading Mandarin materials comprising 407 syllables, representing nearly all Mandarin characters. Extensive experiments demonstrate that H2DiLR, as a unified decoding paradigm, significantly outperforms the conventional heterogeneous decoding approach. Furthermore, we empirically confirm that H2DiLR effectively captures both homogeneity and heterogeneity during neural representation learning. 
</p> </div> </dd> <dt> <a name='item208'>[208]</a> <a href ="/abs/2410.14157" title="Abstract" id="2410.14157"> arXiv:2410.14157 </a> (replaced) [<a href="/pdf/2410.14157" title="Download PDF" id="pdf-2410.14157" aria-labelledby="pdf-2410.14157">pdf</a>, <a href="https://arxiv.org/html/2410.14157v3" title="View HTML" id="html-2410.14157" aria-labelledby="html-2410.14157" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14157" title="Other formats" id="oth-2410.14157" aria-labelledby="oth-2410.14157">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Autoregression: Discrete Diffusion for Complex Reasoning and Planning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jiacheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jiahui Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+S">Shansan Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+L">Lin Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingpeng Kong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Autoregressive language models, despite their impressive capabilities, struggle with complex reasoning and long-term planning tasks. We introduce discrete diffusion models as a novel solution to these challenges. 
Through the lens of subgoal imbalance, we demonstrate how diffusion models effectively learn difficult subgoals that elude autoregressive approaches. We propose Multi-Granularity Diffusion Modeling (MGDM), which prioritizes subgoals based on difficulty during learning. On complex tasks like Countdown, Sudoku, and Boolean Satisfiability Problems, MGDM significantly outperforms autoregressive models without using search techniques. For instance, MGDM achieves 91.5\% and 100\% accuracy on Countdown and Sudoku, respectively, compared to 45.8\% and 20.7\% for autoregressive models. Our work highlights the potential of diffusion-based approaches in advancing AI capabilities for sophisticated language understanding and problem-solving tasks. All associated codes are available at \href{<a href="https://github.com/HKUNLP/diffusion-vs-ar" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/HKUNLP/diffusion-vs-ar" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item209'>[209]</a> <a href ="/abs/2410.15761" title="Abstract" id="2410.15761"> arXiv:2410.15761 </a> (replaced) [<a href="/pdf/2410.15761" title="Download PDF" id="pdf-2410.15761" aria-labelledby="pdf-2410.15761">pdf</a>, <a href="/format/2410.15761" title="Other formats" id="oth-2410.15761" aria-labelledby="oth-2410.15761">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Optimal Query Allocation in Extractive QA with LLMs: A Learning-to-Defer Framework with Theoretical Guarantees </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Montreuil,+Y">Yannis Montreuil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yeo,+S+H">Shu Heng Yeo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carlier,+A">Axel Carlier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+L+X">Lai Xing Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ooi,+W+T">Wei Tsang Ooi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 25 pages, 17 main paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG); Machine Learning (stat.ML) </div> <p class='mathjax'> Large Language Models excel in generative tasks but exhibit inefficiencies in structured text selection, particularly in extractive question answering. This challenge is magnified in resource-constrained environments, where deploying multiple specialized models for different tasks is impractical. We propose a Learning-to-Defer framework that allocates queries to specialized experts, ensuring high-confidence predictions while optimizing computational efficiency. 
Our approach integrates a principled allocation strategy with theoretical guarantees on optimal deferral that balances performance and cost. Empirical evaluations on SQuADv1, SQuADv2, and TriviaQA demonstrate that our method enhances answer reliability while significantly reducing computational overhead, making it well-suited for scalable and efficient EQA deployment. </p> </div> </dd> <dt> <a name='item210'>[210]</a> <a href ="/abs/2410.15939" title="Abstract" id="2410.15939"> arXiv:2410.15939 </a> (replaced) [<a href="/pdf/2410.15939" title="Download PDF" id="pdf-2410.15939" aria-labelledby="pdf-2410.15939">pdf</a>, <a href="https://arxiv.org/html/2410.15939v2" title="View HTML" id="html-2410.15939" aria-labelledby="html-2410.15939" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15939" title="Other formats" id="oth-2410.15939" aria-labelledby="oth-2410.15939">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CausalGraph2LLM: Evaluating LLMs for Causal Queries </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sheth,+I">Ivaxi Sheth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fatemi,+B">Bahare Fatemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fritz,+M">Mario Fritz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL'25 Findings, Code - <a href="https://github.com/ivaxi0s/CausalGraph2LLM" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Causality is essential in scientific research, enabling researchers to interpret true relationships between variables. 
These causal relationships are often represented by causal graphs, which are directed acyclic graphs. With the recent advancements in Large Language Models (LLMs), there is an increasing interest in exploring their capabilities in causal reasoning and their potential use to hypothesize causal graphs. These tasks necessitate the LLMs to encode the causal graph effectively for subsequent downstream tasks. In this paper, we introduce CausalGraph2LLM, a comprehensive benchmark comprising over 700k queries across diverse causal graph settings to evaluate the causal reasoning capabilities of LLMs. We categorize the causal queries into two types: graph-level and node-level queries. We benchmark both open-sourced and proprietary models for our study. Our findings reveal that while LLMs show promise in this domain, they are highly sensitive to the encoding used. Even capable models like GPT-4 and Gemini-1.5 exhibit sensitivity to encoding, with deviations of about $60\%$. We further demonstrate this sensitivity for downstream causal intervention tasks. Moreover, we observe that LLMs can often display biases when presented with contextual information about a causal graph, potentially stemming from their parametric memory. 
</p> </div> </dd> <dt> <a name='item211'>[211]</a> <a href ="/abs/2410.17714" title="Abstract" id="2410.17714"> arXiv:2410.17714 </a> (replaced) [<a href="/pdf/2410.17714" title="Download PDF" id="pdf-2410.17714" aria-labelledby="pdf-2410.17714">pdf</a>, <a href="https://arxiv.org/html/2410.17714v2" title="View HTML" id="html-2410.17714" aria-labelledby="html-2410.17714" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.17714" title="Other formats" id="oth-2410.17714" aria-labelledby="oth-2410.17714">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CogSteer: Cognition-Inspired Selective Layer Intervention for Efficiently Steering Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xintong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+J">Jingheng Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+L">Liang Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Longyue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+L">Longqin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xingshan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Biemann,+C">Chris Biemann</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) achieve remarkable performance through pretraining on extensive data. This enables efficient adaptation to diverse downstream tasks. However, the lack of interpretability in their underlying mechanisms limits the ability to effectively steer LLMs for specific applications. 
In this work, we investigate the intrinsic mechanisms of LLMs from a cognitive perspective using eye movement measures. Specifically, we analyze the layer-wise correlation between human cognitive indicators and LLM representations. Building on these insights, we propose a heuristic approach for selecting the optimal steering layer to modulate LLM semantics. To this end, we introduce an efficient selective layer intervention based on prominent parameter-efficient fine-tuning methods, which conventionally adjust either all layers or only the final layer. Additionally, we present an implicit layer contrastive intervention during inference to steer LLMs away from toxic outputs. Extensive experiments on natural language understanding, reasoning, and generation tasks, conducted on GPT-2, LLaMa2-7B, and Mixtral-7B, demonstrate the effectiveness and efficiency of our approach. As a model-agnostic framework, it enhances the interpretability of LLMs while improving efficiency for safe deployment. 
</p> </div> </dd> <dt> <a name='item212'>[212]</a> <a href ="/abs/2410.21545" title="Abstract" id="2410.21545"> arXiv:2410.21545 </a> (replaced) [<a href="/pdf/2410.21545" title="Download PDF" id="pdf-2410.21545" aria-labelledby="pdf-2410.21545">pdf</a>, <a href="/format/2410.21545" title="Other formats" id="oth-2410.21545" aria-labelledby="oth-2410.21545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CARMO: Dynamic Criteria Generation for Context-Aware Reward Modelling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+T">Taneesh Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shandilya,+S">Shivam Shandilya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuchao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Madhavan,+R">Rahul Madhavan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghosh,+S">Supriyo Ghosh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bansal,+C">Chetan Bansal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+H">Huaxiu Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rajmohan,+S">Saravan Rajmohan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reward modeling in large language models is susceptible to reward hacking, causing models to latch onto superficial features such as the tendency to generate lists or unnecessarily long responses. In reinforcement learning from human feedback (RLHF) and more generally during post-training, flawed reward signals often lead to outputs that optimize for these spurious correlates instead of genuine quality or correctness. 
We propose Context-Aware Reward Modeling (CARMO), a novel approach that first generates dynamic, context-relevant criteria to ground the reward model before producing reward scores. Unlike prior methods that rely on static rubrics, CARMO leverages large language models (LLMs) to adaptively create evaluation criteria such as logical consistency, clarity, and depth tailored to the user query. Our theoretical analysis shows that such criteria generation can mitigate reward hacking. We further demonstrate that CARMO can be distilled into smaller models, reducing the computational cost of alignment. We establish a new state-of-the-art performance in zero-shot settings for generative models, achieving a 2.1\% improvement on Reward Bench. Furthermore, alignment performed on the CARMO-curated preference dataset achieves 22.5\% and 21.1\% LC-WR and WR, respectively, on Mistral-Base (7B). </p> </div> </dd> <dt> <a name='item213'>[213]</a> <a href ="/abs/2410.22071" title="Abstract" id="2410.22071"> arXiv:2410.22071 </a> (replaced) [<a href="/pdf/2410.22071" title="Download PDF" id="pdf-2410.22071" aria-labelledby="pdf-2410.22071">pdf</a>, <a href="https://arxiv.org/html/2410.22071v2" title="View HTML" id="html-2410.22071" aria-labelledby="html-2410.22071" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.22071" title="Other formats" id="oth-2410.22071" aria-labelledby="oth-2410.22071">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Distinguishing Ignorance from Error in LLM Hallucinations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Simhi,+A">Adi Simhi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Herzig,+J">Jonathan Herzig</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Szpektor,+I">Idan Szpektor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a></div> 
<div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) are susceptible to hallucinations -- factually incorrect outputs -- leading to a large body of work on detecting and mitigating such cases. We argue that it is important to distinguish between two types of hallucinations: ones where the model does not hold the correct answer in its parameters, which we term HK-, and ones where the model answers incorrectly despite having the required knowledge, termed HK+. We first find that HK+ hallucinations are prevalent and occur across models and datasets. Then, we demonstrate that distinguishing between these two cases is beneficial for mitigating hallucinations. Importantly, we show that different models hallucinate on different examples, which motivates constructing model-specific hallucination datasets for training detectors. Overall, our findings draw attention to classifying types of hallucinations and provide means to handle them more effectively. The code is available at <a href="https://github.com/technion-cs-nlp/hallucination-mitigation" rel="external noopener nofollow" class="link-external link-https">this https URL</a> . 
</p> </div> </dd> <dt> <a name='item214'>[214]</a> <a href ="/abs/2411.01077" title="Abstract" id="2411.01077"> arXiv:2411.01077 </a> (replaced) [<a href="/pdf/2411.01077" title="Download PDF" id="pdf-2411.01077" aria-labelledby="pdf-2411.01077">pdf</a>, <a href="https://arxiv.org/html/2411.01077v2" title="View HTML" id="html-2411.01077" aria-labelledby="html-2411.01077" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.01077" title="Other formats" id="oth-2411.01077" aria-labelledby="oth-2411.01077">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Emoji Attack: Enhancing Jailbreak Attacks Against Judge LLM Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhipeng Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Erichson,+N+B">N. Benjamin Erichson</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Jailbreaking techniques trick Large Language Models (LLMs) into producing restricted outputs, posing a serious threat. One line of defense is to use another LLM as a Judge to evaluate the harmfulness of generated text. However, we reveal that these Judge LLMs are vulnerable to token segmentation bias, an issue that arises when delimiters alter the tokenization process, splitting words into smaller sub-tokens. This disrupts the embeddings of the entire sequence, reducing detection accuracy and allowing harmful content to be misclassified as safe. In this paper, we introduce Emoji Attack, a novel strategy that amplifies existing jailbreak prompts by exploiting token segmentation bias. 
Our method leverages in-context learning to systematically insert emojis into text before it is evaluated by a Judge LLM, inducing embedding distortions that significantly lower the likelihood of detecting unsafe content. Unlike traditional delimiters, emojis also introduce semantic ambiguity, making them particularly effective in this attack. Through experiments on state-of-the-art Judge LLMs, we demonstrate that Emoji Attack substantially reduces the "unsafe" prediction rate, bypassing existing safeguards. </p> </div> </dd> <dt> <a name='item215'>[215]</a> <a href ="/abs/2411.01281" title="Abstract" id="2411.01281"> arXiv:2411.01281 </a> (replaced) [<a href="/pdf/2411.01281" title="Download PDF" id="pdf-2411.01281" aria-labelledby="pdf-2411.01281">pdf</a>, <a href="https://arxiv.org/html/2411.01281v2" title="View HTML" id="html-2411.01281" aria-labelledby="html-2411.01281" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.01281" title="Other formats" id="oth-2411.01281" aria-labelledby="oth-2411.01281">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Varco Arena: A Tournament Approach to Reference-Free Benchmarking Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Son,+S">Seonil Son</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oh,+J">Ju-Min Oh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+H">Heegon Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jang,+C">Cheolhun Jang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jeong,+J">Jeongbeom Jeong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+K">Kuntae Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages for main body, 17 pages in total </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> 
<span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Most existing benchmarking approaches for evaluating the output quality of large language models (LLMs) rely on comparing LLM responses to predefined references. Such methods, based on static datasets, quickly become outdated as LLM capabilities and use cases evolve. In this work, we introduce VARCO Arena--a novel, cost-effective, and robust benchmarking approach that leverages a single-elimination tournament structure to minimize the number of required comparisons while eliminating the need for static references or costly human annotations. We validate our approach through two experiments: (i) a simulation study that examines its robustness under various conditions, and (ii) an empirical evaluation using publicly available benchmark prompts. In both experiments, VARCO Arena consistently outperforms current LLM benchmarking practices, achieving stronger correlations with human-established Elo ratings. Our results demonstrate that VARCO Arena not only produces reliable LLM rankings but also provides a scalable, adaptable solution for qualitative evaluation across diverse, customized use cases. 
</p> </div> </dd> <dt> <a name='item216'>[216]</a> <a href ="/abs/2411.07320" title="Abstract" id="2411.07320"> arXiv:2411.07320 </a> (replaced) [<a href="/pdf/2411.07320" title="Download PDF" id="pdf-2411.07320" aria-labelledby="pdf-2411.07320">pdf</a>, <a href="https://arxiv.org/html/2411.07320v2" title="View HTML" id="html-2411.07320" aria-labelledby="html-2411.07320" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07320" title="Other formats" id="oth-2411.07320" aria-labelledby="oth-2411.07320">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Richer Output for Richer Countries: Uncovering Geographical Disparities in Generated Stories and Travel Recommendations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhagat,+K">Kirti Bhagat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vasisht,+K">Kinshuk Vasisht</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pruthi,+D">Danish Pruthi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Findings of NAACL (2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Machine Learning (cs.LG) </div> <p class='mathjax'> While a large body of work inspects language models for biases concerning gender, race, occupation and religion, biases of geographical nature are relatively less explored. Some recent studies benchmark the degree to which large language models encode geospatial knowledge. However, the impact of the encoded geographical knowledge (or lack thereof) on real-world applications has not been documented. 
In this work, we examine large language models for two common scenarios that require geographical knowledge: (a) travel recommendations and (b) geo-anchored story generation. Specifically, we study five popular language models, and across about $100$K travel requests, and $200$K story generations, we observe that travel recommendations corresponding to poorer countries are less unique with fewer location references, and stories from these regions more often convey emotions of hardship and sadness compared to those from wealthier nations. </p> </div> </dd> <dt> <a name='item217'>[217]</a> <a href ="/abs/2411.09109" title="Abstract" id="2411.09109"> arXiv:2411.09109 </a> (replaced) [<a href="/pdf/2411.09109" title="Download PDF" id="pdf-2411.09109" aria-labelledby="pdf-2411.09109">pdf</a>, <a href="https://arxiv.org/html/2411.09109v2" title="View HTML" id="html-2411.09109" aria-labelledby="html-2411.09109" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.09109" title="Other formats" id="oth-2411.09109" aria-labelledby="oth-2411.09109">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Personalized Help for Optimizing Low-Skilled Users' Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+F">Feng Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wongkamjan,+W">Wichayaporn Wongkamjan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kummerfeld,+J+K">Jonathan K. 
Kummerfeld</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peskoff,+D">Denis Peskoff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=May,+J">Jonathan May</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boyd-Graber,+J">Jordan Boyd-Graber</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> AIs can beat humans in game environments; however, how helpful those agents are to humans remains understudied. We augment CICERO, a natural language agent that demonstrates superhuman performance in Diplomacy, to generate both move and message advice based on player intentions. A dozen Diplomacy games with novice and experienced players, with varying advice settings, show that some of the generated advice is beneficial. It helps novices compete with experienced players and in some instances even surpass them. The mere presence of advice can be advantageous, even if players do not follow it. </p> </div> </dd> <dt> <a name='item218'>[218]</a> <a href ="/abs/2411.15175" title="Abstract" id="2411.15175"> arXiv:2411.15175 </a> (replaced) [<a href="/pdf/2411.15175" title="Download PDF" id="pdf-2411.15175" aria-labelledby="pdf-2411.15175">pdf</a>, <a href="https://arxiv.org/html/2411.15175v3" title="View HTML" id="html-2411.15175" aria-labelledby="html-2411.15175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.15175" title="Other formats" id="oth-2411.15175" aria-labelledby="oth-2411.15175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ToxiLab: How Well Do Open-Source LLMs Generate Synthetic Toxicity Data? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hui,+Z">Zheng Hui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhaoxiao Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Hang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+J">Juanyong Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+L">Lin Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hirschberg,+J">Julia Hirschberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Congrui Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Effective toxic content detection relies heavily on high-quality and diverse data, which serve as the foundation for robust content moderation models. Synthetic data has become a common approach for training models across various NLP tasks. However, its effectiveness remains uncertain for highly subjective tasks like hate speech detection, with previous research yielding mixed results. This study explores the potential of open-source LLMs for harmful data synthesis, utilizing controlled prompting and supervised fine-tuning techniques to enhance data quality and diversity. We systematically evaluated 6 open source LLMs on 5 datasets, assessing their ability to generate diverse, high-quality harmful data while minimizing hallucination and duplication. Our results show that Mistral consistently outperforms other open models, and supervised fine-tuning significantly enhances data reliability and diversity. 
We further analyze the trade-offs between prompt-based vs. fine-tuned toxic data synthesis, discuss real-world deployment challenges, and highlight ethical considerations. Our findings demonstrate that fine-tuned open source LLMs provide scalable and cost-effective solutions to augment toxic content detection datasets, paving the way for more accessible and transparent content moderation tools. </p> </div> </dd> <dt> <a name='item219'>[219]</a> <a href ="/abs/2411.16365" title="Abstract" id="2411.16365"> arXiv:2411.16365 </a> (replaced) [<a href="/pdf/2411.16365" title="Download PDF" id="pdf-2411.16365" aria-labelledby="pdf-2411.16365">pdf</a>, <a href="https://arxiv.org/html/2411.16365v3" title="View HTML" id="html-2411.16365" aria-labelledby="html-2411.16365" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.16365" title="Other formats" id="oth-2411.16365" aria-labelledby="oth-2411.16365">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-modal Retrieval Augmented Multi-modal Generation: Datasets, Evaluation Metrics and Strong Baselines </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zi-Ao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+T">Tian Lan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+R">Rong-Cheng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yong Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yu-Shi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heyan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+X">Xian-Ling Mao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language 
(cs.CL)</span> </div> <p class='mathjax'> We present a systematic investigation of Multi-modal Retrieval Augmented Multi-modal Generation (M$^2$RAG), a novel task that enables foundation models to process multi-modal web content and generate multi-modal responses, which exhibits better information density and readability. Despite its potential impact, M$^2$RAG remains understudied, lacking comprehensive analysis and high-quality data resources. To address this gap, we establish a comprehensive benchmark through a rigorous data curation pipeline, and employ text-modal metrics and multi-modal metrics based on foundation models for evaluation. We further propose several strategies for foundation models to process M$^2$RAG effectively and construct a training set by filtering high-quality samples using designed metrics. Our extensive experiments demonstrate the reliability of our proposed metrics, a landscape of model performance within our designed strategies, and show that our fine-tuned 7B-8B models outperform the state-of-the-art GPT-4o model. Additionally, we perform fine-grained analyses across diverse domains and validate the effectiveness of our designs in data curation pipeline. All resources, including codes, datasets, and model weights, will be publicly released. 
</p> </div> </dd> <dt> <a name='item220'>[220]</a> <a href ="/abs/2412.04726" title="Abstract" id="2412.04726"> arXiv:2412.04726 </a> (replaced) [<a href="/pdf/2412.04726" title="Download PDF" id="pdf-2412.04726" aria-labelledby="pdf-2412.04726">pdf</a>, <a href="https://arxiv.org/html/2412.04726v2" title="View HTML" id="html-2412.04726" aria-labelledby="html-2412.04726" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.04726" title="Other formats" id="oth-2412.04726" aria-labelledby="oth-2412.04726">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BESSTIE: A Benchmark for Sentiment and Sarcasm Classification for Varieties of English </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dipankar">Dipankar Srirag</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joshi,+A">Aditya Joshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Painter,+J">Jordan Painter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanojia,+D">Diptesh Kanojia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Despite large language models (LLMs) being known to exhibit bias against non-mainstream varieties, there are no known labeled datasets for sentiment analysis of English. To address this gap, we introduce BESSTIE, a benchmark for sentiment and sarcasm classification for three varieties of English: Australian (en-AU), Indian (en-IN), and British (en-UK). Using web-based content from two domains, namely, Google Place reviews and Reddit comments, we collect datasets for these language varieties using two methods: location-based and topic-based filtering. 
Native speakers of the language varieties manually annotate the datasets with sentiment and sarcasm labels. To assess whether the dataset accurately represents these varieties, we conduct two validation steps: (a) manual annotation of language varieties and (b) automatic language variety prediction. Subsequently, we fine-tune nine large language models (LLMs) (representing a range of encoder/decoder and mono/multilingual models) on these datasets, and evaluate their performance on the two tasks. Our results reveal that the models consistently perform better on inner-circle varieties (i.e., en-AU and en-UK), with significant performance drops for en-IN, particularly in sarcasm detection. We also report challenges in cross-variety generalisation, highlighting the need for language variety-specific datasets such as ours. BESSTIE promises to be a useful evaluative benchmark for future research in equitable LLMs, specifically in terms of language varieties. The BESSTIE datasets, code, and models will be publicly available upon acceptance. 
</p> </div> </dd> <dt> <a name='item221'>[221]</a> <a href ="/abs/2412.11936" title="Abstract" id="2412.11936"> arXiv:2412.11936 </a> (replaced) [<a href="/pdf/2412.11936" title="Download PDF" id="pdf-2412.11936" aria-labelledby="pdf-2412.11936">pdf</a>, <a href="/format/2412.11936" title="Other formats" id="oth-2412.11936" aria-labelledby="oth-2412.11936">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of Mathematical Reasoning in the Era of Multimodal Large Language Model: Benchmark, Method & Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yibo Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jiamin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Jianxiang He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+F">Fangteng Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xu Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yuanhuiyi Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Kun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Q">Qingsong Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Mathematical reasoning, a core aspect of human cognition, is vital across many domains, from educational problem-solving to scientific advancements. As artificial general intelligence (AGI) progresses, integrating large language models (LLMs) with mathematical reasoning tasks is becoming increasingly significant. 
This survey provides the first comprehensive analysis of mathematical reasoning in the era of multimodal large language models (MLLMs). We review over 200 studies published since 2021, and examine the state-of-the-art developments in Math-LLMs, with a focus on multimodal settings. We categorize the field into three dimensions: benchmarks, methodologies, and challenges. In particular, we explore multimodal mathematical reasoning pipeline, as well as the role of (M)LLMs and the associated methodologies. Finally, we identify five major challenges hindering the realization of AGI in this domain, offering insights into the future direction for enhancing multimodal reasoning capabilities. This survey serves as a critical resource for the research community in advancing the capabilities of LLMs to tackle complex multimodal reasoning tasks. </p> </div> </dd> <dt> <a name='item222'>[222]</a> <a href ="/abs/2412.12509" title="Abstract" id="2412.12509"> arXiv:2412.12509 </a> (replaced) [<a href="/pdf/2412.12509" title="Download PDF" id="pdf-2412.12509" aria-labelledby="pdf-2412.12509">pdf</a>, <a href="https://arxiv.org/html/2412.12509v2" title="View HTML" id="html-2412.12509" aria-labelledby="html-2412.12509" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12509" title="Other formats" id="oth-2412.12509" aria-labelledby="oth-2412.12509">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can You Trust LLM Judgments? 
Reliability of LLM-as-a-Judge </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schroeder,+K">Kayla Schroeder</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wood-Doughty,+Z">Zach Wood-Doughty</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have become increasingly powerful and ubiquitous, but their stochastic nature poses challenges to the reliability of their outputs. While deterministic settings can improve consistency, they do not guarantee reliability, as a single sample from the model's probability distribution can still be misleading. Building upon the concept of LLM-as-a-judge, we introduce a novel framework for rigorously evaluating the reliability of LLM judgments, leveraging McDonald's omega. We evaluate the reliability of LLMs when judging the outputs of other LLMs on standard single-turn and multi-turn benchmarks, simultaneously investigating the impact of temperature on reliability. By analyzing these results, we demonstrate the limitations of fixed randomness and the importance of considering multiple samples, which we show has significant implications for downstream applications. Our findings highlight the need for a nuanced understanding of LLM reliability and the potential risks associated with over-reliance on single-shot evaluations. This work provides a crucial step towards building more trustworthy and reliable LLM-based systems and applications. 
</p> </div> </dd> <dt> <a name='item223'>[223]</a> <a href ="/abs/2412.13540" title="Abstract" id="2412.13540"> arXiv:2412.13540 </a> (replaced) [<a href="/pdf/2412.13540" title="Download PDF" id="pdf-2412.13540" aria-labelledby="pdf-2412.13540">pdf</a>, <a href="https://arxiv.org/html/2412.13540v2" title="View HTML" id="html-2412.13540" aria-labelledby="html-2412.13540" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.13540" title="Other formats" id="oth-2412.13540" aria-labelledby="oth-2412.13540">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Benchmarking and Improving Large Vision-Language Models for Fundamental Visual Graph Understanding and Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yingjie Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xuefeng Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+Y">Yang Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jun Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Vision-Language Models (LVLMs) have demonstrated remarkable performance across diverse tasks. Despite great success, recent studies show that LVLMs encounter substantial limitations when engaging with visual graphs. To study the reason behind these limitations, we propose VGCure, a comprehensive benchmark covering 22 tasks for examining the fundamental graph understanding and reasoning capacities of LVLMs. 
Extensive evaluations conducted on 14 LVLMs reveal that LVLMs are weak in basic graph understanding and reasoning tasks, particularly those concerning relational or structurally complex information. Based on this observation, we propose a structure-aware fine-tuning framework to enhance LVLMs with structure learning abilities through three self-supervised learning tasks. Experiments validate the effectiveness of our method in improving LVLMs' performance on fundamental and downstream graph learning tasks, as well as enhancing their robustness against complex visual graphs. </p> </div> </dd> <dt> <a name='item224'>[224]</a> <a href ="/abs/2412.13879" title="Abstract" id="2412.13879"> arXiv:2412.13879 </a> (replaced) [<a href="/pdf/2412.13879" title="Download PDF" id="pdf-2412.13879" aria-labelledby="pdf-2412.13879">pdf</a>, <a href="/format/2412.13879" title="Other formats" id="oth-2412.13879" aria-labelledby="oth-2412.13879">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Crabs: Consuming Resource via Auto-generation for LLM-DoS Attack under Black-box Settings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuanhe Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhenhong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinyue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+X">Xiaojun Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+S">Sen Su</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 8 figures, 11 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated remarkable performance across diverse tasks yet still are vulnerable to external threats, particularly LLM Denial-of-Service (LLM-DoS) attacks. Specifically, LLM-DoS attacks aim to exhaust computational resources and block services. However, existing studies predominantly focus on white-box attacks, leaving black-box scenarios underexplored. In this paper, we introduce Auto-Generation for LLM-DoS (AutoDoS) attack, an automated algorithm designed for black-box LLMs. AutoDoS constructs the DoS Attack Tree and expands the node coverage to achieve effectiveness under black-box conditions. By transferability-driven iterative optimization, AutoDoS could work across different models in one prompt. Furthermore, we reveal that embedding the Length Trojan allows AutoDoS to bypass existing defenses more effectively. Experimental results show that AutoDoS significantly amplifies service response latency by over 250$\times\uparrow$, leading to severe resource consumption in terms of GPU utilization and memory usage. Our work provides a new perspective on LLM-DoS attacks and security defenses. Our code is available at <a href="https://github.com/shuita2333/AutoDoS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item225'>[225]</a> <a href ="/abs/2412.15628" title="Abstract" id="2412.15628"> arXiv:2412.15628 </a> (replaced) [<a href="/pdf/2412.15628" title="Download PDF" id="pdf-2412.15628" aria-labelledby="pdf-2412.15628">pdf</a>, <a href="https://arxiv.org/html/2412.15628v3" title="View HTML" id="html-2412.15628" aria-labelledby="html-2412.15628" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.15628" title="Other formats" id="oth-2412.15628" aria-labelledby="oth-2412.15628">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Input Attributions Interpret the Inductive Reasoning Process in In-Context Learning? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+M">Mengyu Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kobayashi,+G">Goro Kobayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Suzuki,+J">Jun Suzuki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Interpreting the internal process of neural models has long been a challenge. This challenge remains relevant in the era of large language models (LLMs) and in-context learning (ICL); for example, ICL poses a new issue of interpreting which example in the few-shot examples contributed to identifying/solving the task. To this end, in this paper, we design synthetic diagnostic tasks of inductive reasoning, inspired by the generalization tests in linguistics; here, most in-context examples are ambiguous w.r.t. their underlying rule, and one critical example disambiguates the task demonstrated. 
The question is whether conventional input attribution (IA) methods can track such a reasoning process, i.e., identify the influential example, in ICL. Our experiments provide several practical findings; for example, a certain simple IA method works the best, and the larger the model, the generally harder it is to interpret the ICL with gradient-based IA methods. </p> </div> </dd> <dt> <a name='item226'>[226]</a> <a href ="/abs/2412.17395" title="Abstract" id="2412.17395"> arXiv:2412.17395 </a> (replaced) [<a href="/pdf/2412.17395" title="Download PDF" id="pdf-2412.17395" aria-labelledby="pdf-2412.17395">pdf</a>, <a href="https://arxiv.org/html/2412.17395v3" title="View HTML" id="html-2412.17395" aria-labelledby="html-2412.17395" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.17395" title="Other formats" id="oth-2412.17395" aria-labelledby="oth-2412.17395">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WarriorCoder: Learning from Expert Battles to Augment Code Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+H">Huawen Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+P">Pu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Q">Qingfeng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Can Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fangkai Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Q">Qianli Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qingwei Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rajmohan,+S">Saravan Rajmohan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Dongmei Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qi Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite recent progress achieved by code large language models (LLMs), their remarkable abilities are largely dependent on fine-tuning on the high-quality data, posing challenges for data collection and annotation. To address this, current methods often design various data flywheels to collect complex code instructions, enabling models to handle more intricate tasks. However, these approaches typically rely on off-the-shelf datasets and data augmentation from a limited set of proprietary LLMs (e.g., Claude, GPT4, and so on), which restricts the diversity of the constructed data and makes it prone to systemic biases. In this paper, we propose WarriorCoder, a novel paradigm that learns from expert battles to address these limitations. Specifically, we create an arena where leading expert code LLMs challenge each other, with evaluations conducted by impartial judges. This competitive framework generates novel training data from scratch, leveraging the strengths of all participants. Experimental results show that WarriorCoder achieves state-of-the-art performance compared to previous models of the same size, even without relying on proprietary LLMs. 
</p> </div> </dd> <dt> <a name='item227'>[227]</a> <a href ="/abs/2412.18069" title="Abstract" id="2412.18069"> arXiv:2412.18069 </a> (replaced) [<a href="/pdf/2412.18069" title="Download PDF" id="pdf-2412.18069" aria-labelledby="pdf-2412.18069">pdf</a>, <a href="https://arxiv.org/html/2412.18069v2" title="View HTML" id="html-2412.18069" aria-labelledby="html-2412.18069" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18069" title="Other formats" id="oth-2412.18069" aria-labelledby="oth-2412.18069">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Factuality with Explicit Working Memory </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mingda Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Padthe,+K">Karthik Padthe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+R">Rulin Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+A">Alicia Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zettlemoyer,+L">Luke Zettlemoyer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghosh,+G">Gargi Ghosh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yih,+W">Wen-tau Yih</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models can generate factually inaccurate content, a problem known as hallucination. Recent works have built upon retrieved-augmented generation to improve factuality through iterative prompting but these methods are limited by the traditional RAG design. 
To address these challenges, we introduce EWE (Explicit Working Memory), a novel approach that enhances factuality in long-form text generation by integrating a working memory that receives real-time feedback from external resources. The memory is refreshed based on online fact-checking and retrieval feedback, allowing EWE to rectify false claims during the generation process and ensure more accurate and reliable outputs. Our experiments demonstrate that EWE outperforms strong baselines on four fact-seeking long-form generation datasets, increasing the factuality metric, VeriScore, by 2 to 6 points absolute without sacrificing the helpfulness of the responses. Further analysis reveals that the design of rules for memory updates, configurations of memory units, and the quality of the retrieval datastore are crucial factors for influencing model performance. </p> </div> </dd> <dt> <a name='item228'>[228]</a> <a href ="/abs/2412.19437" title="Abstract" id="2412.19437"> arXiv:2412.19437 </a> (replaced) [<a href="/pdf/2412.19437" title="Download PDF" id="pdf-2412.19437" aria-labelledby="pdf-2412.19437">pdf</a>, <a href="https://arxiv.org/html/2412.19437v2" title="View HTML" id="html-2412.19437" aria-labelledby="html-2412.19437" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.19437" title="Other formats" id="oth-2412.19437" aria-labelledby="oth-2412.19437">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DeepSeek-V3 Technical Report </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=DeepSeek-AI">DeepSeek-AI</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aixin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+B">Bei Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+B">Bing Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bingxuan 
Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Bochao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+C">Chengda Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+C">Chenggang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+C">Chengqi Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chenyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+C">Chong Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+D">Damai Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+D">Daya Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+D">Dejian Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+D">Deli Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+D">Dongjie Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+E">Erhang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+F">Fangyun Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+F">Fucong Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+F">Fuli Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+G">Guangbo Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guanting Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guowei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">H. 
Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+H">Han Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Hanwei Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haocheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haowei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+H">Honghui Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xin,+H">Huajian Xin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Huazuo Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+H">Hui Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+J">J.L. Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+J">Jian Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jianzhong Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+J">Jiaqi Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiashi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiawei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jingchang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jingyang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+J">Junjie Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Junlong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Junxiao Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+K">Kai Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+K">Kai Hu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+K">Kaige Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+K">Kang Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kexin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+K">Kuai Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lean Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lecong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+L">Lei Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Leyi Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Litong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Liyue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Meng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Miaojun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mingchuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Minghua Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+M">Minghui Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+N">Ning Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+P">Panpan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Peiyi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+P">Peng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Q">Qiancheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Q">Qihao Zhu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qinyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Q">Qiushi Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">R.J. Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+R">R.L. Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+R">Ruiqi Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruisong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+R">Ruizhe Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Runji Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Runxin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruoyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruyi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">S.S. 
Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+S">Shanghao Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Shangyan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shanhuang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shaoqing Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+S">Shengfeng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+S">Shengfeng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shirong Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shiyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Shuang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+S">Shuiping Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Shunfeng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+S">Shuting Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">T. Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yun,+T">Tao Yun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pei,+T">Tian Pei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianyu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+W">W.L. Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+W">Wangding Zeng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. 
To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at <a href="https://github.com/deepseek-ai/DeepSeek-V3" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item229'>[229]</a> <a href ="/abs/2501.00152" title="Abstract" id="2501.00152"> arXiv:2501.00152 </a> (replaced) [<a href="/pdf/2501.00152" title="Download PDF" id="pdf-2501.00152" aria-labelledby="pdf-2501.00152">pdf</a>, <a href="https://arxiv.org/html/2501.00152v2" title="View HTML" id="html-2501.00152" aria-labelledby="html-2501.00152" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.00152" title="Other formats" id="oth-2501.00152" aria-labelledby="oth-2501.00152">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Temporal reasoning for timeline summarisation in social media </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jiayu Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Akhter,+M">Mahmud Akhter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Slonim,+D+A">Dana Atzil Slonim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liakata,+M">Maria Liakata</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper explores whether enhancing temporal reasoning capabilities in Large Language Models (LLMs) can improve the quality of timeline summarisation, the task of summarising long texts containing sequences of events, such as social media threads. We first introduce NarrativeReason, a novel dataset focused on temporal relationships among sequential events within narratives, distinguishing it from existing temporal reasoning datasets that primarily address pair-wise event relationships. 
Our approach then combines temporal reasoning with timeline summarisation through a knowledge distillation framework, where we first fine-tune a teacher model on temporal reasoning tasks and then distill this knowledge into a student model while simultaneously training it for the task of timeline summarisation. Experimental results demonstrate that our model achieves superior performance on out-of-domain mental health-related timeline summarisation tasks, which involve long social media threads with repetitions of events and a mix of emotions, highlighting the importance and generalisability of leveraging temporal reasoning to improve timeline summarisation. </p> </div> </dd> <dt> <a name='item230'>[230]</a> <a href ="/abs/2501.02460" title="Abstract" id="2501.02460"> arXiv:2501.02460 </a> (replaced) [<a href="/pdf/2501.02460" title="Download PDF" id="pdf-2501.02460" aria-labelledby="pdf-2501.02460">pdf</a>, <a href="https://arxiv.org/html/2501.02460v2" title="View HTML" id="html-2501.02460" aria-labelledby="html-2501.02460" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.02460" title="Other formats" id="oth-2501.02460" aria-labelledby="oth-2501.02460">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Omni-RAG: Comprehensive Retrieval-Augmented Generation for Large Language Models in Medical Applications </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhe Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+Y">Yusheng Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shuyang Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pingjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Y">Yiqiu Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanfeng Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models hold promise for addressing medical challenges, such as medical diagnosis reasoning, research knowledge acquisition, clinical decision-making, and consumer health inquiry support. However, they often generate hallucinations due to limited medical knowledge. Incorporating external knowledge is therefore critical, which necessitates multi-source knowledge acquisition. We address this challenge by framing it as a source planning problem, which is to formulate context-appropriate queries tailored to the attributes of diverse sources. Existing approaches either overlook source planning or fail to achieve it effectively due to misalignment between the model's expectation of the sources and their actual content. To bridge this gap, we present MedOmniKB, a repository comprising multigenre and multi-structured medical knowledge sources. Leveraging these sources, we propose the Source Planning Optimisation method, which enhances multi-source utilisation. Our approach involves enabling an expert model to explore and evaluate potential plans while training a smaller model to learn source alignment. Experimental results demonstrate that our method substantially improves multi-source planning performance, enabling the optimised small model to achieve state-of-the-art results in leveraging diverse medical knowledge sources. 
</p> </div> </dd> <dt> <a name='item231'>[231]</a> <a href ="/abs/2501.03545" title="Abstract" id="2501.03545"> arXiv:2501.03545 </a> (replaced) [<a href="/pdf/2501.03545" title="Download PDF" id="pdf-2501.03545" aria-labelledby="pdf-2501.03545">pdf</a>, <a href="https://arxiv.org/html/2501.03545v3" title="View HTML" id="html-2501.03545" aria-labelledby="html-2501.03545" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03545" title="Other formats" id="oth-2501.03545" aria-labelledby="oth-2501.03545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Factual Accuracy: Evaluating Coverage of Diverse Factual Information in Long-form Text Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Samarinas,+C">Chris Samarinas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krubner,+A">Alexander Krubner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Salemi,+A">Alireza Salemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Youngwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zamani,+H">Hamed Zamani</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper presents ICAT, an evaluation framework for measuring coverage of diverse factual information in long-form text generation. ICAT breaks down a long output text into a list of atomic claims and not only verifies each claim through retrieval from a (reliable) knowledge source, but also computes the alignment between the atomic factual claims and various aspects expected to be presented in the output. We study three implementations of the ICAT framework, each with a different assumption on the availability of aspects and alignment method. 
By adopting data from the diversification task in the TREC Web Track and the ClueWeb corpus, we evaluate the ICAT framework. We demonstrate strong correlation with human judgments and provide comprehensive evaluation across multiple state-of-the-art LLMs. Our framework further offers interpretable and fine-grained analysis of diversity and coverage. Its modular design allows for easy adaptation to different domains and datasets, making it a valuable tool for evaluating the qualitative aspects of long-form responses produced by LLMs. </p> </div> </dd> <dt> <a name='item232'>[232]</a> <a href ="/abs/2501.04962" title="Abstract" id="2501.04962"> arXiv:2501.04962 </a> (replaced) [<a href="/pdf/2501.04962" title="Download PDF" id="pdf-2501.04962" aria-labelledby="pdf-2501.04962">pdf</a>, <a href="https://arxiv.org/html/2501.04962v3" title="View HTML" id="html-2501.04962" aria-labelledby="html-2501.04962" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.04962" title="Other formats" id="oth-2501.04962" aria-labelledby="oth-2501.04962">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VoxEval: Benchmarking the Knowledge Understanding Capabilities of End-to-End Spoken Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+W">Wenqian Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiao,+X">Xiaoqi Jiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Z">Ziqiao Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> With the rising need for speech-based interaction models, end-to-end Spoken Language Models (SLMs) have emerged as 
a promising solution. While these models require comprehensive world knowledge for meaningful and reliable human interactions, existing question-answering (QA) benchmarks fall short in evaluating SLMs' knowledge understanding due to their inability to support end-to-end speech evaluation and account for varied input audio conditions. To address these limitations, we present VoxEval, a novel SpeechQA benchmark that assesses SLMs' knowledge understanding through pure speech interactions. Our benchmark 1) uniquely maintains speech format for both inputs and outputs, 2) evaluates model robustness across diverse input audio conditions, and 3) pioneers the assessment of complex tasks like mathematical reasoning in spoken format. Systematic evaluation demonstrates that VoxEval presents significant challenges to current SLMs, revealing their sensitivity to varying audio conditions and highlighting the need to enhance reasoning capabilities in future development. We hope this benchmark could guide the advancement of more sophisticated and reliable SLMs.\footnote{VoxEval dataset is available at: <a href="https://github.com/dreamtheater123/VoxEval" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item233'>[233]</a> <a href ="/abs/2501.05767" title="Abstract" id="2501.05767"> arXiv:2501.05767 </a> (replaced) [<a href="/pdf/2501.05767" title="Download PDF" id="pdf-2501.05767" aria-labelledby="pdf-2501.05767">pdf</a>, <a href="https://arxiv.org/html/2501.05767v3" title="View HTML" id="html-2501.05767" aria-labelledby="html-2501.05767" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.05767" title="Other formats" id="oth-2501.05767" aria-labelledby="oth-2501.05767">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Migician: Revealing the Magic of Free-Form Multi-Image Grounding in Multimodal Large Language Models </div> <div 
class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">You Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heyu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kaiyu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zonghao Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuhua Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruixuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The recent advancement of Multimodal Large Language Models (MLLMs) has significantly improved their fine-grained perception of single images and general comprehension across multiple images. However, existing MLLMs still face challenges in achieving precise grounding in complex multi-image scenarios. To address this, we first explore a Chain-of-Thought (CoT) framework that integrates single-image grounding with multi-image comprehension. While partially effective, it remains unstable and struggles to capture abstract visual information due to its non-end-to-end nature. 
Therefore, we introduce Migician, the first multi-image grounding model capable of performing free-form and accurate grounding across multiple images. To support this, we present the MGrounding-630k dataset, which comprises data for several multi-image grounding tasks derived from existing datasets, along with newly generated free-form grounding instruction-following data. Furthermore, we propose MIG-Bench, a comprehensive benchmark specifically designed for evaluating multi-image grounding capabilities. Experimental results demonstrate that our model achieves significantly superior multi-image grounding capabilities, outperforming the best existing MLLMs by 24.94% and even surpassing much larger 70B models. Our code, model, dataset, and benchmark are fully open-sourced at <a href="https://migician-vg.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item234'>[234]</a> <a href ="/abs/2501.06254" title="Abstract" id="2501.06254"> arXiv:2501.06254 </a> (replaced) [<a href="/pdf/2501.06254" title="Download PDF" id="pdf-2501.06254" aria-labelledby="pdf-2501.06254">pdf</a>, <a href="https://arxiv.org/html/2501.06254v2" title="View HTML" id="html-2501.06254" aria-labelledby="html-2501.06254" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.06254" title="Other formats" id="oth-2501.06254" aria-labelledby="oth-2501.06254">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rethinking Evaluation of Sparse Autoencoders through the Representation of Polysemous Words </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Minegishi,+G">Gouki Minegishi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Furuta,+H">Hiroki Furuta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Iwasawa,+Y">Yusuke Iwasawa</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Matsuo,+Y">Yutaka Matsuo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published at ICLR2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Sparse autoencoders (SAEs) have gained a lot of attention as a promising tool to improve the interpretability of large language models (LLMs) by mapping the complex superposition of polysemantic neurons into monosemantic features and composing a sparse dictionary of words. However, traditional performance metrics like Mean Squared Error and L0 sparsity ignore the evaluation of the semantic representational power of SAEs -- whether they can acquire interpretable monosemantic features while preserving the semantic relationship of words. For instance, it is not obvious whether a learned sparse feature could distinguish different meanings in one word. In this paper, we propose a suite of evaluations for SAEs to analyze the quality of monosemantic features by focusing on polysemous words. Our findings reveal that SAEs developed to improve the MSE-L0 Pareto frontier may confuse interpretability, which does not necessarily enhance the extraction of monosemantic features. The analysis of SAEs with polysemous words can also figure out the internal mechanism of LLMs; deeper layers and the Attention module contribute to distinguishing polysemy in a word. Our semantics focused evaluation offers new insights into the polysemy and the existing SAE objective and contributes to the development of more practical SAEs. 
</p> </div> </dd> <dt> <a name='item235'>[235]</a> <a href ="/abs/2501.10741" title="Abstract" id="2501.10741"> arXiv:2501.10741 </a> (replaced) [<a href="/pdf/2501.10741" title="Download PDF" id="pdf-2501.10741" aria-labelledby="pdf-2501.10741">pdf</a>, <a href="/format/2501.10741" title="Other formats" id="oth-2501.10741" aria-labelledby="oth-2501.10741">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Development of Application-Specific Large Language Models to Facilitate Research Ethics Review </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mann,+S+P">Sebastian Porsdam Mann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiehao,+J+S">Joel Seah Jiehao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Latham,+S+R">Stephen R. Latham</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Savulescu,+J">Julian Savulescu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aboy,+M">Mateo Aboy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Earp,+B+D">Brian D. Earp</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 0 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> Institutional review boards (IRBs) play a crucial role in ensuring the ethical conduct of human subjects research, but face challenges including inconsistency, delays, and inefficiencies. We propose the development and implementation of application-specific large language models (LLMs) to facilitate IRB review processes. These IRB-specific LLMs would be fine-tuned on IRB-specific literature and institutional datasets, and equipped with retrieval capabilities to access up-to-date, context-relevant information. 
We outline potential applications, including pre-review screening, preliminary analysis, consistency checking, and decision support. While addressing concerns about accuracy, context sensitivity, and human oversight, we acknowledge remaining challenges such as over-reliance on AI and the need for transparency. By enhancing the efficiency and quality of ethical review while maintaining human judgment in critical decisions, IRB-specific LLMs offer a promising tool to improve research oversight. We call for pilot studies to evaluate the feasibility and impact of this approach. </p> </div> </dd> <dt> <a name='item236'>[236]</a> <a href ="/abs/2501.11849" title="Abstract" id="2501.11849"> arXiv:2501.11849 </a> (replaced) [<a href="/pdf/2501.11849" title="Download PDF" id="pdf-2501.11849" aria-labelledby="pdf-2501.11849">pdf</a>, <a href="https://arxiv.org/html/2501.11849v3" title="View HTML" id="html-2501.11849" aria-labelledby="html-2501.11849" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.11849" title="Other formats" id="oth-2501.11849" aria-labelledby="oth-2501.11849">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Network-informed Prompt Engineering against Organized Astroturf Campaigns under Extreme Class Imbalance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kanakaris,+N">Nikos Kanakaris</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ping,+H">Heng Ping</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xiongye Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+N+K">Nesreen K. 
Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luceri,+L">Luca Luceri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferrara,+E">Emilio Ferrara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bogdan,+P">Paul Bogdan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Social and Information Networks (cs.SI) </div> <p class='mathjax'> Detecting organized political campaigns is of paramount importance in fighting against disinformation on social media. Existing approaches for the identification of such organized actions employ techniques mostly from network science, graph machine learning and natural language processing. Their ultimate goal is to analyze the relationships and interactions (e.g. re-posting) among users and the textual similarities of their posts. Despite their effectiveness in recognizing astroturf campaigns, these methods face significant challenges, notably the class imbalance in available training datasets. To mitigate this issue, recent methods usually resort to data augmentation or increasing the number of positive samples, which may not always be feasible or sufficient in real-world settings. Following a different path, in this paper, we propose a novel framework for identifying astroturf campaigns based solely on large language models (LLMs), introducing a Balanced Retrieval-Augmented Generation (Balanced RAG) component. Our approach first gives both textual information concerning the posts (in our case tweets) and the user interactions of the social network as input to a language model. Then, through prompt engineering and the proposed Balanced RAG method, it effectively detects coordinated disinformation campaigns on X (Twitter). The proposed framework does not require any training or fine-tuning of the language model. 
Instead, by strategically harnessing the strengths of prompt engineering and Balanced RAG, it facilitates LLMs to overcome the effects of class imbalance and effectively identify coordinated political campaigns. The experimental results demonstrate that by incorporating the proposed prompt engineering and Balanced RAG methods, our framework outperforms the traditional graph-based baselines, achieving 2x-3x improvements in terms of precision, recall and F1 scores. </p> </div> </dd> <dt> <a name='item237'>[237]</a> <a href ="/abs/2501.13264" title="Abstract" id="2501.13264"> arXiv:2501.13264 </a> (replaced) [<a href="/pdf/2501.13264" title="Download PDF" id="pdf-2501.13264" aria-labelledby="pdf-2501.13264">pdf</a>, <a href="https://arxiv.org/html/2501.13264v2" title="View HTML" id="html-2501.13264" aria-labelledby="html-2501.13264" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13264" title="Other formats" id="oth-2501.13264" aria-labelledby="oth-2501.13264">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RAG-Reward: Optimizing RAG with Reward Modeling and RLHF </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hanning Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Juntong Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Juno Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuanhao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niu,+C">Cheng Niu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented 
generation (RAG) enhances Large Language Models (LLMs) with relevant and up-to-date knowledge, improving their ability to answer knowledge-intensive questions. It has been shown to enhance both generation quality and trustworthiness. While numerous works have focused on improving retrieval, generation, and evaluation, the role of reward models in reinforcement learning for optimizing RAG remains underexplored. In this paper, we introduce \textbf{RAG-Reward}, a framework designed to develop reward models to enable \textit{hallucination-free, comprehensive, reliable, and efficient RAG}. We define four key metrics to assess generation quality and develop an automated benchmarking pipeline to evaluate the outputs of multiple LLMs across a variety of RAG scenarios. Using \textbf{RAG-Reward}, we train reward models and apply {reinforcement learning with human feedback (RLHF)} to improve LLMs' effectiveness in RAG. Experimental results demonstrate that our reward model achieves state-of-the-art performance in automatic benchmarking and aligns closely with human evaluations. Furthermore, the improved generation quality of the trained policy model highlights the feasibility and efficiency of using RLHF to enhance RAG outputs. 
</p> </div> </dd> <dt> <a name='item238'>[238]</a> <a href ="/abs/2501.13288" title="Abstract" id="2501.13288"> arXiv:2501.13288 </a> (replaced) [<a href="/pdf/2501.13288" title="Download PDF" id="pdf-2501.13288" aria-labelledby="pdf-2501.13288">pdf</a>, <a href="https://arxiv.org/html/2501.13288v2" title="View HTML" id="html-2501.13288" aria-labelledby="html-2501.13288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13288" title="Other formats" id="oth-2501.13288" aria-labelledby="oth-2501.13288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Task-Oriented Automatic Fact-Checking with Frame-Semantics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Devasier,+J">Jacob Devasier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mediratta,+R">Rishabh Mediratta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Putta,+A">Akshith Putta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chengkai Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We propose a novel paradigm for automatic fact-checking that leverages frame semantics to enhance the structured understanding of claims and guide the process of fact-checking them. To support this, we introduce a pilot dataset of real-world claims extracted from PolitiFact, specifically annotated for large-scale structured data. This dataset underpins two case studies: the first investigates voting-related claims using the Vote semantic frame, while the second explores various semantic frames based on data sources from the Organisation for Economic Co-operation and Development (OECD). Our findings demonstrate the effectiveness of frame semantics in improving evidence retrieval and explainability for fact-checking. 
Finally, we conducted a survey of frames evoked in fact-checked claims, identifying high-impact frames to guide future work in this direction. </p> </div> </dd> <dt> <a name='item239'>[239]</a> <a href ="/abs/2501.14002" title="Abstract" id="2501.14002"> arXiv:2501.14002 </a> (replaced) [<a href="/pdf/2501.14002" title="Download PDF" id="pdf-2501.14002" aria-labelledby="pdf-2501.14002">pdf</a>, <a href="https://arxiv.org/html/2501.14002v2" title="View HTML" id="html-2501.14002" aria-labelledby="html-2501.14002" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.14002" title="Other formats" id="oth-2501.14002" aria-labelledby="oth-2501.14002">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Advancing Math Reasoning in Language Models: The Impact of Problem-Solving Data, Data Synthesis Methods, and Training Stages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianqiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+M">Mi Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+Q">Qing Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Weiqi Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zitao Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Mathematical reasoning remains a challenging area for large language models (LLMs), prompting the development of math-specific LLMs such as LLEMMA, DeepSeekMath, and Qwen2-Math, among others. 
These models typically follow a two-stage training paradigm: pre-training with math-related corpora and post-training with problem datasets for supervised fine-tuning (SFT). Despite these efforts, the improvements in mathematical reasoning achieved through continued pre-training (CPT) are often less significant compared to those obtained via SFT. This study addresses this discrepancy by exploring alternative strategies during the pre-training phase, focusing on the use of problem-solving data over general mathematical corpora. We investigate three primary research questions: (1) Can problem-solving data enhance the model's mathematical reasoning capabilities more effectively than general mathematical corpora during CPT? (2) Are synthetic data from the same source equally effective, and which synthesis methods are most efficient? (3) How do the capabilities developed from the same problem-solving data differ between the CPT and SFT stages, and what factors contribute to these differences? Our findings indicate that problem-solving data significantly enhances the model's mathematical capabilities compared to general mathematical corpora. We also identify effective data synthesis methods, demonstrating that the tutorship amplification synthesis method achieves the best performance. Furthermore, while SFT facilitates instruction-following abilities, it underperforms compared to CPT with the same data, which can be partially attributed to its poor learning capacity for more challenging problem-solving data. These insights provide valuable guidance for optimizing the mathematical reasoning capabilities of LLMs, culminating in our development of a powerful mathematical base model called MathGPT-8B. 
</p> </div> </dd> <dt> <a name='item240'>[240]</a> <a href ="/abs/2501.14073" title="Abstract" id="2501.14073"> arXiv:2501.14073 </a> (replaced) [<a href="/pdf/2501.14073" title="Download PDF" id="pdf-2501.14073" aria-labelledby="pdf-2501.14073">pdf</a>, <a href="https://arxiv.org/html/2501.14073v2" title="View HTML" id="html-2501.14073" aria-labelledby="html-2501.14073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.14073" title="Other formats" id="oth-2501.14073" aria-labelledby="oth-2501.14073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs are Vulnerable to Malicious Prompts Disguised as Scientific Language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Y">Yubin Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kirtane,+N">Neeraja Kirtane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+H">Hao Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As large language models (LLMs) have been deployed in various real-world settings, concerns about the harm they may propagate have grown. Various jailbreaking techniques have been developed to expose the vulnerabilities of these models and improve their safety. This work reveals that many state-of-the-art LLMs are vulnerable to malicious requests hidden behind scientific language. 
Specifically, our experiments with GPT4o, GPT4o-mini, GPT-4, LLama3-405B-Instruct, Llama3-70B-Instruct, Cohere, Gemini models demonstrate that, the models' biases and toxicity substantially increase when prompted with requests that deliberately misinterpret social science and psychological studies as evidence supporting the benefits of stereotypical biases. Alarmingly, these models can also be manipulated to generate fabricated scientific arguments claiming that biases are beneficial, which can be used by ill-intended actors to systematically jailbreak these strong LLMs. Our analysis studies various factors that contribute to the models' vulnerabilities to malicious requests in academic language. Mentioning author names and venues enhances the persuasiveness of models, and the bias scores increase as dialogues progress. Our findings call for a more careful investigation on the use of scientific data for training LLMs. </p> </div> </dd> <dt> <a name='item241'>[241]</a> <a href ="/abs/2501.15427" title="Abstract" id="2501.15427"> arXiv:2501.15427 </a> (replaced) [<a href="/pdf/2501.15427" title="Download PDF" id="pdf-2501.15427" aria-labelledby="pdf-2501.15427">pdf</a>, <a href="https://arxiv.org/html/2501.15427v2" title="View HTML" id="html-2501.15427" aria-labelledby="html-2501.15427" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.15427" title="Other formats" id="oth-2501.15427" aria-labelledby="oth-2501.15427">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OpenCharacter: Training Customizable Role-Playing LLMs with Large-Scale Synthetic Personas </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiaoyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+T">Tao Ge</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+W">Wenhao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dian Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Customizable role-playing in large language models (LLMs), also known as character generalization, is gaining increasing attention for its versatility and cost-efficiency in developing and deploying role-playing dialogue agents. This study explores a large-scale data synthesis approach to equip LLMs with character generalization capabilities. We begin by synthesizing large-scale character profiles using personas from Persona Hub and then explore two strategies: response rewriting and response generation, to create character-aligned instructional responses. To validate the effectiveness of our synthetic instruction tuning data for character generalization, we perform supervised fine-tuning (SFT) using the LLaMA-3 8B model. Our best-performing model strengthens the original LLaMA-3 8B Instruct model and achieves performance comparable to GPT-4o models on role-playing dialogue. We release our synthetic characters and instruction-tuning dialogues to support public research. 
</p> </div> </dd> <dt> <a name='item242'>[242]</a> <a href ="/abs/2501.17191" title="Abstract" id="2501.17191"> arXiv:2501.17191 </a> (replaced) [<a href="/pdf/2501.17191" title="Download PDF" id="pdf-2501.17191" aria-labelledby="pdf-2501.17191">pdf</a>, <a href="/format/2501.17191" title="Other formats" id="oth-2501.17191" aria-labelledby="oth-2501.17191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aspect-Aware Decomposition for Opinion Summarization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Miao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lau,+J+H">Jey Han Lau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hovy,+E">Eduard Hovy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lapata,+M">Mirella Lapata</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Opinion summarization plays a key role in deriving meaningful insights from large-scale online reviews. To make this process more explainable and grounded, we propose a modular approach guided by review aspects which separates the tasks of aspect identification, opinion consolidation, and meta-review synthesis, enabling greater transparency and ease of inspection. We conduct extensive experiments across datasets representing scientific research, business, and product domains. Results show that our method generates more grounded summaries compared to strong baseline models, as verified through automated and human evaluations. 
Additionally, our modular approach, which incorporates reasoning based on review aspects, produces more informative intermediate outputs than knowledge-agnostic decomposed prompting. These intermediate outputs can also effectively support humans in summarizing opinions from large volumes of reviews. </p> </div> </dd> <dt> <a name='item243'>[243]</a> <a href ="/abs/2501.18585" title="Abstract" id="2501.18585"> arXiv:2501.18585 </a> (replaced) [<a href="/pdf/2501.18585" title="Download PDF" id="pdf-2501.18585" aria-labelledby="pdf-2501.18585">pdf</a>, <a href="https://arxiv.org/html/2501.18585v2" title="View HTML" id="html-2501.18585" aria-labelledby="html-2501.18585" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.18585" title="Other formats" id="oth-2501.18585" aria-labelledby="oth-2501.18585">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Thoughts Are All Over the Place: On the Underthinking of o1-Like LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qiuzhi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jiahao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+T">Tian Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xingyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zhiwei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linfeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dian Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Juntao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhuosheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Rui Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhaopeng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Haitao Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 1. We have updated the results for DeepSeek-R1, and all of our original conclusions remain valid. 2. Our proposed Tip approach remains effective in Best-of-N scenarios (e.g., self-consistency and Laconic Decoding) when built on DeepSeek-R1 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) such as OpenAI's o1 have demonstrated remarkable abilities in complex reasoning tasks by scaling test-time compute and exhibiting human-like deep thinking. However, we identify a phenomenon we term underthinking, where o1-like LLMs frequently switch between different reasoning thoughts without sufficiently exploring promising paths to reach a correct solution. This behavior leads to inadequate depth of reasoning and decreased performance, particularly on challenging mathematical problems. To systematically analyze this issue, we conduct experiments on three challenging test sets and two representative open-source o1-like models, revealing that frequent thought switching correlates with incorrect responses. We introduce a novel metric to quantify underthinking by measuring token efficiency in incorrect answers. To address underthinking, we propose a decoding strategy with thought switching penalty TIP that discourages premature transitions between thoughts, encouraging deeper exploration of each reasoning path. Experimental results demonstrate that our approach improves accuracy across challenging datasets without requiring model fine-tuning. 
Our findings contribute to understanding reasoning inefficiencies in o1-like LLMs and offer a practical solution to enhance their problem-solving capabilities. </p> </div> </dd> <dt> <a name='item244'>[244]</a> <a href ="/abs/2501.19353" title="Abstract" id="2501.19353"> arXiv:2501.19353 </a> (replaced) [<a href="/pdf/2501.19353" title="Download PDF" id="pdf-2501.19353" aria-labelledby="pdf-2501.19353">pdf</a>, <a href="https://arxiv.org/html/2501.19353v3" title="View HTML" id="html-2501.19353" aria-labelledby="html-2501.19353" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.19353" title="Other formats" id="oth-2501.19353" aria-labelledby="oth-2501.19353">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from SciCap Challenge 2023 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+T+E">Ting-Yao E. Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+Y">Yi-Li Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rohatgi,+S">Shaurya Rohatgi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chieh-Yang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+H+Y+S">Ho Yin Sam Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rossi,+R">Ryan Rossi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sungchul Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ku,+L">Lun-Wei Ku</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Giles,+C+L">C. Lee Giles</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+K">Ting-Hao K. 
Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to TACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Since the SciCap dataset's launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SciCap Challenge took place, inviting global teams to use an expanded SciCap dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SciCap Challenge and details the performance of various models on its data, capturing a snapshot of the field's state. We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures? 
</p> </div> </dd> <dt> <a name='item245'>[245]</a> <a href ="/abs/2502.00761" title="Abstract" id="2502.00761"> arXiv:2502.00761 </a> (replaced) [<a href="/pdf/2502.00761" title="Download PDF" id="pdf-2502.00761" aria-labelledby="pdf-2502.00761">pdf</a>, <a href="https://arxiv.org/html/2502.00761v2" title="View HTML" id="html-2502.00761" aria-labelledby="html-2502.00761" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00761" title="Other formats" id="oth-2502.00761" aria-labelledby="oth-2502.00761">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FIRE: Flexible Integration of Data Quality Ratings for Effective Pre-Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+L">Liangyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuemiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+F">Feiyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sirui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 11 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Selecting high-quality data can significantly improve the pretraining efficiency of large language models (LLMs). Existing methods generally rely on heuristic techniques and single-quality signals, limiting their ability to evaluate data quality comprehensively. 
In this work, we propose FIRE, a flexible and scalable framework for integrating multiple data quality raters, which allows for a comprehensive assessment of data quality across various dimensions. FIRE aligns multiple quality signals into a unified space, and integrates diverse data quality raters to provide a comprehensive quality signal for each data point. Further, we introduce a progressive data selection scheme based on FIRE that iteratively refines the selection of high-quality data points. Experiments on the SlimPajama dataset reveal that FIRE outperforms other data selection methods and significantly enhances the pretrained model across a wide range of downstream tasks, with a 2.9% average performance improvement over Random and reducing the FLOPs necessary to achieve a certain performance level by more than half. </p> </div> </dd> <dt> <a name='item246'>[246]</a> <a href ="/abs/2502.03627" title="Abstract" id="2502.03627"> arXiv:2502.03627 </a> (replaced) [<a href="/pdf/2502.03627" title="Download PDF" id="pdf-2502.03627" aria-labelledby="pdf-2502.03627">pdf</a>, <a href="https://arxiv.org/html/2502.03627v2" title="View HTML" id="html-2502.03627" aria-labelledby="html-2502.03627" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.03627" title="Other formats" id="oth-2502.03627" aria-labelledby="oth-2502.03627">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sorting the Babble in Babel: Assessing the Performance of Language Detection Algorithms on the OpenAlex Database </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sainte-Marie,+M+H">Maxime Holmberg Sainte-Marie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kozlowski,+D">Diego Kozlowski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=C%C3%A9spedes,+L">Lucía Céspedes</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Larivi%C3%A8re,+V">Vincent Larivière</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 33 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This project aims to compare various language classification procedures, procedures combining various Python language detection algorithms and metadata-based corpora extracted from manually-annotated articles sampled from the OpenAlex database. Following an analysis of precision and recall performance for each algorithm, corpus, and language as well as of processing speeds recorded for each algorithm and corpus type, overall procedure performance at the database level was simulated using probabilistic confusion matrices for each algorithm, corpus, and language as well as a probabilistic model of relative article language frequencies for the whole OpenAlex database. Results show that procedure performance strongly depends on the importance given to each of the measures implemented: for contexts where precision is preferred, using the LangID algorithm on the greedy corpus gives the best results; however, for all cases where recall is considered at least slightly more important than precision or as soon as processing times are given any kind of consideration, the procedure combining the FastSpell algorithm and the Titles corpus outperforms all other alternatives. Given the lack of truly multilingual, large-scale bibliographic databases, it is hoped that these results help confirm and foster the unparalleled potential of the OpenAlex database for cross-linguistic, bibliometric-based research and analysis. 
</p> </div> </dd> <dt> <a name='item247'>[247]</a> <a href ="/abs/2502.05551" title="Abstract" id="2502.05551"> arXiv:2502.05551 </a> (replaced) [<a href="/pdf/2502.05551" title="Download PDF" id="pdf-2502.05551" aria-labelledby="pdf-2502.05551">pdf</a>, <a href="https://arxiv.org/html/2502.05551v3" title="View HTML" id="html-2502.05551" aria-labelledby="html-2502.05551" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05551" title="Other formats" id="oth-2502.05551" aria-labelledby="oth-2502.05551">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FRAME: Boosting LLMs with A Four-Quadrant Multi-Stage Pretraining Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuemiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+F">Feiyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+L">Liangyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yongwei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sirui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+R">Rongxiang Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have significantly advanced human language understanding and generation, with pretraining data quality and organization being crucial to their performance. Multi-stage pretraining is a promising approach, but existing methods often lack quantitative criteria for data partitioning and instead rely on intuitive heuristics. 
In this paper, we propose the novel Four-quadRAnt Multi-stage prEtraining strategy (FRAME), guided by the established principle of organizing the pretraining process into four stages to achieve significant loss reductions four times. This principle is grounded in two key findings: first, training on high Perplexity (PPL) data followed by low PPL data, and second, training on low PPL difference (PD) data followed by high PD data, both causing the loss to drop significantly twice and performance enhancements. By partitioning data into four quadrants and strategically organizing them, FRAME achieves a remarkable 16.8% average improvement over random across MMLU and CMMLU for the 3B model, effectively boosting LLM performance. </p> </div> </dd> <dt> <a name='item248'>[248]</a> <a href ="/abs/2502.05759" title="Abstract" id="2502.05759"> arXiv:2502.05759 </a> (replaced) [<a href="/pdf/2502.05759" title="Download PDF" id="pdf-2502.05759" aria-labelledby="pdf-2502.05759">pdf</a>, <a href="https://arxiv.org/html/2502.05759v2" title="View HTML" id="html-2502.05759" aria-labelledby="html-2502.05759" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05759" title="Other formats" id="oth-2502.05759" aria-labelledby="oth-2502.05759">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reinforced Lifelong Editing for Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zherui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+H">Houcheng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+B">Baolong Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhenhong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+F">Fei Sun</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+J">Junfeng Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiang Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) acquire information from pre-training corpora, but their stored knowledge can become inaccurate or outdated over time. Model editing addresses this challenge by modifying model parameters without retraining, and prevalent approaches leverage hypernetworks to generate these parameter updates. However, they face significant challenges in lifelong editing due to their incompatibility with LLM parameters that dynamically change during the editing process. To address this, we observed that hypernetwork-based lifelong editing aligns with reinforcement learning modeling and proposed RLEdit, an RL-based editing method. By treating editing losses as rewards and optimizing hypernetwork parameters at the full knowledge sequence level, we enable it to precisely capture LLM changes and generate appropriate parameter updates. Our extensive empirical evaluation across several LLMs demonstrates that RLEdit outperforms existing methods in lifelong editing with superior effectiveness and efficiency, achieving a 59.24% improvement while requiring only 2.11% of the time compared to most approaches. Our code is available at: <a href="https://github.com/zhrli324/RLEdit" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item249'>[249]</a> <a href ="/abs/2502.07623" title="Abstract" id="2502.07623"> arXiv:2502.07623 </a> (replaced) [<a href="/pdf/2502.07623" title="Download PDF" id="pdf-2502.07623" aria-labelledby="pdf-2502.07623">pdf</a>, <a href="https://arxiv.org/html/2502.07623v2" title="View HTML" id="html-2502.07623" aria-labelledby="html-2502.07623" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07623" title="Other formats" id="oth-2502.07623" aria-labelledby="oth-2502.07623">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Lexical categories of stem-forming roots in Mapudüngun verb forms </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chand%C3%ADa,+A">Andrés Chandía</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 2 large tables, 2 sample tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> After developing a computational system for morphological analysis of the Mapuche language, and evaluating it with texts from various authors and styles, it became necessary to verify the linguistic assumptions of the source used as the basis for implementing this tool. <br>In the present work, the primary focus is on the lexical category classification of Mapudüngun roots recognised as verbal in the source utilised for the development of the morphological analysis system. <br>The results of this lexical category revision directly benefit the computational analyser, as they are implemented as soon as they are verified. Additionally, it is hoped that these results will help clarify some uncertainties about lexical categories in the Mapuche language. 
<br>This work addresses a preliminary task to identify the valency of true verbal roots, the results of which will be presented in a subsequent work that complements this article. </p> </div> </dd> <dt> <a name='item250'>[250]</a> <a href ="/abs/2502.08788" title="Abstract" id="2502.08788"> arXiv:2502.08788 </a> (replaced) [<a href="/pdf/2502.08788" title="Download PDF" id="pdf-2502.08788" aria-labelledby="pdf-2502.08788">pdf</a>, <a href="https://arxiv.org/html/2502.08788v2" title="View HTML" id="html-2502.08788" aria-labelledby="html-2502.08788" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08788" title="Other formats" id="oth-2502.08788" aria-labelledby="oth-2502.08788">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> If Multi-Agent Debate is the Answer, What is the Question? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hangfan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+Z">Zhiyao Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinrun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qiaosheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Dinghao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Shuyue Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This position paper takes a critical view of the status quo of MAD research, and outline multiple potential directions to improve MAD </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Multi-agent debate (MAD) has emerged as a promising approach to 
enhance the factual accuracy and reasoning quality of large language models (LLMs) by engaging multiple agents in iterative discussions during inference. Despite its potential, we argue that current MAD research suffers from critical shortcomings in evaluation practices, including limited dataset overlap and inconsistent baselines, raising significant concerns about generalizability. Correspondingly, this paper presents a systematic evaluation of five representative MAD methods across nine benchmarks using four foundational models. Surprisingly, our findings reveal that MAD methods fail to reliably outperform simple single-agent baselines such as Chain-of-Thought and Self-Consistency, even when consuming additional inference-time computation. From our analysis, we found that model heterogeneity can significantly improve MAD frameworks. We propose Heter-MAD enabling a single LLM agent to access the output from heterogeneous foundation models, which boosts the performance of current MAD frameworks. Finally, we outline potential directions for advancing MAD, aiming to spark a broader conversation and inspire future work in this area. 
</p> </div> </dd> <dt> <a name='item251'>[251]</a> <a href ="/abs/2502.08826" title="Abstract" id="2502.08826"> arXiv:2502.08826 </a> (replaced) [<a href="/pdf/2502.08826" title="Download PDF" id="pdf-2502.08826" aria-labelledby="pdf-2502.08826">pdf</a>, <a href="https://arxiv.org/html/2502.08826v2" title="View HTML" id="html-2502.08826" aria-labelledby="html-2502.08826" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08826" title="Other formats" id="oth-2502.08826" aria-labelledby="oth-2502.08826">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ask in Any Modality: A Comprehensive Survey on Multimodal Retrieval-Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Abootorabi,+M+M">Mohammad Mahdi Abootorabi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zobeiri,+A">Amirhosein Zobeiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehghani,+M">Mahdi Dehghani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammadkhani,+M">Mohammadali Mohammadkhani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammadi,+B">Bardia Mohammadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghahroodi,+O">Omid Ghahroodi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baghshah,+M+S">Mahdieh Soleymani Baghshah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Asgari,+E">Ehsaneddin Asgari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> GitHub repository: <a href="https://github.com/llm-lab-org/Multimodal-RAG-Survey" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); 
Information Retrieval (cs.IR) </div> <p class='mathjax'> Large Language Models (LLMs) struggle with hallucinations and outdated knowledge due to their reliance on static training data. Retrieval-Augmented Generation (RAG) mitigates these issues by integrating external dynamic information enhancing factual and updated grounding. Recent advances in multimodal learning have led to the development of Multimodal RAG, incorporating multiple modalities such as text, images, audio, and video to enhance the generated outputs. However, cross-modal alignment and reasoning introduce unique challenges to Multimodal RAG, distinguishing it from traditional unimodal RAG. This survey offers a structured and comprehensive analysis of Multimodal RAG systems, covering datasets, metrics, benchmarks, evaluation, methodologies, and innovations in retrieval, fusion, augmentation, and generation. We precisely review training strategies, robustness enhancements, and loss functions, while also exploring the diverse Multimodal RAG scenarios. Furthermore, we discuss open challenges and future research directions to support advancements in this evolving field. This survey lays the foundation for developing more capable and reliable AI systems that effectively leverage multimodal dynamic external knowledge bases. Resources are available at <a href="https://github.com/llm-lab-org/Multimodal-RAG-Survey" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item252'>[252]</a> <a href ="/abs/2502.09387" title="Abstract" id="2502.09387"> arXiv:2502.09387 </a> (replaced) [<a href="/pdf/2502.09387" title="Download PDF" id="pdf-2502.09387" aria-labelledby="pdf-2502.09387">pdf</a>, <a href="https://arxiv.org/html/2502.09387v2" title="View HTML" id="html-2502.09387" aria-labelledby="html-2502.09387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09387" title="Other formats" id="oth-2502.09387" aria-labelledby="oth-2502.09387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Truth Knows No Language: Evaluating Truthfulness Beyond English </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Figueras,+B+C">Blanca Calvo Figueras</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sagarzazu,+E">Eneko Sagarzazu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Etxaniz,+J">Julen Etxaniz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barnes,+J">Jeremy Barnes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gamallo,+P">Pablo Gamallo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=De+Dios+Flores,+I">Iria De Dios Flores</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agerri,+R">Rodrigo Agerri</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 6 figures, 8 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY) </div> <p class='mathjax'> We introduce a professionally translated extension of the TruthfulQA benchmark designed to evaluate truthfulness in Basque, Catalan, Galician, and Spanish. 
Truthfulness evaluations of large language models (LLMs) have primarily been conducted in English. However, the ability of LLMs to maintain truthfulness across languages remains under-explored. Our study evaluates 12 state-of-the-art open LLMs, comparing base and instruction-tuned models using human evaluation, multiple-choice metrics, and LLM-as-a-Judge scoring. Our findings reveal that, while LLMs perform best in English and worst in Basque (the lowest-resourced language), overall truthfulness discrepancies across languages are smaller than anticipated. Furthermore, we show that LLM-as-a-Judge correlates more closely with human judgments than multiple-choice metrics, and that informativeness plays a critical role in truthfulness assessment. Our results also indicate that machine translation provides a viable approach for extending truthfulness benchmarks to additional languages, offering a scalable alternative to professional translation. Finally, we observe that universal knowledge questions are better handled across languages than context- and time-dependent ones, highlighting the need for truthfulness evaluations that account for cultural and temporal variability. Dataset and code are publicly available under open licenses. 
</p> </div> </dd> <dt> <a name='item253'>[253]</a> <a href ="/abs/2502.09674" title="Abstract" id="2502.09674"> arXiv:2502.09674 </a> (replaced) [<a href="/pdf/2502.09674" title="Download PDF" id="pdf-2502.09674" aria-labelledby="pdf-2502.09674">pdf</a>, <a href="https://arxiv.org/html/2502.09674v2" title="View HTML" id="html-2502.09674" aria-labelledby="html-2502.09674" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09674" title="Other formats" id="oth-2502.09674" aria-labelledby="oth-2502.09674">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Hidden Dimensions of LLM Alignment: A Multi-Dimensional Safety Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+W">Wenbo Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhichao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiguang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiangyang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+H">Haining Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+X">Xiaohua Jia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code and artifacts: <a href="https://github.com/BMPixel/safety-residual-space" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models' safety-aligned behaviors, such as refusing harmful queries, can be represented by linear directions in activation space. Previous research modeled safety behavior with a single direction, limiting mechanistic understanding to an isolated safety feature. 
In this work, we discover that safety-aligned behavior is jointly controlled by multi-dimensional directions. Namely, we study the vector space of representation shifts during safety fine-tuning on Llama 3 8B for refusing jailbreaks. By studying orthogonal directions in the space, we first find that a dominant direction governs the model's refusal behavior, while multiple smaller directions represent distinct and interpretable features like hypothetical narrative and role-playing. We then measure how different directions promote or suppress the dominant direction, showing the important role of secondary directions in shaping the model's refusal representation. Finally, we demonstrate that removing certain trigger tokens in harmful queries can mitigate these directions to bypass the learned safety capability, providing new insights on understanding safety alignment vulnerability from a multi-dimensional perspective. Code and artifacts are available at <a href="https://github.com/BMPixel/safety-residual-space" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item254'>[254]</a> <a href ="/abs/2502.09747" title="Abstract" id="2502.09747"> arXiv:2502.09747 </a> (replaced) [<a href="/pdf/2502.09747" title="Download PDF" id="pdf-2502.09747" aria-labelledby="pdf-2502.09747">pdf</a>, <a href="https://arxiv.org/html/2502.09747v2" title="View HTML" id="html-2502.09747" aria-labelledby="html-2502.09747" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09747" title="Other formats" id="oth-2502.09747" aria-labelledby="oth-2502.09747">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Widespread Adoption of Large Language Model-Assisted Writing Across Society </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+W">Weixin Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yaohui Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Codreanu,+M">Mihai Codreanu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiayu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+H">Hancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+J">James Zou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The recent advances in large language models (LLMs) attracted significant public and policymaker interest in its adoption patterns. In this paper, we systematically analyze LLM-assisted writing across four domains-consumer complaints, corporate communications, job postings, and international organization press releases-from January 2022 to September 2024. Our dataset includes 687,241 consumer complaints, 537,413 corporate press releases, 304.3 million job postings, and 15,919 United Nations (UN) press releases. 
Using a robust population-level statistical framework, we find that LLM usage surged following the release of ChatGPT in November 2022. By late 2024, roughly 18% of financial consumer complaint text appears to be LLM-assisted, with adoption patterns spread broadly across regions and slightly higher in urban areas. For corporate press releases, up to 24% of the text is attributable to LLMs. In job postings, LLM-assisted writing accounts for just below 10% in small firms, and is even more common among younger firms. UN press releases also reflect this trend, with nearly 14% of content being generated or modified by LLMs. Although adoption climbed rapidly post-ChatGPT, growth appears to have stabilized by 2024, reflecting either saturation in LLM adoption or increasing subtlety of more advanced models. Our study shows the emergence of a new reality in which firms, consumers and even international organizations substantially rely on generative AI for communications. </p> </div> </dd> <dt> <a name='item255'>[255]</a> <a href ="/abs/2502.09992" title="Abstract" id="2502.09992"> arXiv:2502.09992 </a> (replaced) [<a href="/pdf/2502.09992" title="Download PDF" id="pdf-2502.09992" aria-labelledby="pdf-2502.09992">pdf</a>, <a href="https://arxiv.org/html/2502.09992v2" title="View HTML" id="html-2502.09992" aria-labelledby="html-2502.09992" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09992" title="Other formats" id="oth-2502.09992" aria-labelledby="oth-2502.09992">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nie,+S">Shen Nie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+F">Fengqi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=You,+Z">Zebin You</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaolu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+J">Jingyang Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Jun Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yankai Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Ji-Rong Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chongxuan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Autoregressive models (ARMs) are widely regarded as the cornerstone of large language models (LLMs). We challenge this notion by introducing LLaDA, a diffusion model trained from scratch under the pre-training and supervised fine-tuning (SFT) paradigm. LLaDA models distributions through a forward data masking process and a reverse process, parameterized by a vanilla Transformer to predict masked tokens. By optimizing a likelihood bound, it provides a principled generative approach for probabilistic inference. Across extensive benchmarks, LLaDA demonstrates strong scalability, outperforming our self-constructed ARM baselines. Remarkably, LLaDA 8B is competitive with strong LLMs like LLaMA3 8B in in-context learning and, after SFT, exhibits impressive instruction-following abilities in case studies such as multi-turn dialogue. Moreover, LLaDA addresses the reversal curse, surpassing GPT-4o in a reversal poem completion task. Our findings establish diffusion models as a viable and promising alternative to ARMs, challenging the assumption that key LLM capabilities discussed above are inherently tied to ARMs. 
Project page and codes: <a href="https://ml-gsai.github.io/LLaDA-demo/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item256'>[256]</a> <a href ="/abs/2502.11054" title="Abstract" id="2502.11054"> arXiv:2502.11054 </a> (replaced) [<a href="/pdf/2502.11054" title="Download PDF" id="pdf-2502.11054" aria-labelledby="pdf-2502.11054">pdf</a>, <a href="https://arxiv.org/html/2502.11054v2" title="View HTML" id="html-2502.11054" aria-labelledby="html-2502.11054" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11054" title="Other formats" id="oth-2502.11054" aria-labelledby="oth-2502.11054">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning-Augmented Conversation for Multi-Turn Jailbreak Attacks on Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+Z">Zonghao Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Deyue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+Z">Zonglei Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yisong Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Q">Quanchen Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aishan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+S">Siyuan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangzheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xianglong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+D">Dacheng Tao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and 
Security (cs.CR) </div> <p class='mathjax'> Multi-turn jailbreak attacks simulate real-world human interactions by engaging large language models (LLMs) in iterative dialogues, exposing critical safety vulnerabilities. However, existing methods often struggle to balance semantic coherence with attack effectiveness, resulting in either benign semantic drift or ineffective detection evasion. To address this challenge, we propose Reasoning-Augmented Conversation, a novel multi-turn jailbreak framework that reformulates harmful queries into benign reasoning tasks and leverages LLMs' strong reasoning capabilities to compromise safety alignment. Specifically, we introduce an attack state machine framework to systematically model problem translation and iterative reasoning, ensuring coherent query generation across multiple turns. Building on this framework, we design gain-guided exploration, self-play, and rejection feedback modules to preserve attack semantics, enhance effectiveness, and sustain reasoning-driven attack progression. Extensive experiments on multiple LLMs demonstrate that RACE achieves state-of-the-art attack effectiveness in complex conversational scenarios, with attack success rates (ASRs) increasing by up to 96%. Notably, our approach achieves ASRs of 82% and 92% against leading commercial models, OpenAI o1 and DeepSeek R1, underscoring its potency. We release our code at <a href="https://github.com/NY1024/RACE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> to facilitate further research in this critical domain. 
</p> </div> </dd> <dt> <a name='item257'>[257]</a> <a href ="/abs/2502.11090" title="Abstract" id="2502.11090"> arXiv:2502.11090 </a> (replaced) [<a href="/pdf/2502.11090" title="Download PDF" id="pdf-2502.11090" aria-labelledby="pdf-2502.11090">pdf</a>, <a href="https://arxiv.org/html/2502.11090v2" title="View HTML" id="html-2502.11090" aria-labelledby="html-2502.11090" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11090" title="Other formats" id="oth-2502.11090" aria-labelledby="oth-2502.11090">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+H">Hongye Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+S">Sijia Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Z">Ziyue Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Z">Zhixin Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Z">Zhe Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+M">Meng Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Fan Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Boyan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+T">Tianpei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jing Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fanyu Meng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+C">Chao Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Junlan Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM's capability of identifying and handling unsafe information in detail. To address these issues, we propose a fine-grained benchmark SafeDialBench for evaluating the safety of LLMs across various jailbreak attacks in multi-turn dialogues. Specifically, we design a two-tier hierarchical safety taxonomy that considers 6 safety dimensions and generates more than 4000 multi-turn dialogues in both Chinese and English under 22 dialogue scenarios. We employ 7 jailbreak attack strategies, such as reference attack and purpose reverse, to enhance the dataset quality for dialogue generation. Notably, we construct an innovative assessment framework of LLMs, measuring capabilities in detecting, and handling unsafe information and maintaining consistency when facing jailbreak attacks. Experimental results across 17 LLMs reveal that Yi-34B-Chat and GLM4-9B-Chat demonstrate superior safety performance, while Llama3.1-8B-Instruct and o3-mini exhibit safety vulnerabilities. 
</p> </div> </dd> <dt> <a name='item258'>[258]</a> <a href ="/abs/2502.11113" title="Abstract" id="2502.11113"> arXiv:2502.11113 </a> (replaced) [<a href="/pdf/2502.11113" title="Download PDF" id="pdf-2502.11113" aria-labelledby="pdf-2502.11113">pdf</a>, <a href="https://arxiv.org/html/2502.11113v2" title="View HTML" id="html-2502.11113" aria-labelledby="html-2502.11113" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11113" title="Other formats" id="oth-2502.11113" aria-labelledby="oth-2502.11113">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Valuable Hallucinations: Realizable Non-realistic Propositions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiucheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces the first formal definition of valuable hallucinations in large language models (LLMs), addressing a gap in the existing literature. We provide a systematic definition and analysis of hallucination value, proposing methods for enhancing the value of hallucinations. In contrast to previous works, which often treat hallucinations as a broad flaw, we focus on the potential value that certain types of hallucinations can offer in specific contexts. Hallucinations in LLMs generally refer to the generation of unfaithful, fabricated, inconsistent, or nonsensical content. 
Rather than viewing all hallucinations negatively, this paper gives formal representations and manual judgments of "valuable hallucinations" and explores how realizable non-realistic propositions--ideas that are not currently true but could be achievable under certain conditions--can have constructive value. We present experiments using the Qwen2.5 model and HalluQA dataset, employing ReAct prompting (which involves reasoning, confidence assessment, and answer verification) to control and optimize hallucinations. Our findings show that ReAct prompting results in a 5.12% reduction in overall hallucinations and an increase in the proportion of valuable hallucinations from 6.45% to 7.92%. These results demonstrate that systematically controlling hallucinations can improve their usefulness without compromising factual reliability. </p> </div> </dd> <dt> <a name='item259'>[259]</a> <a href ="/abs/2502.11177" title="Abstract" id="2502.11177"> arXiv:2502.11177 </a> (replaced) [<a href="/pdf/2502.11177" title="Download PDF" id="pdf-2502.11177" aria-labelledby="pdf-2502.11177">pdf</a>, <a href="/format/2502.11177" title="Other formats" id="oth-2502.11177" aria-labelledby="oth-2502.11177">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Mirage of Model Editing: Revisiting Evaluation in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+W">Wanli Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+F">Fei Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Jiajun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+X">Xinyu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Q">Qi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Dawei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite near-perfect results in artificial evaluations, the effectiveness of model editing in real-world applications remains unexplored. To bridge this gap, we propose to study model editing in question answering (QA) by establishing a rigorous evaluation practice to assess the effectiveness of editing methods in correcting LLMs' errors. It consists of QAEdit, a new benchmark derived from popular QA datasets, and a standardized evaluation framework. Our single editing experiments indicate that current editing methods perform substantially worse than previously reported (38.5% vs. ~96%). Through module analysis and controlled experiments, we demonstrate that this performance decline stems from issues in evaluation practices of prior editing research. One key issue is that the inappropriate use of teacher forcing in testing prevents error propagation by feeding ground truth tokens (inaccessible in real-world scenarios) as input. Furthermore, we simulate real-world deployment by sequential editing, revealing that current approaches fail drastically with only 1000 edits. Our analysis provides a fundamental reexamination of both the real-world applicability of existing model editing methods and their evaluation practices, and establishes a rigorous evaluation framework with key insights to advance reliable and practical model editing research. 
</p> </div> </dd> <dt> <a name='item260'>[260]</a> <a href ="/abs/2502.11364" title="Abstract" id="2502.11364"> arXiv:2502.11364 </a> (replaced) [<a href="/pdf/2502.11364" title="Download PDF" id="pdf-2502.11364" aria-labelledby="pdf-2502.11364">pdf</a>, <a href="/format/2502.11364" title="Other formats" id="oth-2502.11364" aria-labelledby="oth-2502.11364">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Blessing of Multilinguality: A Systematic Analysis of Multilingual In-Context Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Y">Yilei Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+A">Andrew Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+F">Freda Shi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While multilingual large language models generally perform adequately, and sometimes even rival English performance on high-resource languages (HRLs), they often significantly underperform on low-resource languages (LRLs). Among several prompting strategies aiming at bridging the gap, multilingual in-context learning (ICL) has been particularly effective when demonstration in target languages is unavailable. However, there lacks a systematic understanding of when and why it works well. <br>In this work, we systematically analyze multilingual ICL, using demonstrations in HRLs to enhance cross-lingual transfer. We show that demonstrations in mixed HRLs consistently outperform English-only ones across the board, particularly for tasks written in LRLs. Surprisingly, our ablation study shows that the presence of irrelevant non-English sentences in the prompt yields measurable gains, suggesting the effectiveness of multilingual exposure itself. 
Our results highlight the potential of strategically leveraging multilingual resources to bridge the performance gap for underrepresented languages. </p> </div> </dd> <dt> <a name='item261'>[261]</a> <a href ="/abs/2502.11735" title="Abstract" id="2502.11735"> arXiv:2502.11735 </a> (replaced) [<a href="/pdf/2502.11735" title="Download PDF" id="pdf-2502.11735" aria-labelledby="pdf-2502.11735">pdf</a>, <a href="https://arxiv.org/html/2502.11735v2" title="View HTML" id="html-2502.11735" aria-labelledby="html-2502.11735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11735" title="Other formats" id="oth-2502.11735" aria-labelledby="oth-2502.11735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MT-RAIG: Novel Benchmark and Evaluation Framework for Retrieval-Augmented Insight Generation over Multiple Tables </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+K">Kwangwook Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kwon,+D">Donguk Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongha Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in table-based reasoning have expanded beyond factoid-level QA to address insight-level tasks, where systems should synthesize implicit knowledge in the table to provide explainable analyses. Although effective, existing studies remain confined to scenarios where a single gold table is given alongside the user query, failing to address cases where users seek comprehensive insights from multiple unknown tables. 
To bridge these gaps, we propose MT-RAIG Bench, designed to evaluate systems on Retrieval-Augmented Insight Generation over Multiple Tables. Additionally, to tackle the suboptimality of existing automatic evaluation methods in the table domain, we further introduce a fine-grained evaluation framework MT-RAIG Eval, which achieves better alignment with human quality judgments on the generated insights. We conduct extensive experiments and reveal that even frontier LLMs still struggle with complex multi-table reasoning, establishing our MT-RAIG Bench as a challenging testbed for future research. </p> </div> </dd> <dt> <a name='item262'>[262]</a> <a href ="/abs/2502.11811" title="Abstract" id="2502.11811"> arXiv:2502.11811 </a> (replaced) [<a href="/pdf/2502.11811" title="Download PDF" id="pdf-2502.11811" aria-labelledby="pdf-2502.11811">pdf</a>, <a href="https://arxiv.org/html/2502.11811v2" title="View HTML" id="html-2502.11811" aria-labelledby="html-2502.11811" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11811" title="Other formats" id="oth-2502.11811" aria-labelledby="oth-2502.11811">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FineFilter: A Fine-grained Noise Filtering Mechanism for Retrieval-Augmented Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qianchi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hainan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hongwei Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+Y">Yongxin Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhiming Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieved documents containing noise will hinder Retrieval-Augmented Generation (RAG) from detecting answer clues, necessitating noise filtering mechanisms to enhance accuracy. Existing methods use re-ranking or summarization to identify the most relevant sentences, but directly and accurately locating answer clues from these large-scale and complex documents remains challenging. Unlike these document-level operations, we treat noise filtering as a sentence-level MinMax optimization problem: first identifying the potential clues from multiple documents using contextual information, then ranking them by relevance, and finally retaining the least clues through truncation. In this paper, we propose FineFilter, a novel fine-grained noise filtering mechanism for RAG consisting of a clue extractor, a re-ranker, and a truncator. We optimize each module to tackle complex reasoning challenges: (1) Clue extractor firstly uses sentences containing the answer and similar ones as fine-tuned targets, aiming at extracting sufficient potential clues; (2) Re-ranker is trained to prioritize effective clues based on the real feedback from generation module, with clues capable of generating correct answer as positive samples and others as negative; (3) Truncator takes the minimum clues needed to answer the question (truncation point) as fine-tuned targets, and performs truncation on the re-ranked clues to achieve fine-grained noise filtering. Experiments on three QA datasets demonstrate that FineFilter significantly outperforms baselines in terms of performance and inference cost. Further analysis on each module shows the effectiveness of our optimizations for complex reasoning. 
</p> </div> </dd> <dt> <a name='item263'>[263]</a> <a href ="/abs/2502.11874" title="Abstract" id="2502.11874"> arXiv:2502.11874 </a> (replaced) [<a href="/pdf/2502.11874" title="Download PDF" id="pdf-2502.11874" aria-labelledby="pdf-2502.11874">pdf</a>, <a href="https://arxiv.org/html/2502.11874v2" title="View HTML" id="html-2502.11874" aria-labelledby="html-2502.11874" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11874" title="Other formats" id="oth-2502.11874" aria-labelledby="oth-2502.11874">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VAQUUM: Are Vague Quantifiers Grounded in Visual Data? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+H+M">Hugh Mee Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nouwen,+R">Rick Nouwen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gatt,+A">Albert Gatt</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review, 12 pages for main paper (5 figures), 15 pages including appendix (2 figures) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Vague quantifiers such as "a few" and "many" are influenced by many contextual factors, including how many objects are present in a given context. In this work, we evaluate the extent to which vision-and-language models (VLMs) are compatible with humans when producing or judging the appropriateness of vague quantifiers in visual contexts. We release a novel dataset, VAQUUM, containing 20300 human ratings on quantified statements across a total of 1089 images. Using this dataset, we compare human judgments and VLM predictions using three different evaluation methods. 
Our findings show that VLMs, like humans, are influenced by object counts in vague quantifier use. However, we find significant inconsistencies across models in different evaluation settings, suggesting that judging and producing vague quantifiers rely on two different processes. </p> </div> </dd> <dt> <a name='item264'>[264]</a> <a href ="/abs/2502.11890" title="Abstract" id="2502.11890"> arXiv:2502.11890 </a> (replaced) [<a href="/pdf/2502.11890" title="Download PDF" id="pdf-2502.11890" aria-labelledby="pdf-2502.11890">pdf</a>, <a href="/format/2502.11890" title="Other formats" id="oth-2502.11890" aria-labelledby="oth-2502.11890">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting Classification Taxonomy for Grammatical Errors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+D">Deqing Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yulu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+B">Bingxu An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yong Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, 4 figures and 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Grammatical error classification 
plays a crucial role in language learning systems, but existing classification taxonomies often lack rigorous validation, leading to inconsistencies and unreliable feedback. In this paper, we revisit previous classification taxonomies for grammatical errors by introducing a systematic and qualitative evaluation framework. Our approach examines four aspects of a taxonomy, i.e., exclusivity, coverage, balance, and usability. Then, we construct a high-quality grammatical error classification dataset annotated with multiple classification taxonomies and evaluate them grounding on our proposed evaluation framework. Our experiments reveal the drawbacks of existing taxonomies. Our contributions aim to improve the precision and effectiveness of error analysis, providing more understandable and actionable feedback for language learners. </p> </div> </dd> <dt> <a name='item265'>[265]</a> <a href ="/abs/2502.11946" title="Abstract" id="2502.11946"> arXiv:2502.11946 </a> (replaced) [<a href="/pdf/2502.11946" title="Download PDF" id="pdf-2502.11946" aria-labelledby="pdf-2502.11946">pdf</a>, <a href="https://arxiv.org/html/2502.11946v2" title="View HTML" id="html-2502.11946" aria-labelledby="html-2502.11946" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11946" title="Other formats" id="oth-2502.11946" aria-labelledby="oth-2502.11946">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+A">Ailin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Boyong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bruce Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+C">Chao Yan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+C">Chen Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+F">Fei Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+F">Feiyu Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingbei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mingrui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Peng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+R">Ruihang Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=You,+W">Wang You</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xuerui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yechang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuxiang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Zheng Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zixin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hongyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Brian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengting Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jianchang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhen,+J">Jiangjie Zhen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Song Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bingxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Buyun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongyuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+W">Wei Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xuan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+X">Xiangwen Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yuankai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yuanwei Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mou,+Y">Yun Mou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmidi,+B">Bahtiyar Ahmidi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Changxin Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chenrun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Deshan Sun</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sai,+D">Dula Sai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gulin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haoyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+J">Jiahao Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junjing Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiashuai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiahong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jinguo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+L">Longlong Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+M">Menglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingxiao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+M">Mingyao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+N">Nie Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiling Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Q">Qinyuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+R">Ran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shuai,+S">Shuai Shuai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Shanshan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Shihong Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shilei Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianyu Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+W">Wenjin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+W">Wuxun Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+W">Weipeng Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wenqing He</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. 
On open-source benchmarks like LLaMA Question, it shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at <a href="https://github.com/stepfun-ai/Step-Audio" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item266'>[266]</a> <a href ="/abs/2202.07269" title="Abstract" id="2202.07269"> arXiv:2202.07269 </a> (replaced) [<a href="/pdf/2202.07269" title="Download PDF" id="pdf-2202.07269" aria-labelledby="pdf-2202.07269">pdf</a>, <a href="/format/2202.07269" title="Other formats" id="oth-2202.07269" aria-labelledby="oth-2202.07269">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Media Slant is Contagious </div> <div class='list-authors'><a href="https://arxiv.org/search/econ?searchtype=author&query=Widmer,+P">Philine Widmer</a>, <a href="https://arxiv.org/search/econ?searchtype=author&query=Meraim,+C+A">Clémentine Abed Meraim</a>, <a href="https://arxiv.org/search/econ?searchtype=author&query=Galletta,+S">Sergio Galletta</a>, <a href="https://arxiv.org/search/econ?searchtype=author&query=Ash,+E">Elliott Ash</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">General Economics (econ.GN)</span>; Computation and Language (cs.CL); Computers and Society (cs.CY) </div> <p class='mathjax'> This paper examines the diffusion of media slant. We document the influence of Fox News Channel (FNC) on the partisan slant of local newspapers in the U.S. over the years 1995-2008. We measure the political slant of local newspapers by scaling the news article texts to Republicans' and Democrats' speeches in Congress. Using channel positioning as an instrument for viewership, we find that higher FNC viewership causes local newspapers to adopt more right-wing slant. 
The effect emerges gradually, only several years after FNC's introduction, mirroring the channel's growing influence on voting behavior. A main driver of the shift in newspaper slant appears to be a change in local political preferences. </p> </div> </dd> <dt> <a name='item267'>[267]</a> <a href ="/abs/2308.15334" title="Abstract" id="2308.15334"> arXiv:2308.15334 </a> (replaced) [<a href="/pdf/2308.15334" title="Download PDF" id="pdf-2308.15334" aria-labelledby="pdf-2308.15334">pdf</a>, <a href="https://arxiv.org/html/2308.15334v3" title="View HTML" id="html-2308.15334" aria-labelledby="html-2308.15334" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2308.15334" title="Other formats" id="oth-2308.15334" aria-labelledby="oth-2308.15334">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Responsible Development of Automated Student Feedback with Generative AI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lindsay,+E+D">Euan D Lindsay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mike Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Johri,+A">Aditya Johri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Pre-print of version accepted to EDUCON 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Providing rich, constructive feedback to students is essential for supporting and enhancing their learning. 
Recent advancements in Generative Artificial Intelligence (AI), particularly with large language models (LLMs), present new opportunities to deliver scalable, repeatable, and instant feedback, effectively making abundant a resource that has historically been scarce and costly. From a technical perspective, this approach is now feasible due to breakthroughs in AI and Natural Language Processing (NLP). While the potential educational benefits are compelling, implementing these technologies also introduces a host of ethical considerations that must be thoughtfully addressed. One of the core advantages of AI systems is their ability to automate routine and mundane tasks, potentially freeing up human educators for more nuanced work. However, the ease of automation risks a ``tyranny of the majority'', where the diverse needs of minority or unique learners are overlooked, as they may be harder to systematize and less straightforward to accommodate. Ensuring inclusivity and equity in AI-generated feedback, therefore, becomes a critical aspect of responsible AI implementation in education. The process of developing machine learning models that produce valuable, personalized, and authentic feedback also requires significant input from human domain experts. Decisions around whose expertise is incorporated, how it is captured, and when it is applied have profound implications for the relevance and quality of the resulting feedback. Additionally, the maintenance and continuous refinement of these models are necessary to adapt feedback to evolving contextual, theoretical, and student-related factors. Without ongoing adaptation, feedback risks becoming obsolete or mismatched with the current needs of diverse student populations [...] 
</p> </div> </dd> <dt> <a name='item268'>[268]</a> <a href ="/abs/2401.10747" title="Abstract" id="2401.10747"> arXiv:2401.10747 </a> (replaced) [<a href="/pdf/2401.10747" title="Download PDF" id="pdf-2401.10747" aria-labelledby="pdf-2401.10747">pdf</a>, <a href="https://arxiv.org/html/2401.10747v4" title="View HTML" id="html-2401.10747" aria-labelledby="html-2401.10747" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2401.10747" title="Other formats" id="oth-2401.10747" aria-labelledby="oth-2401.10747">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multimodal Sentiment Analysis with Missing Modality: A Knowledge-Transfer Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Weide Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+H">Huijing Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+F">Fengmao Lv</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We request to withdraw our paper from the archive due to significant errors identified in the analysis and conclusions. Upon further review, we realized that these errors undermine the validity of our findings. We plan to conduct additional research to correct these issues and resubmit a revised version in the future </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Multimodal sentiment analysis aims to identify the emotions expressed by individuals through visual, language, and acoustic cues. 
However, most of the existing research efforts assume that all modalities are available during both training and testing, making their algorithms susceptible to the missing modality scenario. In this paper, we propose a novel knowledge-transfer network to translate between different modalities to reconstruct the missing audio modalities. Moreover, we develop a cross-modality attention mechanism to retain the maximal information of the reconstructed and observed modalities for sentiment prediction. Extensive experiments on three publicly available datasets demonstrate significant improvements over baselines and achieve comparable results to the previous methods with complete multi-modality supervision. </p> </div> </dd> <dt> <a name='item269'>[269]</a> <a href ="/abs/2404.08672" title="Abstract" id="2404.08672"> arXiv:2404.08672 </a> (replaced) [<a href="/pdf/2404.08672" title="Download PDF" id="pdf-2404.08672" aria-labelledby="pdf-2404.08672">pdf</a>, <a href="https://arxiv.org/html/2404.08672v2" title="View HTML" id="html-2404.08672" aria-labelledby="html-2404.08672" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.08672" title="Other formats" id="oth-2404.08672" aria-labelledby="oth-2404.08672">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Taxonomy and Analysis of Sensitive User Queries in Generative AI Search </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jo,+H">Hwiyeol Jo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+T">Taiwoo Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hyunwoo Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+N">Nayoung Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+C">Changbong Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kwon,+O">Ohjoon Kwon</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Jeon,+D">Donghyeon Jeon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+E">Eui-Hyeon Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shin,+K">Kyoungho Shin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+S+S">Sun Suk Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+K">Kyungmi Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jihye Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sun Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025(Findings) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computers and Society (cs.CY); Machine Learning (cs.LG) </div> <p class='mathjax'> Although there has been a growing interest among industries in integrating generative LLMs into their services, limited experience and scarcity of resources act as a barrier in launching and servicing large-scale LLM-based services. In this paper, we share our experiences in developing and operating generative AI models within a national-scale search engine, with a specific focus on the sensitiveness of user queries. We propose a taxonomy for sensitive search queries, outline our approaches, and present a comprehensive analysis report on sensitive queries from actual users. We believe that our experiences in launching generative AI search systems can contribute to reducing the barrier in building generative LLM-based services. 
</p> </div> </dd> <dt> <a name='item270'>[270]</a> <a href ="/abs/2406.13348" title="Abstract" id="2406.13348"> arXiv:2406.13348 </a> (replaced) [<a href="/pdf/2406.13348" title="Download PDF" id="pdf-2406.13348" aria-labelledby="pdf-2406.13348">pdf</a>, <a href="https://arxiv.org/html/2406.13348v2" title="View HTML" id="html-2406.13348" aria-labelledby="html-2406.13348" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.13348" title="Other formats" id="oth-2406.13348" aria-labelledby="oth-2406.13348">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Textual Unlearning Gives a False Sense of Unlearning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+J">Jiacheng Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhibo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+X">Xiaoyi Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Jiahui Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+K">Kui Ren</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Language Models (LMs) are prone to ''memorizing'' training data, including substantial sensitive user information. To mitigate privacy risks and safeguard the right to be forgotten, machine unlearning has emerged as a promising approach for enabling LMs to efficiently ''forget'' specific texts. However, despite the good intentions, is textual unlearning really as effective and reliable as expected? 
To address the concern, we first propose Unlearning Likelihood Ratio Attack+ (U-LiRA+), a rigorous textual unlearning auditing method, and find that unlearned texts can still be detected with very high confidence after unlearning. Further, we conduct an in-depth investigation on the privacy risks of textual unlearning mechanisms in deployment and present the Textual Unlearning Leakage Attack (TULA), along with its variants in both black- and white-box scenarios. We show that textual unlearning mechanisms could instead reveal more about the unlearned texts, exposing them to significant membership inference and data reconstruction risks. Our findings highlight that existing textual unlearning actually gives a false sense of unlearning, underscoring the need for more robust and secure unlearning mechanisms. </p> </div> </dd> <dt> <a name='item271'>[271]</a> <a href ="/abs/2407.13399" title="Abstract" id="2407.13399"> arXiv:2407.13399 </a> (replaced) [<a href="/pdf/2407.13399" title="Download PDF" id="pdf-2407.13399" aria-labelledby="pdf-2407.13399">pdf</a>, <a href="https://arxiv.org/html/2407.13399v3" title="View HTML" id="html-2407.13399" aria-labelledby="html-2407.13399" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.13399" title="Other formats" id="oth-2407.13399" aria-labelledby="oth-2407.13399">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Correcting the Mythos of KL-Regularization: Direct Alignment without Overoptimization via Chi-Squared Preference Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+A">Audrey Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+W">Wenhao Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+T">Tengyang Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J+D">Jason D. 
Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Wen Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishnamurthy,+A">Akshay Krishnamurthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Foster,+D+J">Dylan J. Foster</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Language model alignment methods such as reinforcement learning from human feedback (RLHF) have led to impressive advances in language model capabilities, but are limited by a widely observed phenomenon known as overoptimization, where the quality of the language model degrades over the course of the alignment process. As the model optimizes performance with respect to an offline reward model, it overfits to inaccuracies and drifts away from preferred responses covered by the data. To discourage such distribution shift, KL-regularization is widely employed in existing offline alignment methods, but overoptimization continues to harm performance. Lending theoretical insight into the source of these empirical observations, we first show that the KL-regularization is too weak to prevent overfitting, then raise the following question: is it possible to design an efficient algorithm that is provably robust to overoptimization? <br>We address this question with a new algorithm for offline alignment, $\chi^2$-Preference Optimization ($\chi$PO). $\chi$PO is a one-line change to Direct Preference Optimization (DPO; Rafailov et al., 2023), which only involves modifying the logarithmic link function in the DPO objective. 
Despite this minimal change, $\chi$PO implicitly implements the principle of pessimism in the face of uncertainty via regularization with the $\chi^2$-divergence -- which quantifies uncertainty more effectively than KL-regularization -- and provably alleviates overoptimization, achieving sample-complexity guarantees based on single-policy concentrability -- the gold standard in offline reinforcement learning. $\chi$PO's simplicity and strong guarantees make it the first practical and general-purpose offline alignment algorithm that is provably robust to overoptimization. </p> </div> </dd> <dt> <a name='item272'>[272]</a> <a href ="/abs/2407.20756" title="Abstract" id="2407.20756"> arXiv:2407.20756 </a> (replaced) [<a href="/pdf/2407.20756" title="Download PDF" id="pdf-2407.20756" aria-labelledby="pdf-2407.20756">pdf</a>, <a href="https://arxiv.org/html/2407.20756v4" title="View HTML" id="html-2407.20756" aria-labelledby="html-2407.20756" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.20756" title="Other formats" id="oth-2407.20756" aria-labelledby="oth-2407.20756">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+H">Hao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bozhou Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+T">Tianyi Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+W">Wentao Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Conghui He</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wentao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+B">Bin Cui</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Vision-Language Models (VLMs) have recently emerged, demonstrating remarkable vision-understanding capabilities. However, training these models requires large-scale datasets, which brings challenges related to efficiency, effectiveness, quality, and privacy of web data. In this paper, we introduce SynthVLM, a novel data synthesis and curation method for generating image-caption pairs. Unlike traditional methods, where captions are generated from images, SynthVLM utilizes advanced diffusion models and high-quality captions to automatically synthesize and select high-resolution images from text descriptions, thereby creating precisely aligned image-text pairs. To demonstrate the power of SynthVLM, we introduce SynthVLM-100K, a high-quality dataset consisting of 100,000 curated and synthesized image-caption pairs. In both model and human evaluations, SynthVLM-100K outperforms traditional real-world datasets. Leveraging this dataset, we develop a new family of multimodal large language models (MLLMs), SynthVLM-7B and SynthVLM-13B, which achieve state-of-the-art (SOTA) performance on various vision question-answering (VQA) tasks. Notably, our models outperform LLaVA across most metrics with only 18\% pretrain data. Furthermore, SynthVLM-7B and SynthVLM-13B attain SOTA performance on the MMLU benchmark, demonstrating that the high-quality SynthVLM-100K dataset preserves language abilities. 
To facilitate future research, our dataset and the complete data generating and curating methods are open-sourced at <a href="https://github.com/starriver030515/SynthVLM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item273'>[273]</a> <a href ="/abs/2408.10774" title="Abstract" id="2408.10774"> arXiv:2408.10774 </a> (replaced) [<a href="/pdf/2408.10774" title="Download PDF" id="pdf-2408.10774" aria-labelledby="pdf-2408.10774">pdf</a>, <a href="https://arxiv.org/html/2408.10774v3" title="View HTML" id="html-2408.10774" aria-labelledby="html-2408.10774" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.10774" title="Other formats" id="oth-2408.10774" aria-labelledby="oth-2408.10774">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Flexora: Flexible Low Rank Adaptation for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+C">Chenxing Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shu,+Y">Yao Shu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y+T">Ying Tiffany He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F+R">Fei Richard Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 39 pages, 15 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) are driving advancements in artificial intelligence by increasing the scale of model parameters, which has significantly enhanced generalization ability and unlocked new capabilities in practice. 
However, their performance in specific downstream tasks is usually hindered by their knowledge boundaries on these tasks. Thus, fine-tuning techniques, especially the widely used Low-Rank Adaptation (LoRA) method, have been introduced to expand the boundaries on these tasks, whereas LoRA would underperform on certain tasks owing to its potential overfitting on these tasks. To overcome this overfitting and improve the performance of LoRA, we propose the flexible low rank adaptation (Flexora) method to automatically and flexibly select the most important layers needing to be fine-tuned to achieve the best performance on different downstream tasks. Specifically, Flexora firstly frames this layer selection problem as a well-defined hyperparameter optimization (HPO) problem, then addresses it using the unrolled differentiation (UD) method, and finally selects the most useful layers based on the optimized hyperparameters. Our extensive experiments on many pretrained models and natural language tasks show that Flexora is able to consistently improve over the existing baselines, indicating the effectiveness of our Flexora in practice. We additionally provide insightful theoretical results and many ablation studies to deliver a comprehensive understanding of our Flexora. 
</p> </div> </dd> <dt> <a name='item274'>[274]</a> <a href ="/abs/2409.05907" title="Abstract" id="2409.05907"> arXiv:2409.05907 </a> (replaced) [<a href="/pdf/2409.05907" title="Download PDF" id="pdf-2409.05907" aria-labelledby="pdf-2409.05907">pdf</a>, <a href="https://arxiv.org/html/2409.05907v3" title="View HTML" id="html-2409.05907" aria-labelledby="html-2409.05907" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05907" title="Other formats" id="oth-2409.05907" aria-labelledby="oth-2409.05907">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Programming Refusal with Conditional Activation Steering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+B+W">Bruce W. Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Padhi,+I">Inkit Padhi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramamurthy,+K+N">Karthikeyan Natesan Ramamurthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miehling,+E">Erik Miehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dognin,+P">Pierre Dognin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nagireddy,+M">Manish Nagireddy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhurandhar,+A">Amit Dhurandhar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025, Spotlight </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> LLMs have shown remarkable capabilities, but precisely controlling their response behavior remains challenging. 
Existing activation steering methods alter LLM behavior indiscriminately, limiting their practical applicability in settings where selective responses are essential, such as content moderation or domain-specific assistants. In this paper, we propose Conditional Activation Steering (CAST), which analyzes LLM activation patterns during inference to selectively apply or withhold activation steering based on the input context. Our method is based on the observation that different categories of prompts activate distinct patterns in the model's hidden states. Using CAST, one can systematically control LLM behavior with rules like "if input is about hate speech or adult content, then refuse" or "if input is not about legal advice, then refuse." This allows for selective modification of responses to specific content while maintaining normal responses to other content, all without requiring weight optimization. We release an open-source implementation of our framework at <a href="http://github.com/IBM/activation-steering" rel="external noopener nofollow" class="link-external link-http">this http URL</a> . </p> </div> </dd> <dt> <a name='item275'>[275]</a> <a href ="/abs/2409.08202" title="Abstract" id="2409.08202"> arXiv:2409.08202 </a> (replaced) [<a href="/pdf/2409.08202" title="Download PDF" id="pdf-2409.08202" aria-labelledby="pdf-2409.08202">pdf</a>, <a href="https://arxiv.org/html/2409.08202v2" title="View HTML" id="html-2409.08202" aria-labelledby="html-2409.08202" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.08202" title="Other formats" id="oth-2409.08202" aria-labelledby="oth-2409.08202">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> What Makes a Maze Look Like a Maze? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+J">Joy Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+J">Jiayuan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tenenbaum,+J+B">Joshua B. Tenenbaum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goodman,+N+D">Noah D. Goodman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiajun Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> A unique aspect of human visual understanding is the ability to flexibly interpret abstract concepts: acquiring lifted rules explaining what they symbolize, grounding them across familiar and unfamiliar contexts, and making predictions or reasoning about them. While off-the-shelf vision-language models excel at making literal interpretations of images (e.g., recognizing object categories such as tree branches), they still struggle to make sense of such visual abstractions (e.g., how an arrangement of tree branches may form the walls of a maze). To address this challenge, we introduce Deep Schema Grounding (DSG), a framework that leverages explicit structured representations of visual abstractions for grounding and reasoning. At the core of DSG are schemas--dependency graph descriptions of abstract concepts that decompose them into more primitive-level symbols. DSG uses large language models to extract schemas, then hierarchically grounds concrete to abstract components of the schema onto images with vision-language models. The grounded schema is used to augment visual abstraction understanding. 
We systematically evaluate DSG and different methods in reasoning on our new Visual Abstractions Dataset, which consists of diverse, real-world images of abstract concepts and corresponding question-answer pairs labeled by humans. We show that DSG significantly improves the abstract visual reasoning performance of vision-language models, and is a step toward human-aligned understanding of visual abstractions. </p> </div> </dd> <dt> <a name='item276'>[276]</a> <a href ="/abs/2409.13931" title="Abstract" id="2409.13931"> arXiv:2409.13931 </a> (replaced) [<a href="/pdf/2409.13931" title="Download PDF" id="pdf-2409.13931" aria-labelledby="pdf-2409.13931">pdf</a>, <a href="https://arxiv.org/html/2409.13931v3" title="View HTML" id="html-2409.13931" aria-labelledby="html-2409.13931" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13931" title="Other formats" id="oth-2409.13931" aria-labelledby="oth-2409.13931">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On-Device Collaborative Language Modeling via a Mixture of Generalists and Specialists </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+D">Dongyang Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Messmer,+B">Bettina Messmer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Doikov,+N">Nikita Doikov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jaggi,+M">Martin Jaggi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> On-device LLMs have gained increasing attention for their ability to enhance privacy and provide a personalized user experience. To facilitate private learning with scarce data, Federated Learning has become a standard approach. 
However, it faces challenges such as computational resource heterogeneity and data heterogeneity among end users. We propose CoMiGS ($\textbf{Co}$llaborative learning with a $\textbf{Mi}$xture of $\textbf{G}$eneralists and $\textbf{S}$pecialists), the first approach to address both challenges. A key innovation of our method is the bi-level optimization formulation of the Mixture-of-Experts learning objective, where the router is optimized using a separate validation set to ensure alignment with the target distribution. We solve our objective with alternating minimization, for which we provide a theoretical analysis. Our method shares generalist experts across users while localizing a varying number of specialist experts, thereby adapting to users' computational resources and preserving privacy. Through extensive experiments, we show CoMiGS effectively balances general and personalized knowledge for each token generation. We demonstrate that CoMiGS remains robust against overfitting-due to the generalists' regularizing effect-while adapting to local data through specialist expertise. We open source our codebase for collaborative LLMs. 
</p> </div> </dd> <dt> <a name='item277'>[277]</a> <a href ="/abs/2409.17513" title="Abstract" id="2409.17513"> arXiv:2409.17513 </a> (replaced) [<a href="/pdf/2409.17513" title="Download PDF" id="pdf-2409.17513" aria-labelledby="pdf-2409.17513">pdf</a>, <a href="https://arxiv.org/html/2409.17513v2" title="View HTML" id="html-2409.17513" aria-labelledby="html-2409.17513" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.17513" title="Other formats" id="oth-2409.17513" aria-labelledby="oth-2409.17513">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Comparing Unidirectional, Bidirectional, and Word2vec Models for Discovering Vulnerabilities in Compiled Lifted Code </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=McCully,+G+A">Gary A. McCully</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hastings,+J+D">John D. Hastings</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+S">Shengjie Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fortier,+A">Adam Fortier</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Software Engineering (cs.SE) </div> <p class='mathjax'> Ransomware and other forms of malware cause significant financial and operational damage to organizations by exploiting long-standing and often difficult-to-detect software vulnerabilities. To detect vulnerabilities such as buffer overflows in compiled code, this research investigates the application of unidirectional transformer-based embeddings, specifically GPT-2. 
Using a dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, which were subsequently used to build LSTM neural networks to differentiate between vulnerable and non-vulnerable code. Our study reveals that embeddings from the GPT-2 model significantly outperform those from bidirectional models of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. LSTM neural networks were developed with both frozen and unfrozen embedding model layers. The model with the highest performance was achieved when the embedding layers were unfrozen. Further, the research finds that, in exploring the impact of different optimizers within this domain, the SGD optimizer demonstrates superior performance over Adam. Overall, these findings reveal important insights into the potential of unidirectional transformer-based approaches in enhancing cybersecurity defenses. </p> </div> </dd> <dt> <a name='item278'>[278]</a> <a href ="/abs/2410.02197" title="Abstract" id="2410.02197"> arXiv:2410.02197 </a> (replaced) [<a href="/pdf/2410.02197" title="Download PDF" id="pdf-2410.02197" aria-labelledby="pdf-2410.02197">pdf</a>, <a href="https://arxiv.org/html/2410.02197v2" title="View HTML" id="html-2410.02197" aria-labelledby="html-2410.02197" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02197" title="Other formats" id="oth-2410.02197" aria-labelledby="oth-2410.02197">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Bradley-Terry Models: A General Preference Model for Language Model Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yifan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+G">Ge Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yue Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kangping Xu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+Q">Quanquan Gu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Modeling human preferences is crucial for aligning foundation models with human values. Traditional reward modeling methods, such as the Bradley-Terry (BT) reward model, fall short in expressiveness, particularly in addressing intransitive preferences. In this paper, we introduce preference embedding, an approach that embeds responses into a latent space to capture intricate preference structures efficiently, achieving linear query complexity. Additionally, we propose preference score-based General Preference Optimization (GPO), which generalizes reward-based reinforcement learning from human feedback (RLHF). Experimental results show that our General Preference embedding Model (GPM) consistently outperforms the BT reward model on the RewardBench benchmark and effectively models cyclic preferences where any BT reward model behaves like a random guess. Furthermore, evaluations on downstream tasks such as AlpacaEval2.0, following the language model post-training with GPO and our general preference model, reveal performance improvements over BT models. These findings indicate that our method may enhance the alignment of foundation models with nuanced human values. The code is available at <a href="https://github.com/general-preference/general-preference-model" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item279'>[279]</a> <a href ="/abs/2410.12694" title="Abstract" id="2410.12694"> arXiv:2410.12694 </a> (replaced) [<a href="/pdf/2410.12694" title="Download PDF" id="pdf-2410.12694" aria-labelledby="pdf-2410.12694">pdf</a>, <a href="https://arxiv.org/html/2410.12694v2" title="View HTML" id="html-2410.12694" aria-labelledby="html-2410.12694" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12694" title="Other formats" id="oth-2410.12694" aria-labelledby="oth-2410.12694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VividMed: Vision Language Model with Versatile Visual Grounding for Medicine </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+L">Lingxiao Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+B">Bingda Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xuanzhong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+R">Rong Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Ting Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable promise in generating visually grounded responses. However, their application in the medical domain is hindered by unique challenges. For instance, most VLMs rely on a single method of visual grounding, whereas complex medical tasks demand more versatile approaches. Additionally, while most VLMs process only 2D images, a large portion of medical images are 3D. The lack of medical data further compounds these obstacles. 
To address these challenges, we present VividMed, a vision language model with versatile visual grounding for medicine. Our model supports generating both semantic segmentation masks and instance-level bounding boxes, and accommodates various imaging modalities, including both 2D and 3D data. We design a three-stage training procedure and an automatic data synthesis pipeline based on open datasets and models. Besides visual grounding tasks, VividMed also excels in other common downstream tasks, including Visual Question Answering (VQA) and report generation. Ablation studies empirically show that the integration of visual grounding ability leads to improved performance on these tasks. Our code is publicly available at <a href="https://github.com/function2-llx/MMMM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item280'>[280]</a> <a href ="/abs/2410.16077" title="Abstract" id="2410.16077"> arXiv:2410.16077 </a> (replaced) [<a href="/pdf/2410.16077" title="Download PDF" id="pdf-2410.16077" aria-labelledby="pdf-2410.16077">pdf</a>, <a href="https://arxiv.org/html/2410.16077v3" title="View HTML" id="html-2410.16077" aria-labelledby="html-2410.16077" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16077" title="Other formats" id="oth-2410.16077" aria-labelledby="oth-2410.16077">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CartesianMoE: Boosting Knowledge Sharing among Experts via Cartesian Product Routing in Mixture-of-Experts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Z">Zhenpeng Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xing Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zijia Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+Y">Yizhe Xiong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+M">Minxuan Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+G">Guangyuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Songlin Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+G">Guiguang Ding</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025 Main </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLM) have been attracting much attention from the community recently, due to their remarkable performance in all kinds of downstream tasks. According to the well-known scaling law, scaling up a dense LLM enhances its capabilities, but also significantly increases the computational complexity. Mixture-of-Experts (MoE) models address that by allowing the model size to grow without substantially raising training or inference costs. Yet MoE models face challenges regarding knowledge sharing among experts, making their performance somehow sensitive to routing accuracy. To tackle that, previous works introduced shared experts and combined their outputs with those of the top $K$ routed experts in an ``addition'' manner. In this paper, inspired by collective matrix factorization to learn shared knowledge among data, we propose CartesianMoE, which implements more effective knowledge sharing among experts in more like a ``multiplication'' manner. Extensive experimental results indicate that CartesianMoE outperforms previous MoE models for building LLMs, in terms of both perplexity and downstream task performance. And we also find that CartesianMoE achieves better expert routing robustness. 
</p> </div> </dd> <dt> <a name='item281'>[281]</a> <a href ="/abs/2410.16676" title="Abstract" id="2410.16676"> arXiv:2410.16676 </a> (replaced) [<a href="/pdf/2410.16676" title="Download PDF" id="pdf-2410.16676" aria-labelledby="pdf-2410.16676">pdf</a>, <a href="https://arxiv.org/html/2410.16676v4" title="View HTML" id="html-2410.16676" aria-labelledby="html-2410.16676" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16676" title="Other formats" id="oth-2410.16676" aria-labelledby="oth-2410.16676">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CausalEval: Towards Better Causal Reasoning in Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+L">Longxuan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+D">Delin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+S">Siheng Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qingyang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qingzhen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dawei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhikai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoze Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Liangming Pan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL25 (main) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Causal reasoning (CR) is a crucial aspect of intelligence, essential for problem-solving, decision-making, and understanding the world. 
While language models (LMs) can generate rationales for their outputs, their ability to reliably perform causal reasoning remains uncertain, often falling short in tasks requiring a deep understanding of causality. In this paper, we introduce CausalEval, a comprehensive review of research aimed at enhancing LMs for causal reasoning, coupled with an empirical evaluation of current models and methods. We categorize existing methods based on the role of LMs: either as reasoning engines or as helpers providing knowledge or data to traditional CR methods, followed by a detailed discussion of methodologies in each category. We then assess the performance of current LMs and various enhancement methods on a range of causal reasoning tasks, providing key findings and in-depth analysis. Finally, we present insights from current studies and highlight promising directions for future research. We aim for this work to serve as a comprehensive resource, fostering further advancements in causal reasoning with LMs. 
</p> </div> </dd> <dt> <a name='item282'>[282]</a> <a href ="/abs/2411.10545" title="Abstract" id="2411.10545"> arXiv:2411.10545 </a> (replaced) [<a href="/pdf/2411.10545" title="Download PDF" id="pdf-2411.10545" aria-labelledby="pdf-2411.10545">pdf</a>, <a href="https://arxiv.org/html/2411.10545v2" title="View HTML" id="html-2411.10545" aria-labelledby="html-2411.10545" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.10545" title="Other formats" id="oth-2411.10545" aria-labelledby="oth-2411.10545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient Alignment of Large Language Models via Data Sampling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khera,+A">Amrit Khera</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghosh,+R">Rajat Ghosh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dutta,+D">Debojyoti Dutta</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Original work accepted at NeurIPS Efficient Natural Language and Speech Processing Workshop. PMLR, 2024. Experiments with a larger model from a different family, Llama-30B have been added to the appendix for generalizability </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> LLM alignment ensures that large language models behave safely and effectively by aligning their outputs with human values, goals, and intentions. Aligning LLMs employ huge amounts of data, computation, and time. Moreover, curating data with human feedback is expensive and takes time. Recent research depicts the benefit of data engineering in the fine-tuning and pre-training paradigms to bring down such costs. 
However, alignment differs from the aforementioned paradigms and it is unclear if data-efficient alignment is feasible. In this work, we first aim to understand how the performance of LLM alignment scales with data. We find that LLM alignment performance follows an exponential plateau pattern which tapers off after a rapid initial increase. Based on this, we identify data subsampling as a viable method to reduce resources required for alignment. Further, we propose an information theory-based methodology for efficient alignment by identifying a small high-quality subset, thereby reducing the computation and time required by alignment. We evaluate the proposed methodology over multiple datasets and compare the results. We find that the model aligned using our proposed methodology outperforms other sampling methods and performs comparably to the model aligned with the full dataset while using less than 10% data, leading to greater than 90% savings in costs, resources, and faster LLM alignment. 
</p> </div> </dd> <dt> <a name='item283'>[283]</a> <a href ="/abs/2412.01269" title="Abstract" id="2412.01269"> arXiv:2412.01269 </a> (replaced) [<a href="/pdf/2412.01269" title="Download PDF" id="pdf-2412.01269" aria-labelledby="pdf-2412.01269">pdf</a>, <a href="https://arxiv.org/html/2412.01269v5" title="View HTML" id="html-2412.01269" aria-labelledby="html-2412.01269" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.01269" title="Other formats" id="oth-2412.01269" aria-labelledby="oth-2412.01269">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CPRM: A LLM-based Continual Pre-training Framework for Relevance Modeling in Commercial Search </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+K">Kaixin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+Y">Yixin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zeyuan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Q">Qiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Cunxiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+B">Baijun Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jia Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhongyi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jinjie Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mo,+L">Linjian Mo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial 
Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Relevance modeling between queries and items stands as a pivotal component in commercial search engines, directly affecting the user experience. Given the remarkable achievements of large language models (LLMs) in various natural language processing (NLP) tasks, LLM-based relevance modeling is gradually being adopted within industrial search systems. Nevertheless, foundational LLMs lack domain-specific knowledge and do not fully exploit the potential of in-context learning. Furthermore, structured item text remains underutilized, and there is a shortage in the supply of corresponding queries and background knowledge. We thereby propose CPRM (Continual Pre-training for Relevance Modeling), a framework designed for the continual pre-training of LLMs to address these issues. Our CPRM framework includes three modules: 1) employing both queries and multi-field item to jointly pre-train for enhancing domain knowledge, 2) applying in-context pre-training, a novel approach where LLMs are pre-trained on a sequence of related queries or items, and 3) conducting reading comprehension on items to produce associated domain knowledge and background information (e.g., generating summaries and corresponding queries) to further strengthen LLMs. Results on offline experiments and online A/B testing demonstrate that our model achieves convincing performance compared to strong baselines. 
</p> </div> </dd> <dt> <a name='item284'>[284]</a> <a href ="/abs/2412.20694" title="Abstract" id="2412.20694"> arXiv:2412.20694 </a> (replaced) [<a href="/pdf/2412.20694" title="Download PDF" id="pdf-2412.20694" aria-labelledby="pdf-2412.20694">pdf</a>, <a href="/format/2412.20694" title="Other formats" id="oth-2412.20694" aria-labelledby="oth-2412.20694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QUBE: Enhancing Automatic Heuristic Design via Quality-Uncertainty Balanced Evolution </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zijie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhanchao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Renjun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Lili Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+Z">Zhenzhong Lan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Neural and Evolutionary Computing (cs.NE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Solving NP-hard problems traditionally relies on heuristics, yet manually designing effective heuristics for complex problems remains a significant challenge. While recent advancements like FunSearch have shown that large language models (LLMs) can be integrated into evolutionary algorithms (EAs) for heuristic design, their potential is hindered by limitations in balancing exploitation and exploration. We introduce Quality-Uncertainty Balanced Evolution (QUBE), a novel approach that enhances LLM+EA methods by redefining the priority criterion within the FunSearch framework. 
QUBE employs the Quality-Uncertainty Trade-off Criterion (QUTC), based on our proposed Uncertainty-Inclusive Quality metric, to evaluate and guide the evolutionary process. Through extensive experiments on challenging NP-complete problems, QUBE demonstrates significant performance improvements over FunSearch and baseline methods. Our code is available at <a href="https://github.com/zzjchen/QUBE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>\_code. </p> </div> </dd> <dt> <a name='item285'>[285]</a> <a href ="/abs/2501.03936" title="Abstract" id="2501.03936"> arXiv:2501.03936 </a> (replaced) [<a href="/pdf/2501.03936" title="Download PDF" id="pdf-2501.03936" aria-labelledby="pdf-2501.03936">pdf</a>, <a href="https://arxiv.org/html/2501.03936v2" title="View HTML" id="html-2501.03936" aria-labelledby="html-2501.03936" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03936" title="Other formats" id="oth-2501.03936" aria-labelledby="oth-2501.03936">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xinyan Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+H">Hao Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+J">Jia Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Weixiang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Ben He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xianpei 
Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 23 figures, see <a href="https://github.com/icip-cas/PPTAgent" rel="external noopener nofollow" class="link-external link-https">this https URL</a> for details </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Automatically generating presentations from documents is a challenging task that requires accommodating content quality, visual appeal, and structural coherence. Existing methods primarily focus on improving and evaluating the content quality in isolation, overlooking visual appeal and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to extract slide-level functional types and content schemas, then drafts an outline and iteratively generates editing actions based on selected reference slides to create new slides. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Results demonstrate that PPTAgent significantly outperforms existing automatic presentation generation methods across all three dimensions. 
</p> </div> </dd> <dt> <a name='item286'>[286]</a> <a href ="/abs/2501.10674" title="Abstract" id="2501.10674"> arXiv:2501.10674 </a> (replaced) [<a href="/pdf/2501.10674" title="Download PDF" id="pdf-2501.10674" aria-labelledby="pdf-2501.10674">pdf</a>, <a href="https://arxiv.org/html/2501.10674v2" title="View HTML" id="html-2501.10674" aria-labelledby="html-2501.10674" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.10674" title="Other formats" id="oth-2501.10674" aria-labelledby="oth-2501.10674">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Multimodal LLMs do Visual Temporal Understanding and Reasoning? The answer is No! </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Imam,+M+F">Mohamed Fazli Imam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+C">Chenyang Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Our dataset can be found at \url{<a href="https://huggingface.co/datasets/fazliimam/temporal-vqa" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) have achieved significant advancements in tasks like Visual Question Answering (VQA) by leveraging foundational Large Language Models (LLMs). However, their abilities in specific areas such as visual temporal understanding, which is crucial for comprehending real-world dynamics, remain underexplored. 
To address this, we propose a challenging evaluation benchmark named TemporalVQA, consisting of two parts: 1) Temporal Order Understanding and 2) Time-lapse Estimation. The first part requires MLLMs to determine the sequence of events by analyzing temporally consecutive video frames. The second part presents image pairs with varying time differences, framed as multiple-choice questions, asking MLLMs to estimate the time-lapse between images with options ranging from seconds to years. Our evaluations of advanced MLLMs, including models like GPT-4o and Gemini-1.5-Pro, reveal significant challenges: GPT-4o achieved only 49.1% average consistent accuracy in the temporal order task and 70% in time-lapse estimation, with open-source models performing even more poorly. These findings underscore the limitations of current MLLMs in visual temporal understanding and reasoning, highlighting the need for further improvements in their temporal capability. Our dataset can be found at <a href="https://huggingface.co/datasets/fazliimam/temporal-vqa" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item287'>[287]</a> <a href ="/abs/2502.00883" title="Abstract" id="2502.00883"> arXiv:2502.00883 </a> (replaced) [<a href="/pdf/2502.00883" title="Download PDF" id="pdf-2502.00883" aria-labelledby="pdf-2502.00883">pdf</a>, <a href="https://arxiv.org/html/2502.00883v3" title="View HTML" id="html-2502.00883" aria-labelledby="html-2502.00883" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00883" title="Other formats" id="oth-2502.00883" aria-labelledby="oth-2502.00883">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SimPER: A Minimalist Approach to Preference Alignment without Hyperparameters </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+T">Teng Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yige Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhengyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingxiao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+S">Shangsong Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Z">Zhaochun Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Honavar,+V+G">Vasant G Honavar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Existing preference optimization objectives for language model alignment require additional hyperparameters that must be extensively tuned to achieve optimal performance, increasing both the complexity and time required for fine-tuning large language models. 
In this paper, we propose a simple yet effective hyperparameter-free preference optimization algorithm for alignment. We observe that promising performance can be achieved simply by optimizing inverse perplexity, which is calculated as the inverse of the exponentiated average log-likelihood of the chosen and rejected responses in the preference dataset. The resulting simple learning objective, SimPER, is easy to implement and eliminates the need for expensive hyperparameter tuning and a reference model, making it both computationally and memory efficient. Extensive experiments on widely used real-world benchmarks, including MT-Bench, AlpacaEval 2, and 10 key benchmarks of the Open LLM Leaderboard with 5 base models, demonstrate that SimPER consistently and significantly outperforms existing approaches—even without any hyperparameters or a reference model. For example, despite its simplicity, SimPER outperforms state-of-the-art methods by up to 5.7 points on AlpacaEval 2 and achieves the highest average ranking across 10 benchmarks on the Open LLM Leaderboard. The source code for SimPER is publicly available at: <a href="https://github.com/tengxiao1/SimPER" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item288'>[288]</a> <a href ="/abs/2502.03283" title="Abstract" id="2502.03283"> arXiv:2502.03283 </a> (replaced) [<a href="/pdf/2502.03283" title="Download PDF" id="pdf-2502.03283" aria-labelledby="pdf-2502.03283">pdf</a>, <a href="https://arxiv.org/html/2502.03283v2" title="View HTML" id="html-2502.03283" aria-labelledby="html-2502.03283" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.03283" title="Other formats" id="oth-2502.03283" aria-labelledby="oth-2502.03283">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SymAgent: A Neural-Symbolic Self-Learning Agent Framework for Complex Reasoning over Knowledge Graphs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Ben Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jihai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+F">Fangquan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+C">Cheng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+M">Min Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+W">Wotao Yin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advancements have highlighted that Large Language Models (LLMs) are prone to hallucinations when solving complex reasoning problems, leading to erroneous results. To tackle this issue, researchers incorporate Knowledge Graphs (KGs) to improve the reasoning ability of LLMs. 
However, existing methods face two limitations: 1) they typically assume that all answers to the questions are contained in KGs, neglecting the incompleteness issue of KGs, and 2) they treat the KG as a static repository and overlook the implicit logical reasoning structures inherent in KGs. In this paper, we introduce SymAgent, an innovative neural-symbolic agent framework that achieves collaborative augmentation between KGs and LLMs. We conceptualize KGs as dynamic environments and transform complex reasoning tasks into a multi-step interactive process, enabling KGs to participate deeply in the reasoning process. SymAgent consists of two modules: Agent-Planner and Agent-Executor. The Agent-Planner leverages LLM's inductive reasoning capability to extract symbolic rules from KGs, guiding efficient question decomposition. The Agent-Executor autonomously invokes predefined action tools to integrate information from KGs and external documents, addressing the issues of KG incompleteness. Furthermore, we design a self-learning framework comprising online exploration and offline iterative policy updating phases, enabling the agent to automatically synthesize reasoning trajectories and improve performance. Experimental results demonstrate that SymAgent with weak LLM backbones (i.e., 7B series) yields better or comparable performance compared to various strong baselines. Further analysis reveals that our agent can identify missing triples, facilitating automatic KG updates. 
</p> </div> </dd> <dt> <a name='item289'>[289]</a> <a href ="/abs/2502.05957" title="Abstract" id="2502.05957"> arXiv:2502.05957 </a> (replaced) [<a href="/pdf/2502.05957" title="Download PDF" id="pdf-2502.05957" aria-labelledby="pdf-2502.05957">pdf</a>, <a href="https://arxiv.org/html/2502.05957v2" title="View HTML" id="html-2502.05957" aria-labelledby="html-2502.05957" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05957" title="Other formats" id="oth-2502.05957" aria-labelledby="oth-2502.05957">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AutoAgent: A Fully-Automated and Zero-Code Framework for LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jiabin Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+T">Tianyu Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chao Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code: <a href="https://github.com/HKUDS/AutoAgent" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Model (LLM) Agents have demonstrated remarkable capabilities in task automation and intelligent decision-making, driving the widespread adoption of agent development frameworks such as LangChain and AutoGen. However, these frameworks predominantly serve developers with extensive technical expertise - a significant limitation considering that only 0.03 % of the global population possesses the necessary programming skills. 
This stark accessibility gap raises a fundamental question: Can we enable everyone, regardless of technical background, to build their own LLM agents using natural language alone? To address this challenge, we introduce AutoAgent-a Fully-Automated and highly Self-Developing framework that enables users to create and deploy LLM agents through Natural Language Alone. Operating as an autonomous Agent Operating System, AutoAgent comprises four key components: i) Agentic System Utilities, ii) LLM-powered Actionable Engine, iii) Self-Managing File System, and iv) Self-Play Agent Customization module. This lightweight yet powerful system enables efficient and dynamic creation and modification of tools, agents, and workflows without coding requirements or manual intervention. Beyond its code-free agent development capabilities, AutoAgent also serves as a versatile multi-agent system for General AI Assistants. Comprehensive evaluations on the GAIA benchmark demonstrate AutoAgent's effectiveness in generalist multi-agent tasks, surpassing existing state-of-the-art methods. Furthermore, AutoAgent's Retrieval-Augmented Generation (RAG)-related capabilities have shown consistently superior performance compared to many alternative LLM-based solutions. </p> </div> </dd> <dt> <a name='item290'>[290]</a> <a href ="/abs/2502.08820" title="Abstract" id="2502.08820"> arXiv:2502.08820 </a> (replaced) [<a href="/pdf/2502.08820" title="Download PDF" id="pdf-2502.08820" aria-labelledby="pdf-2502.08820">pdf</a>, <a href="https://arxiv.org/html/2502.08820v2" title="View HTML" id="html-2502.08820" aria-labelledby="html-2502.08820" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08820" title="Other formats" id="oth-2502.08820" aria-labelledby="oth-2502.08820">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can a Single Model Master Both Multi-turn Conversations and Tool Use? 
CoALM: A Unified Conversational Agentic Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Acikgoz,+E+C">Emre Can Acikgoz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Greer,+J">Jeremiah Greer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Datta,+A">Akul Datta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Ze Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+W">William Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elachqar,+O">Oussama Elachqar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koukoumidis,+E">Emmanouil Koukoumidis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tur,+G">Gokhan Tur</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) with API-calling capabilities enabled building effective Language Agents (LA), while also revolutionizing the conventional task-oriented dialogue (TOD) paradigm. However, current approaches face a critical dilemma: TOD systems are often trained on a limited set of target APIs, requiring new data to maintain their quality when interfacing with new services, while LAs are not trained to maintain user intent over multi-turn conversations. Because both robust multi-turn management and advanced function calling are crucial for effective conversational agents, we evaluate these skills on three popular benchmarks: MultiWOZ 2.4 (TOD), BFCL V3 (LA), and API-Bank (LA), and our analyses reveal that specialized approaches excel in one domain but underperform in the other. 
To bridge this chasm, we introduce CoALM (Conversational Agentic Language Model), a unified approach that integrates both conversational and agentic capabilities. We created CoALM-IT, a carefully constructed multi-task dataset that interleaves multi-turn ReAct reasoning with complex API usage. Using CoALM-IT, we train three models: CoALM 8B, CoALM 70B, and CoALM 405B, which outperform top domain-specific models, including GPT-4o, across all three benchmarks. This demonstrates the feasibility of a single model approach for both TOD and LA, setting a new standard for conversational agents. </p> </div> </dd> <dt> <a name='item291'>[291]</a> <a href ="/abs/2502.10858" title="Abstract" id="2502.10858"> arXiv:2502.10858 </a> (replaced) [<a href="/pdf/2502.10858" title="Download PDF" id="pdf-2502.10858" aria-labelledby="pdf-2502.10858">pdf</a>, <a href="https://arxiv.org/html/2502.10858v2" title="View HTML" id="html-2502.10858" aria-labelledby="html-2502.10858" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10858" title="Other formats" id="oth-2502.10858" aria-labelledby="oth-2502.10858">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Depth All You Need? 
An Exploration of Iterative Reasoning in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zongqian Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tianyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Baoduo Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiaying Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+M">Mengmeng Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xiaofeng Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+L">Lei Feng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Deep iterative chain-of-thought (CoT) reasoning enables LLMs to tackle complex tasks by progressively activating relevant pre-trained knowledge. However, it faces challenges in ensuring continual improvement and determining a stopping criterion. In this paper, we investigate whether the relevant knowledge that contributes directly to solving the given question can be activated from the initial reasoning path, thus circumventing the need for iterative refinement. Our experiments reveal that increasing the diversity of initial reasoning paths can achieve comparable or superior performance, a concept we term \textit{breadth reasoning}. However, existing breadth reasoning approaches, such as self-consistency, offer limited diversity. To address this limitation, we propose a simple yet effective method that enhances reasoning breadth by integrating contextual exploration with reduced sampling randomness. 
Extensive experiments demonstrate that our approach significantly outperforms deep iterative reasoning. Our code is provided in <a href="https://github.com/zongqianwu/breadth" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item292'>[292]</a> <a href ="/abs/2502.11308" title="Abstract" id="2502.11308"> arXiv:2502.11308 </a> (replaced) [<a href="/pdf/2502.11308" title="Download PDF" id="pdf-2502.11308" aria-labelledby="pdf-2502.11308">pdf</a>, <a href="https://arxiv.org/html/2502.11308v2" title="View HTML" id="html-2502.11308" aria-labelledby="html-2502.11308" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11308" title="Other formats" id="oth-2502.11308" aria-labelledby="oth-2502.11308">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ALGEN: Few-shot Inversion Attacks on Textual Embeddings using Alignment and Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yiyi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qiongkai Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 13 tables, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> With the growing popularity of Large Language Models (LLMs) and vector databases, private textual data is increasingly processed and stored as numerical embeddings. However, recent studies have proven that such embeddings are vulnerable to inversion attacks, where original text is reconstructed to reveal sensitive information. 
Previous research has largely assumed access to millions of sentences to train attack models, e.g., through data leakage or nearly unrestricted API access. With our method, a single data point is sufficient for a partially successful inversion attack. With as little as 1k data samples, performance reaches an optimum across a range of black-box encoders, without training on leaked data. We present a Few-shot Textual Embedding Inversion Attack using ALignment and GENeration (ALGEN), by aligning victim embeddings to the attack space and using a generative model to reconstruct text. We find that ALGEN attacks can be effectively transferred across domains and languages, revealing key information. We further examine a variety of defense mechanisms against ALGEN, and find that none are effective, highlighting the vulnerabilities posed by inversion attacks. By significantly lowering the cost of inversion and proving that embedding spaces can be aligned through one-step optimization, we establish a new textual embedding inversion paradigm with broader applications for embedding alignment in NLP. 
</p> </div> </dd> <dt> <a name='item293'>[293]</a> <a href ="/abs/2502.12118" title="Abstract" id="2502.12118"> arXiv:2502.12118 </a> (replaced) [<a href="/pdf/2502.12118" title="Download PDF" id="pdf-2502.12118" aria-labelledby="pdf-2502.12118">pdf</a>, <a href="https://arxiv.org/html/2502.12118v2" title="View HTML" id="html-2502.12118" aria-labelledby="html-2502.12118" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12118" title="Other formats" id="oth-2502.12118" aria-labelledby="oth-2502.12118">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling Test-Time Compute Without Verification or RL is Suboptimal </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Setlur,+A">Amrith Setlur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rajaraman,+N">Nived Rajaraman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sergey Levine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+A">Aviral Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Despite substantial advances in scaling test-time compute, an ongoing debate in the community is how it should be scaled up to enable continued and efficient improvements with scaling. There are largely two approaches: first, distilling successful search or thinking traces; and second, using verification (e.g., 0/1 outcome rewards, reward models, or verifiers) to guide reinforcement learning (RL) and search algorithms. In this paper, we prove that finetuning LLMs with verifier-based (VB) methods based on RL or search is far superior to verifier-free (VF) approaches based on distilling or cloning search traces, given a fixed amount of compute/data budget. 
Further, we show that as we scale test-time compute (measured as the output token length) and training data, suboptimality of VF methods scales poorly compared to VB when the base pre-trained LLM presents a heterogeneous distribution over correct solution traces (e.g., different lengths, styles, etc.) and admits a non-sharp distribution over rewards on traces sampled from it. We formalize this condition using anti-concentration [Erdős, 1945]. This implies a stronger result that VB methods scale better asymptotically, with the performance gap between VB and VF methods widening as test-time budget grows. We corroborate our theory empirically on both didactic and math reasoning problems with 3/8/32B-sized pre-trained LLMs, where we find verification is crucial for scaling test-time compute. </p> </div> </dd> </dl> <div class='paging'>Total of 293 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 
92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" 
target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script 
src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>