CINXE.COM

Software Engineering

<!DOCTYPE html> <html lang="en"> <head> <title>Software Engineering </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js"></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>&gt;</span> <a href="/list/cs.SE/recent">cs.SE</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Software Engineering</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item22">Cross-lists</a></li> <li><a href="#item24">Replacements</a></li> </ul> <p>See <a id="recent-cs.SE" aria-labelledby="recent-cs.SE" href="/list/cs.SE/recent">recent</a> articles</p> <h3>Showing new listings for Thursday, 20 March 2025</h3> <div class='paging'>Total of 34 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href="/list/cs.SE/new?skip=0&amp;show=1000" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 21 of 21 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.14563" title="Abstract" id="2503.14563"> arXiv:2503.14563 
</a> [<a href="/pdf/2503.14563" title="Download PDF" id="pdf-2503.14563" aria-labelledby="pdf-2503.14563">pdf</a>, <a href="/format/2503.14563" title="Other formats" id="oth-2503.14563" aria-labelledby="oth-2503.14563">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Workflow for Safe-AI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Veljanovska,+S">Suzana Veljanovska</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Doran,+H+D">Hans Dermot Doran</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The development and deployment of safe and dependable AI models is crucial in applications where functional safety is a key concern. Given the rapid advancement in AI research and the relative novelty of the safe-AI domain, there is an increasing need for a workflow that balances stability with adaptability. This work proposes a transparent, complete, yet flexible and lightweight workflow that highlights both reliability and qualifiability. The core idea is that the workflow must be qualifiable, which demands the use of qualified tools. Tool qualification is a resource-intensive process, both in terms of time and cost. We therefore place value on a lightweight workflow featuring a minimal number of tools with limited features. The workflow is built upon an extended ONNX model description allowing for validation of AI algorithms from their generation to runtime deployment. This validation is essential to ensure that models are validated before being reliably deployed across different runtimes, particularly in mixed-criticality systems. 
Keywords-AI workflows, safe-AI, dependable-AI, functional safety, v-model development </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.14630" title="Abstract" id="2503.14630"> arXiv:2503.14630 </a> [<a href="/pdf/2503.14630" title="Download PDF" id="pdf-2503.14630" aria-labelledby="pdf-2503.14630">pdf</a>, <a href="https://arxiv.org/html/2503.14630v1" title="View HTML" id="html-2503.14630" aria-labelledby="html-2503.14630" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14630" title="Other formats" id="oth-2503.14630" aria-labelledby="oth-2503.14630">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Assessing Large Language Models for Automated Feedback Generation in Learning Programming Problem Solving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Silva,+P">Priscylla Silva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Costa,+E">Evandro Costa</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Providing effective feedback is important for student learning in programming problem-solving. In this sense, Large Language Models (LLMs) have emerged as potential tools to automate feedback generation. However, their reliability and ability to identify reasoning errors in student code remain not well understood. This study evaluates the performance of four LLMs (GPT-4o, GPT-4o mini, GPT-4-Turbo, and Gemini-1.5-pro) on a benchmark dataset of 45 student solutions. We assessed the models&#39; capacity to provide accurate and insightful feedback, particularly in identifying reasoning mistakes. 
Our analysis reveals that 63\% of feedback hints were accurate and complete, while 37\% contained mistakes, including incorrect line identification, flawed explanations, or hallucinated issues. These findings highlight the potential and limitations of LLMs in programming education and underscore the need for improvements to enhance reliability and minimize risks in educational applications. </p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2503.14677" title="Abstract" id="2503.14677"> arXiv:2503.14677 </a> [<a href="/pdf/2503.14677" title="Download PDF" id="pdf-2503.14677" aria-labelledby="pdf-2503.14677">pdf</a>, <a href="/format/2503.14677" title="Other formats" id="oth-2503.14677" aria-labelledby="oth-2503.14677">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Analyzing DevOps Practices Through Merge Request Data: A Case Study in Networking Software Company </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kansab,+S">Samah Kansab</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hanania,+M">Matthieu Hanania</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bordeleau,+F">Francis Bordeleau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tizghadam,+A">Ali Tizghadam</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> DevOps integrates collaboration, automation, and continuous improvement, enhancing agility, reducing time to market, and ensuring consistent software releases. A key component of this process is GitLab&#39;s Merge Request (MR) mechanism, which streamlines code submission and review. 
Studies have extensively analyzed MR data and similar mechanisms like GitHub pull requests and Gerrit Code Review, focusing on metrics such as review completion time and time to first comment. However, MR data also reflects broader aspects, including collaboration patterns, productivity, and process optimization. This study examines 26.7k MRs from four teams across 116 projects of a networking software company to analyze DevOps processes. We first assess the impact of external factors like COVID-19 and internal changes such as migration to OpenShift. Findings show increased effort and longer MR review times during the pandemic, with stable productivity and a lasting shift to out-of-hours work, reaching 70% of weekly activities. The transition to OpenShift was successful, with stabilized metrics over time. Additionally, we identify prioritization patterns in branch management, particularly in stable branches for new releases, underscoring the importance of workflow efficiency. In code review, while bots accelerate review initiation, human reviewers remain crucial in reducing review completion time. Other factors, such as commit count and reviewer experience, also influence review efficiency. This research provides actionable insights for practitioners, demonstrating how MR data can enhance productivity, effort analysis, and overall efficiency in DevOps. 
</p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2503.14713" title="Abstract" id="2503.14713"> arXiv:2503.14713 </a> [<a href="/pdf/2503.14713" title="Download PDF" id="pdf-2503.14713" aria-labelledby="pdf-2503.14713">pdf</a>, <a href="https://arxiv.org/html/2503.14713v1" title="View HTML" id="html-2503.14713" aria-labelledby="html-2503.14713" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14713" title="Other formats" id="oth-2503.14713" aria-labelledby="oth-2503.14713">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TestForge: Feedback-Driven, Agentic Test Suite Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jain,+K">Kush Jain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Goues,+C+L">Claire Le Goues</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Automated test generation holds great promise for alleviating the burdens of manual test creation. However, existing search-based techniques compromise on test readability, while LLM-based approaches are prohibitively expensive in practice. We present TestForge, an agentic unit testing framework designed to cost-effectively generate high-quality test suites for real-world code. Our key insight is to reframe LLM-based test generation as an iterative process. TestForge thus begins with tests generated via zero-shot prompting, and then continuously refines those tests based on feedback from test executions and coverage reports. 
We evaluate TestForge on TestGenEval, a real world unit test generation benchmark sourced from 11 large scale open source repositories; we show that TestForge achieves a pass@1 rate of 84.3%, 44.4% line coverage and 33.8% mutation score on average, outperforming prior classical approaches and a one-iteration LLM-based baseline. TestForge produces more natural and understandable tests compared to state-of-the-art search-based techniques, and offers substantial cost savings over LLM-based techniques (at $0.63 per file). Finally, we release a version of TestGenEval integrated with the OpenHands platform, a popular open-source framework featuring a diverse set of software engineering agents and agentic benchmarks, for future extension and development. </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2503.14723" title="Abstract" id="2503.14723"> arXiv:2503.14723 </a> [<a href="/pdf/2503.14723" title="Download PDF" id="pdf-2503.14723" aria-labelledby="pdf-2503.14723">pdf</a>, <a href="https://arxiv.org/html/2503.14723v1" title="View HTML" id="html-2503.14723" aria-labelledby="html-2503.14723" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14723" title="Other formats" id="oth-2503.14723" aria-labelledby="oth-2503.14723">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LeakageDetector: An Open Source Data Leakage Analysis Tool in Machine Learning Pipelines </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=AlOmar,+E+A">Eman Abdullah AlOmar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=DeMario,+C">Catherine DeMario</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shagawat,+R">Roger Shagawat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kreiser,+B">Brandon Kreiser</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Code quality is of paramount importance in all types of software development settings. Our work seeks to enable Machine Learning (ML) engineers to write better code by helping them find and fix instances of Data Leakage in their models. Data Leakage often results from bad practices in writing ML code. As a result, the model effectively &#39;&#39;memorizes&#39;&#39; the data on which it trains, leading to an overly optimistic estimate of the model performance and an inability to make generalized predictions. ML developers must carefully separate their data into training, evaluation, and test sets to avoid introducing Data Leakage into their code. Training data should be used to train the model, evaluation data should be used to repeatedly confirm a model&#39;s accuracy, and test data should be used only once to determine the accuracy of a production-ready model. In this paper, we develop LEAKAGEDETECTOR, a Python plugin for the PyCharm IDE that identifies instances of Data Leakage in ML code and provides suggestions on how to remove the leakage. 
</p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2503.14838" title="Abstract" id="2503.14838"> arXiv:2503.14838 </a> [<a href="/pdf/2503.14838" title="Download PDF" id="pdf-2503.14838" aria-labelledby="pdf-2503.14838">pdf</a>, <a href="https://arxiv.org/html/2503.14838v1" title="View HTML" id="html-2503.14838" aria-labelledby="html-2503.14838" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14838" title="Other formats" id="oth-2503.14838" aria-labelledby="oth-2503.14838">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Think Like Human Developers: Harnessing Community Knowledge for Structured Code Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+C">Chengran Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+Z">Zhensu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kang,+H+J">Hong Jin Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+J">Jieke Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lo,+D">David Lo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have significantly advanced automated code generation, yet they struggle with complex coding tasks requiring multi-step logical reasoning. High-quality reasoning data is crucial for improving LLMs&#39; reasoning capabilities, but such datasets remain scarce. Existing approaches either rely on computationally expensive reinforcement learning (RL) or error-prone reasoning chains synthesized by LLMs, posing challenges in scalability and accuracy. 
<br>To address this challenge, we propose SVRC (Structured and Validated Reasoning Chains for Code Generation), a novel framework that mines, restructures, and enriches reasoning chains from community-driven discussions on software engineering platforms. SVRC refines unstructured and incomplete discussions of coding problems by aligning them with Software Development Life Cycle (SDLC) principles, ensuring that reasoning chains capture real-world problem-solving strategies and support iterative refinement. <br>To evaluate the effectiveness of SVRC, we introduce CodeThinker, an LLM fine-tuned on 12,444 reasoning-augmented samples generated by SVRC. Experiments on LiveCodeBench show that CodeThinker surpasses its base model by 42.86\% on medium-level code problems in terms of pass@1 and outperforms GPT-4o-mini and GPT-4o by 73.14\% and 115.86\%, respectively. Our ablation study further highlights that each component of SVRC contributes to the reasoning capabilities of CodeThinker. </p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2503.14852" title="Abstract" id="2503.14852"> arXiv:2503.14852 </a> [<a href="/pdf/2503.14852" title="Download PDF" id="pdf-2503.14852" aria-labelledby="pdf-2503.14852">pdf</a>, <a href="/format/2503.14852" title="Other formats" id="oth-2503.14852" aria-labelledby="oth-2503.14852">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UntrustVul: An Automated Approach for Identifying Untrustworthy Alerts in Vulnerability Detection Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tung,+L+N">Lam Nguyen Tung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Du,+X">Xiaoning Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Neelofar,+N">Neelofar Neelofar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Aleti,+A">Aldeida Aleti</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Machine learning (ML) has shown promise in detecting vulnerabilities. To review vulnerabilities detected by ML predictions, developers manually assess suspicious lines in their interpretations. However, studies have revealed that these models often learn and predict based on irrelevant features frequently appearing in vulnerable code. This leads to predictions that may correctly flag vulnerable functions but for the wrong reasons, which we call untrustworthy. These predictions can mislead developers, hindering them from locating the vulnerabilities. This increases the efforts of manual assessment and, worse, risks creating flawed patches that fail to address existing vulnerabilities and even introduce new ones. Hence, automated approaches are needed to detect untrustworthy predictions, preventing overlooked vulnerabilities and alleviating the burden of manual assessment. <br>We propose UntrustVul, the first automated approach to identify untrustworthy vulnerability predictions. Given a vulnerability prediction during inference, UntrustVul systematically assesses whether suspicious lines annotated by the prediction are vulnerability-unrelated. It simulates developers&#39; rationales, considering a line unrelated if (1) it is absent from historical vulnerabilities and (2) it cannot reach any vulnerabilities in execution flows. UntrustVul assesses (1) by analysing its syntactic meaning using deep representations to determine whether it is syntax-benign. To assess (2), UntrustVul traces dependencies of the syntax-benign lines on other suspicious lines using static and rule-based analyses. We evaluate UntrustVul on 155K vulnerability predictions by four models across three datasets. 
UntrustVul effectively detects untrustworthy predictions with an F1-score of 82%-94% and helps improve the ability of models to detect vulnerabilities by up to 321% in F1-score and 100% in trustworthiness. </p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2503.14924" title="Abstract" id="2503.14924"> arXiv:2503.14924 </a> [<a href="/pdf/2503.14924" title="Download PDF" id="pdf-2503.14924" aria-labelledby="pdf-2503.14924">pdf</a>, <a href="https://arxiv.org/html/2503.14924v1" title="View HTML" id="html-2503.14924" aria-labelledby="html-2503.14924" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14924" title="Other formats" id="oth-2503.14924" aria-labelledby="oth-2503.14924">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UTFix: Change Aware Unit Test Repairing using LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahman,+S">Shanto Rahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kuhar,+S">Sachit Kuhar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cirisci,+B">Berk Cirisci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Garg,+P">Pranav Garg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shiqi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+X">Xiaofei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deoras,+A">Anoop Deoras</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ray,+B">Baishakhi Ray</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, International Conference on Object-oriented Programming, Systems, Languages, and Applications (OOPSLA) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> 
</div> <p class='mathjax'> Software updates, including bug repair and feature additions, are frequent in modern applications but they often leave test suites outdated, resulting in undetected bugs and increased chances of system failures. A recent study by Meta revealed that 14%-22% of software failures stem from outdated tests that fail to reflect changes in the codebase. This highlights the need to keep tests in sync with code changes to ensure software reliability. <br>In this paper, we present UTFix, a novel approach for repairing unit tests when their corresponding focal methods undergo changes. UTFix addresses two critical issues: assertion failure and reduced code coverage caused by changes in the focal method. Our approach leverages language models to repair unit tests by providing contextual information such as static code slices, dynamic code slices, and failure messages. We evaluate UTFix on our generated synthetic benchmarks (Tool-Bench), and real-world benchmarks. Tool-Bench includes diverse changes from popular open-source Python GitHub projects, where UTFix successfully repaired 89.2% of assertion failures and achieved 100% code coverage for 96 tests out of 369 tests. On the real-world benchmarks, UTFix repairs 60% of assertion failures while achieving 100% code coverage for 19 out of 30 unit tests. To the best of our knowledge, this is the first comprehensive study focused on unit test in evolving Python projects. Our contributions include the development of UTFix, the creation of Tool-Bench and real-world benchmarks, and the demonstration of the effectiveness of LLM-based methods in addressing unit test failures due to software evolution. 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2503.14936" title="Abstract" id="2503.14936"> arXiv:2503.14936 </a> [<a href="/pdf/2503.14936" title="Download PDF" id="pdf-2503.14936" aria-labelledby="pdf-2503.14936">pdf</a>, <a href="https://arxiv.org/html/2503.14936v1" title="View HTML" id="html-2503.14936" aria-labelledby="html-2503.14936" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14936" title="Other formats" id="oth-2503.14936" aria-labelledby="oth-2503.14936">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Code LLM Training with Programmer Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yifan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+C">Chen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Karas,+Z">Zachary Karas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+D+T">Dung Thuy Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Leach,+K">Kevin Leach</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+Y">Yu Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Human-Computer Interaction (cs.HC); Machine Learning (cs.LG) </div> <p class='mathjax'> Human attention provides valuable yet underexploited signals for code LLM training, offering a perspective beyond purely machine-driven attention. Despite the complexity and cost of collecting eye-tracking data, there has also been limited progress in systematically using these signals for code LLM training. To address both issues, we propose a cohesive pipeline spanning augmentation and reward-based fine-tuning. 
Specifically, we introduce (1) an eye-tracking path augmentation method to expand programmer attention datasets, (2) a pattern abstraction step that refines raw fixations into learnable attention motifs, and (3) a reward-guided strategy for integrating these insights directly into a CodeT5 supervised fine-tuning process. Our experiments yield +7.16 in CodeBLEU on the CodeXGlue benchmark for code summarization, underscoring how uniting human and machine attention can boost code intelligence. We hope this work encourages broader exploration of human-centric methods in next-generation AI4SE. </p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2503.15021" title="Abstract" id="2503.15021"> arXiv:2503.15021 </a> [<a href="/pdf/2503.15021" title="Download PDF" id="pdf-2503.15021" aria-labelledby="pdf-2503.15021">pdf</a>, <a href="/format/2503.15021" title="Other formats" id="oth-2503.15021" aria-labelledby="oth-2503.15021">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wild SBOMs: a Large-scale Dataset of Software Bills of Materials from Public Code </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Soeiro,+L">Luís Soeiro</a> (IP Paris, LTCI, ACES, INFRES), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Robert,+T">Thomas Robert</a> (IP Paris, LTCI, ACES, INFRES), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zacchiroli,+S">Stefano Zacchiroli</a> (IP Paris, LTCI, ACES, INFRES)</div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Mining Software Repositories 2025 (MSR 2025), Apr 2025, Ottawa (Canada), Canada </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Developers gain productivity by reusing readily available Free and Open Source Software (FOSS) components. 
Such practices also bring some difficulties, such as managing licensing, components and related security. One approach to handle those difficulties is to use Software Bill of Materials (SBOMs). While there have been studies on the readiness of practitioners to embrace SBOMs and on the SBOM tools ecosystem, a large scale study on SBOM practices based on SBOM files produced in the wild is still lacking. A starting point for such a study is a large dataset of SBOM files found in the wild. We introduce such a dataset, consisting of over 78 thousand unique SBOM files, deduplicated from those found in over 94 million repositories. We include metadata that contains the standard and format used, quality score generated by the tool sbomqs, number of revisions, filenames and provenance information. Finally, we give suggestions and examples of research that could bring new insights on assessing and improving SBOM real practices. </p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2503.15050" title="Abstract" id="2503.15050"> arXiv:2503.15050 </a> [<a href="/pdf/2503.15050" title="Download PDF" id="pdf-2503.15050" aria-labelledby="pdf-2503.15050">pdf</a>, <a href="https://arxiv.org/html/2503.15050v1" title="View HTML" id="html-2503.15050" aria-labelledby="html-2503.15050" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15050" title="Other formats" id="oth-2503.15050" aria-labelledby="oth-2503.15050">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Studying and Understanding the Effectiveness and Failures of Conversational LLM-Based Repair </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+A">Aolin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+H">Haojun Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xin,+Q">Qi Xin</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Reiss,+S+P">Steven P. Reiss</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xuan,+J">Jifeng Xuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Automated program repair (APR) is designed to automate the process of bug-fixing. In recent years, thanks to the rapid development of large language models (LLMs), automated repair has achieved remarkable progress. Advanced APR techniques powered by conversational LLMs, most notably ChatGPT, have exhibited impressive repair abilities and gained increasing popularity due to the capabilities of the underlying LLMs in providing repair feedback and performing iterative patch improvement. Despite the superiority, conversational APR techniques still fail to repair a large number of bugs. For example, a state-of-the-art conversational technique ChatRepair does not correctly repair over half of the single-function bugs in the Defects4J dataset. To understand the effectiveness and failures of conversational LLM-based repair and provide possible directions for improvement, we studied the exemplary ChatRepair with a focus on comparing the effectiveness of its cloze-style and full function repair strategies, assessing its key iterative component for patch improvement, and analyzing the repair failures. Our study has led to a series of findings, which we believe provide key implications for future research. 
</p> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2503.15073" title="Abstract" id="2503.15073"> arXiv:2503.15073 </a> [<a href="/pdf/2503.15073" title="Download PDF" id="pdf-2503.15073" aria-labelledby="pdf-2503.15073">pdf</a>, <a href="https://arxiv.org/html/2503.15073v1" title="View HTML" id="html-2503.15073" aria-labelledby="html-2503.15073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15073" title="Other formats" id="oth-2503.15073" aria-labelledby="oth-2503.15073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Adaptive Testing Approach Based on Field Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Silva,+S">Samira Silva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Caldas,+R">Ricardo Caldas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pelliccione,+P">Patrizio Pelliccione</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bertolino,+A">Antonia Bertolino</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> The growing need to test systems post-release has led to extending testing activities into production environments, where uncertainty and dynamic conditions pose significant challenges. Field testing approaches, especially Self-Adaptive Testing in the Field (SATF), face hurdles like managing unpredictability, minimizing system overhead, and reducing human intervention, among others. Despite its importance, SATF remains underexplored in the literature. This work introduces AdapTA (Adaptive Testing Approach), a novel SATF strategy tailored for testing Body Sensor Networks (BSNs). BSNs are networks of wearable or implantable sensors designed to monitor physiological and environmental data. 
AdapTA employs an ex-vivo approach, using real-world data collected from the field to simulate patient behavior in in-house experiments. </p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2503.15079" title="Abstract" id="2503.15079"> arXiv:2503.15079 </a> [<a href="/pdf/2503.15079" title="Download PDF" id="pdf-2503.15079" aria-labelledby="pdf-2503.15079">pdf</a>, <a href="https://arxiv.org/html/2503.15079v1" title="View HTML" id="html-2503.15079" aria-labelledby="html-2503.15079" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15079" title="Other formats" id="oth-2503.15079" aria-labelledby="oth-2503.15079">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LogiAgent: Automated Logical Testing for REST Systems with LLM-Based Multi-Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+K">Ke Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+C">Chenxi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Chong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+C">Chi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Y">YaChen Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xing,+Z">Zhenchang Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Q">Qingshan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Peng,+X">Xin Peng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Automated testing for REST APIs has become essential for ensuring the correctness and reliability of modern web services. 
While existing approaches primarily focus on detecting server crashes and error codes, they often overlook logical issues that arise due to evolving business logic and domain-specific requirements. To address this limitation, we propose LogiAgent, a novel approach for logical testing of REST systems. Built upon a large language model (LLM)-driven multi-agent framework, LogiAgent integrates a Test Scenario Generator, API Request Executor, and API Response Validator to collaboratively generate, execute, and validate API test scenarios. Unlike traditional testing methods that focus on status codes like 5xx, LogiAgent incorporates logical oracles that assess responses based on business logic, ensuring more comprehensive testing. The system is further enhanced by an Execution Memory component that stores historical API execution data for contextual consistency. We conduct extensive experiments across 12 real-world REST systems, demonstrating that LogiAgent effectively identifies 234 logical issues with an accuracy of 66.19%. Additionally, it basically excels in detecting server crashes and achieves superior test coverage compared to four state-of-the-art REST API testing tools. An ablation study confirms the significant contribution of LogiAgent&#39;s memory components to improving test coverage. 
</p> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2503.15223" title="Abstract" id="2503.15223"> arXiv:2503.15223 </a> [<a href="/pdf/2503.15223" title="Download PDF" id="pdf-2503.15223" aria-labelledby="pdf-2503.15223">pdf</a>, <a href="https://arxiv.org/html/2503.15223v1" title="View HTML" id="html-2503.15223" aria-labelledby="html-2503.15223" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15223" title="Other formats" id="oth-2503.15223" aria-labelledby="oth-2503.15223">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are &#34;Solved Issues&#34; in SWE-bench Really Solved Correctly? An Empirical Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">You Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pradel,+M">Michael Pradel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zhongxin Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Automated issue solving aims to resolve real-world issues in software repositories. The most popular benchmarks for automated issue solving are SWE-bench and its human-filtered subset SWE-bench Verified. These benchmarks leverage testing to validate generated patches. However, because testing is rarely exhaustive, a patch may pass the tests but nevertheless fail to match the developers&#39; expectations. Unfortunately, it is currently unclear to what extent evaluations performed with SWE-bench suffer from such plausible but incorrect patches. This paper presents an in-depth empirical study of the correctness of plausible patches generated by three state-of-the-art issue-solving tools evaluated on SWE-bench Verified. 
We extensively test and inspect generated patches, and compare them against human-written ground truth patches. The core of our methodology is a novel technique PatchDiff for differential patch testing, which automatically exposes behavioral discrepancies between two patches. Our findings reveal critical weaknesses in SWE-bench&#39;s patch validation mechanism, which causes 7.8% of all patches to count as correct while failing the developer-written test suite. Moreover, our novel automated technique reveals that even more (29.6%) plausible patches induce different behavior than the ground truth patches. These behavioral differences are often due to similar, but divergent implementations (46.8%) and due to generated patches that adapt more behavior than the ground truth patches (27.3%). Our manual inspection shows that 28.6% of behaviorally divergent patches are certainly incorrect. Combined, the different weaknesses lead to an inflation of reported resolution rates by 6.2 absolute percent points. Our findings are a call to arms for more robust and reliable evaluation of issue-solving tools. We envision our automated differential patch testing technique to be useful for this purpose. </p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2503.15231" title="Abstract" id="2503.15231"> arXiv:2503.15231 </a> [<a href="/pdf/2503.15231" title="Download PDF" id="pdf-2503.15231" aria-labelledby="pdf-2503.15231">pdf</a>, <a href="https://arxiv.org/html/2503.15231v1" title="View HTML" id="html-2503.15231" aria-labelledby="html-2503.15231" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15231" title="Other formats" id="oth-2503.15231" aria-labelledby="oth-2503.15231">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> When LLMs Meet API Documentation: Can Retrieval Augmentation Aid Code Generation Just as It Helps Developers? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+J">Jingyi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+S">Songqiang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+J">Jialun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+J">Jiasi Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cheung,+S">Shing-Chi Cheung</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Retrieval-augmented generation (RAG) has increasingly shown its power in extending large language models&#39; (LLMs&#39;) capability beyond their pre-trained knowledge. Existing works have shown that RAG can help with software development tasks such as code generation, code update, and test generation. Yet, the effectiveness of adapting LLMs to fast-evolving or less common API libraries using RAG remains unknown. To bridge this gap, we take an initial step to study this unexplored yet practical setting - when developers code with a less common library, they often refer to its API documentation; likewise, when LLMs are allowed to look up API documentation via RAG, to what extent can LLMs be advanced? To mimic such a setting, we select four less common open-source Python libraries with a total of 1017 eligible APIs. We study the factors that affect the effectiveness of using the documentation of less common API libraries as additional knowledge for retrieval and generation. Our intensive study yields interesting findings: (1) RAG helps improve LLMs&#39; performance by 83%-220%. (2) Example code contributes the most to advance LLMs, instead of the descriptive texts and parameter lists in the API documentation. 
(3) LLMs could sometimes tolerate mild noises (typos in description or incorrect parameters) by referencing their pre-trained knowledge or document context. Finally, we suggest that developers pay more attention to the quality and diversity of the code examples in the API documentation. The study sheds light on future low-code software development workflows. </p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.15248" title="Abstract" id="2503.15248"> arXiv:2503.15248 </a> [<a href="/pdf/2503.15248" title="Download PDF" id="pdf-2503.15248" aria-labelledby="pdf-2503.15248">pdf</a>, <a href="https://arxiv.org/html/2503.15248v1" title="View HTML" id="html-2503.15248" aria-labelledby="html-2503.15248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15248" title="Other formats" id="oth-2503.15248" aria-labelledby="oth-2503.15248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automated Non-Functional Requirements Generation in Software Engineering with Large Language Models: A Comparative Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Almonte,+J+T">Jomar Thomas Almonte</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Boominathan,+S+A">Santhosh Anitha Boominathan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nascimento,+N">Nathalia Nascimento</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Neglecting non-functional requirements (NFRs) early in software development can lead to critical challenges. Despite their importance, NFRs are often overlooked or difficult to identify, impacting software quality. 
To support requirements engineers in eliciting NFRs, we developed a framework that leverages Large Language Models (LLMs) to derive quality-driven NFRs from functional requirements (FRs). Using a custom prompting technique within a Deno-based pipeline, the system identifies relevant quality attributes for each functional requirement and generates corresponding NFRs, aiding systematic integration. A crucial aspect is evaluating the quality and suitability of these generated requirements. Can LLMs produce high-quality NFR suggestions? Using 34 functional requirements - selected as a representative subset of 3,964 FRs - the LLMs inferred applicable attributes based on the ISO/IEC 25010:2023 standard, generating 1,593 NFRs. A horizontal evaluation covered three dimensions: NFR validity, applicability of quality attributes, and classification precision. Ten industry software quality evaluators, averaging 13 years of experience, assessed a subset for relevance and quality. The evaluation showed strong alignment between LLM-generated NFRs and expert assessments, with median validity and applicability scores of 5.0 (means: 4.63 and 4.59, respectively) on a 1-5 scale. In the classification task, 80.4% of LLM-assigned attributes matched expert choices, with 8.3% near misses and 11.3% mismatches. A comparative analysis of eight LLMs highlighted variations in performance, with gemini-1.5-pro exhibiting the highest attribute accuracy, while llama-3.3-70B achieved higher validity and applicability scores. These findings provide insights into the feasibility of using LLMs for automated NFR generation and lay the foundation for further exploration of AI-assisted requirements engineering.
</p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.15270" title="Abstract" id="2503.15270"> arXiv:2503.15270 </a> [<a href="/pdf/2503.15270" title="Download PDF" id="pdf-2503.15270" aria-labelledby="pdf-2503.15270">pdf</a>, <a href="https://arxiv.org/html/2503.15270v1" title="View HTML" id="html-2503.15270" aria-labelledby="html-2503.15270" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15270" title="Other formats" id="oth-2503.15270" aria-labelledby="oth-2503.15270">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automating Comment Generation for Smart Contract from Bytecode </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiang,+J">Jianhang Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Z">Zhipeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bao,+L">Lingfeng Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+X">Xing Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+J">Jiayuan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xia,+X">Xin Xia</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Recently, smart contracts have played a vital role in automatic financial and business transactions. To help end users without programming background to better understand the logic of smart contracts, previous studies have proposed models for automatically translating smart contract source code into their corresponding code summaries. However, in practice, only 13% of smart contracts deployed on the Ethereum blockchain are associated with source code. The practical usage of these existing tools is significantly restricted. 
Considering that bytecode is always necessary when deploying smart contracts, in this paper, we first introduce the task of automatically generating smart contract code summaries from bytecode. We propose a novel approach, named SmartBT (Smart contract Bytecode Translator) for automatically translating smart contract bytecode into fine-grained natural language description directly. Two key challenges are posed for this task: structural code logic hidden in bytecode and the huge semantic gap between bytecode and natural language descriptions. To address the first challenge, we transform bytecode into CFG (Control-Flow Graph) to learn code structural and logic details. Regarding the second challenge, we introduce an information retrieval component to fetch similar comments for filling the semantic gap. Then the structural input and semantic input are used to build an attentional sequence-to-sequence neural network model. The copy mechanism is employed to copy rare words directly from similar comments and the coverage mechanism is employed to eliminate repetitive outputs. The automatic evaluation results show that SmartBT outperforms a set of baselines by a large margin, and the human evaluation results show the effectiveness and potential of SmartBT in producing meaningful and accurate comments for smart contract code from bytecode directly. 
</p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2503.15277" title="Abstract" id="2503.15277"> arXiv:2503.15277 </a> [<a href="/pdf/2503.15277" title="Download PDF" id="pdf-2503.15277" aria-labelledby="pdf-2503.15277">pdf</a>, <a href="https://arxiv.org/html/2503.15277v1" title="View HTML" id="html-2503.15277" aria-labelledby="html-2503.15277" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15277" title="Other formats" id="oth-2503.15277" aria-labelledby="oth-2503.15277">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> What Makes a Good TODO Comment? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Haoye Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Z">Zhipeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bi,+T">Tingting Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Grundy,+J">John Grundy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+X">Xinyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+M">Minghui Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+X">Xiaohu Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Software development is a collaborative process that involves various interactions among individuals and teams. TODO comments in source code play a critical role in managing and coordinating diverse tasks during this process. However, this study finds that a large proportion of open-source project TODO comments are left unresolved or take a long time to be resolved. 
About 46.7% of TODO comments in open-source repositories are of low-quality (e.g., TODOs that are ambiguous, lack information, or are useless to developers). This highlights the need for better TODO practices. In this study, we investigate four aspects regarding the quality of TODO comments in open-source projects: (1) the prevalence of low-quality TODO comments; (2) the key characteristics of high-quality TODO comments; (3) how are TODO comments of different quality managed in practice; and (4) the feasibility of automatically assessing TODO comment quality. Examining 2,863 TODO comments from Top100 GitHub Java repositories, we propose criteria to identify high-quality TODO comments and provide insights into their optimal composition. We discuss the lifecycle of TODO comments with varying quality. We construct deep learning-based methods that show promising performance in identifying the quality of TODO comments, potentially enhancing development efficiency and code quality. </p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.15282" title="Abstract" id="2503.15282"> arXiv:2503.15282 </a> [<a href="/pdf/2503.15282" title="Download PDF" id="pdf-2503.15282" aria-labelledby="pdf-2503.15282">pdf</a>, <a href="https://arxiv.org/html/2503.15282v1" title="View HTML" id="html-2503.15282" aria-labelledby="html-2503.15282" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15282" title="Other formats" id="oth-2503.15282" aria-labelledby="oth-2503.15282">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SENAI: Towards Software Engineering Native Generative Artificial Intelligence </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Saad,+M">Mootez Saad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=L%C3%B3pez,+J+A+H">José Antonio Hernández López</a>, <a
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+B">Boqi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ernst,+N">Neil Ernst</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Varr%C3%B3,+D">Dániel Varró</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sharma,+T">Tushar Sharma</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 1 figure </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Large Language Models have significantly advanced the field of code generation, demonstrating the ability to produce functionally correct code snippets. However, advancements in generative AI for code overlook foundational Software Engineering (SE) principles such as modularity, and single responsibility, and concepts such as cohesion and coupling which are critical for creating maintainable, scalable, and robust software systems. These concepts are missing in pipelines that start with pre-training and end with the evaluation using benchmarks. <br>This vision paper argues for the integration of SE knowledge into LLMs to enhance their capability to understand, analyze, and generate code and other SE artifacts following established SE knowledge. The aim is to propose a new direction where LLMs can move beyond mere functional accuracy to perform generative tasks that require adherence to SE principles and best practices. In addition, given the interactive nature of these conversational models, we propose using Bloom&#39;s Taxonomy as a framework to assess the extent to which they internalize SE knowledge. The proposed evaluation framework offers a sound and more comprehensive evaluation technique compared to existing approaches such as linear probing.
Software engineering native generative models will not only overcome the shortcomings present in current models but also pave the way for the next generation of generative models capable of handling real-world software engineering. </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.15301" title="Abstract" id="2503.15301"> arXiv:2503.15301 </a> [<a href="/pdf/2503.15301" title="Download PDF" id="pdf-2503.15301" aria-labelledby="pdf-2503.15301">pdf</a>, <a href="https://arxiv.org/html/2503.15301v1" title="View HTML" id="html-2503.15301" aria-labelledby="html-2503.15301" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15301" title="Other formats" id="oth-2503.15301" aria-labelledby="oth-2503.15301">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> aiXcoder-7B-v2: Training LLMs to Fully Utilize the Long Context in Repository-level Code Completion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+H">Hao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+H">Huanyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+X">Xianjie Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zong,+H">He Zong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+Y">Yihong Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+K">Kechi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+S">Siyuan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jin,+Z">Zhi Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+G">Ge Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering 
(cs.SE)</span> </div> <p class='mathjax'> Repository-level code completion aims to complete code based on the long contexts of the repository. Existing studies extract long contexts from the repository as inputs and leverage Large Language Models (LLMs) to generate code. However, we reveal a severe limitation of LLMs, i.e., LLMs may ignore the information within long contexts in code completion. In other words, even the contexts contain useful information (e.g., relevant APIs or similar code), LLMs may fail to utilize this information. We think this limitation is caused by an inherent bias in LLMs, i.e., relying on nearby contexts and ignoring long-range contexts. To address this, we propose a novel fine-tuning approach named CoLT. The core idea of CoLT is to provide explicit supervision signals, which emphasize that long-range contexts may hold relevant information. Specifically, CoLT proposes a reinforcement learning-based training, which explicitly encourages models to utilize the information within long contexts and punishes models for ignoring long contexts. To support CoLT, we release CoLT-132K, a large-scale dataset with 132k samples across four languages, each containing long-context inputs. We apply CoLT to a popular LLM - aiXcoder-7B and release aiXcoder-7B-v2. We conduct extensive experiments on CoLT-132K and a public benchmark - CrossCodeEval. Our experiments yield the results: 1. Effectiveness. CoLT substantially improves aiXcoder-7B. aiXcoder-7B-v2 outperforms aiXcoder-7B by up to 44% in exact match. aiXcoder-7B-v2 becomes the state-of-the-art 7B model in code completion and even surpasses larger models. 2. Generalizability. The capability learned by CoLT can generalize to new languages. Besides, CoLT is model-agnostic and effectively improves multiple LLMs. 3. Enhanced Context Utilization Capability. CoLT significantly improves the capability of LLMs in utilizing the relevant information within long contexts. 
</p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2503.15341" title="Abstract" id="2503.15341"> arXiv:2503.15341 </a> [<a href="/pdf/2503.15341" title="Download PDF" id="pdf-2503.15341" aria-labelledby="pdf-2503.15341">pdf</a>, <a href="https://arxiv.org/html/2503.15341v1" title="View HTML" id="html-2503.15341" aria-labelledby="html-2503.15341" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15341" title="Other formats" id="oth-2503.15341" aria-labelledby="oth-2503.15341">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Guided Chain-of-Thought for Code Generation with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+Y">Yuqi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+G">Ge Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xue Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mei,+H">Hong Mei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jin,+Z">Zhi Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+Y">Yihong Dong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Chain-of-Thought (CoT) reasoning has been demonstrated as an effective technique for improving the problem-solving capabilities of large language models (LLMs) in the context of code generation. However, existing CoT methods often exhibit a tendency toward &#34;overthinking&#34;, where the LLM consistently applies reasoning strategies without adequately considering the task&#39;s underlying complexity. 
This results in the LLMs allocating excessive computational resources, in terms of tokens, to relatively simple tasks or problems where the correct answer is already evident. Additionally, this overthinking may lead LLMs down incorrect reasoning paths, resulting in incorrect code generation. In this paper, we introduce UnCertainty-Aware Chain-of-Thought (UnCert-CoT), an LLM-based approach designed to enhance code generation by incorporating an uncertainty-aware CoT reasoning mechanism, which focuses computational resources on targeting points where LLMs are more prone to error. We propose two confidence-based uncertainty measures: Entropy-based and Probability Differential-based methods. When uncertainty is high, UnCert-CoT activates CoT-decoding to generate multiple reasoning paths and selects the final code that exhibits the highest likelihood of correctness. In contrast, the LLM directly generates the code when uncertainty is low. This uncertainty judgment mechanism allows LLMs to prioritize complex tasks and avoid unnecessary steps in simpler cases, thereby improving overall efficiency and accuracy in code generation. Our experimental results demonstrate that UnCert-CoT significantly enhances code generation accuracy on the challenging benchmark MHPP (Mostly Hard Python Problems), achieving improvements of up to 6.1% in PassRate accuracy, particularly in situations where traditional LLMs are prone to errors. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 2 of 2 entries)</h3> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.15199" title="Abstract" id="2503.15199"> arXiv:2503.15199 </a> (cross-list from cs.DC) [<a href="/pdf/2503.15199" title="Download PDF" id="pdf-2503.15199" aria-labelledby="pdf-2503.15199">pdf</a>, <a href="https://arxiv.org/html/2503.15199v1" title="View HTML" id="html-2503.15199" aria-labelledby="html-2503.15199" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15199" title="Other formats" id="oth-2503.15199" aria-labelledby="oth-2503.15199">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Radon: a Programming Model and Platform for Computing Continuum Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=De+Martini,+L">Luca De Martini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=d&#39;Abate,+D">Dario d&#39;Abate</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Margara,+A">Alessandro Margara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cugola,+G">Gianpaolo Cugola</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to EDCCS 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Distributed, Parallel, and Cluster Computing (cs.DC)</span>; Software Engineering (cs.SE) </div> <p class='mathjax'> Emerging compute continuum environments pose new challenges that traditional cloud-centric architectures struggle to address. Latency, bandwidth constraints, and the heterogeneity of edge environments hinder the efficiency of centralized cloud solutions. While major cloud providers extend their platforms to the edge, these approaches often overlook its unique characteristics, limiting its potential. 
<br>To tackle these challenges, we introduce Radon, a flexible programming model and platform designed for the edge-to-cloud continuum. Radon applications are structured as atoms, isolated stateful entities that communicate through messaging and can be composed into complex systems. The Radon runtime, based on WebAssembly (WASM), enables language- and deployment-independent execution, ensuring portability and adaptability across heterogeneous environments. This decoupling allows developers to focus on application logic while the runtime optimizes for diverse infrastructure conditions. <br>We present a prototype implementation of Radon and evaluate its effectiveness through a distributed key-value store case study. We analyze the implementation in terms of code complexity and performance. Our results demonstrate that Radon facilitates the development and operation of scalable applications across the edge-to-cloud continuum advancing the current state-of-the-art. </p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.15439" title="Abstract" id="2503.15439"> arXiv:2503.15439 </a> (cross-list from quant-ph) [<a href="/pdf/2503.15439" title="Download PDF" id="pdf-2503.15439" aria-labelledby="pdf-2503.15439">pdf</a>, <a href="/format/2503.15439" title="Other formats" id="oth-2503.15439" aria-labelledby="oth-2503.15439">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LuGo: an Enhanced Quantum Phase Estimation Implementation </div> <div class='list-authors'><a href="https://arxiv.org/search/quant-ph?searchtype=author&amp;query=Lu,+C">Chao Lu</a>, <a href="https://arxiv.org/search/quant-ph?searchtype=author&amp;query=Meena,+M+G">Muralikrishnan Gopalakrishanan Meena</a>, <a href="https://arxiv.org/search/quant-ph?searchtype=author&amp;query=Gottiparthi,+K+C">Kalyana Chakravarthi Gottiparthi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Quantum Physics (quant-ph)</span>; Emerging Technologies (cs.ET); Software Engineering (cs.SE) </div> <p class='mathjax'> Quantum Phase Estimation (QPE) is a cardinal algorithm in quantum computing that plays a crucial role in various applications, including cryptography, molecular simulation, and solving systems of linear equations. However, the standard implementation of QPE faces challenges related to time complexity and circuit depth, which limit its practicality for large-scale computations. We introduce LuGo, a novel framework designed to enhance the performance of QPE by reducing redundant circuit duplication, as well as parallelization techniques to achieve faster circuit generation and gate reduction. We validate the effectiveness of our framework by generating quantum linear solver circuits, which require both QPE and inverse QPE, to solve linear systems of equations. LuGo achieves significant improvements in both computational efficiency and hardware requirements while maintaining high accuracy. Compared to a standard QPE implementation, LuGo reduces time consumption to solve a $2^6\times 2^6$ system matrix by a factor of $50.68$ and over $31\times$ reduction of quantum gates and circuit depth, with no fidelity loss on an ideal quantum simulator. With these advantages, LuGo paves the way for more efficient implementations of QPE, enabling broader applications across several quantum computing domains. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 11 of 11 entries)</h3> <dt> <a name='item24'>[24]</a> <a href ="/abs/2403.17382" title="Abstract" id="2403.17382"> arXiv:2403.17382 </a> (replaced) [<a href="/pdf/2403.17382" title="Download PDF" id="pdf-2403.17382" aria-labelledby="pdf-2403.17382">pdf</a>, <a href="https://arxiv.org/html/2403.17382v2" title="View HTML" id="html-2403.17382" aria-labelledby="html-2403.17382" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.17382" title="Other formats" id="oth-2403.17382" aria-labelledby="oth-2403.17382">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> No Vulnerability Data, No Problem: Towards Predicting Mean Time To Remediate In Open Source Software Dependencies </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahman,+I">Imranur Rahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Paramitha,+R">Ranindya Paramitha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zahan,+N">Nusrat Zahan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Magill,+S">Stephen Magill</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Enck,+W">William Enck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Williams,+L">Laurie Williams</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Cryptography and Security (cs.CR) </div> <p class='mathjax'> Timely remediation of vulnerabilities in software dependencies is critical for the security of the software supply chain. As such, researchers have proposed tools and metrics to help practitioners assess the security practices of each of their dependencies. 
Conceptually, a dependency-focused Mean-Time-To-Remediate (MTTR) metric can provide a historical perspective on how long it takes a given package to update vulnerable versions of its dependencies. However, existing MTTR metrics focus on a package fixing bugs in its own code, not its dependencies. Simultaneously, existing dependency update metrics do not aggregate values for the entire package and are not sensitive to aspects important for vulnerabilities (e.g., floating version constraints). The goal of this study is to aid industry practitioners, including developers, in assessing the risk of dependencies through a novel metric approximating mean-time-to-remediate vulnerabilities in their dependencies that is evaluated by an empirical study. We propose a novel algorithm for computing MTTR called $MTTR_{dep}$ and a companion metric called $Mean-Time-To-Update_{dep}$ ($MTTU_{dep}$), which considers all version updates, including vulnerability fix updates. We conduct a large-scale study using 163,207 packages in npm, PyPI, and Cargo, of which only 22,513 packages produce $MTTR_{dep}$ because of the lack of vulnerability data. We further study how package characteristics (e.g., contributors and version counts) influence $MTTU_{dep}$ and $MTTR_{dep}$ and explore how long packages retain outdated vulnerable dependencies in npm, PyPI, and Cargo. Our results indicate that industry practitioners can reliably use $MTTU_{dep}$ as a proxy for $MTTR_{dep}$ when available vulnerability data is insufficient. 
</p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2409.13642" title="Abstract" id="2409.13642"> arXiv:2409.13642 </a> (replaced) [<a href="/pdf/2409.13642" title="Download PDF" id="pdf-2409.13642" aria-labelledby="pdf-2409.13642">pdf</a>, <a href="https://arxiv.org/html/2409.13642v2" title="View HTML" id="html-2409.13642" aria-labelledby="html-2409.13642" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13642" title="Other formats" id="oth-2409.13642" aria-labelledby="oth-2409.13642">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Multi-Agent Approach to Fault Localization via Graph-Based Retrieval and Reflexion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rafi,+M+N">Md Nakhla Rafi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+D+J">Dong Jae Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+T">Tse-Hsun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shaowei Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Identifying and resolving software faults remains a challenging and resource-intensive process. Traditional fault localization techniques, such as Spectrum-Based Fault Localization (SBFL), leverage statistical analysis of test coverage but often suffer from limited accuracy. While learning-based approaches improve fault localization, they demand extensive training datasets and high computational resources. Recent advances in Large Language Models (LLMs) offer new opportunities by enhancing code understanding and reasoning. 
However, existing LLM-based fault localization techniques face significant challenges, including token limitations, performance degradation with long inputs, and scalability issues in complex software systems. To overcome these obstacles, we propose LLM4FL, a multi-agent fault localization framework that utilizes three specialized LLM agents. First, the Context Extraction Agent applies an order-sensitive segmentation strategy to partition large coverage data within the LLM&#39;s token limit, analyze failure context, and prioritize failure-related methods. The Debugger Agent then processes the extracted data, which employs graph-based retrieval-augmented code navigation to reason about failure causes and rank suspicious methods. Finally, the Reviewer Agent re-evaluates the identified faulty methods using verbal reinforcement learning, engaging in self-criticism and iterative refinement. Evaluated on the Defects4J (V2.0.0) benchmark, which includes 675 faults from 14 Java projects, LLM4FL achieves an 18.55\% improvement in Top-1 accuracy over AutoFL and 4.82\% over SoapFL. It outperforms supervised techniques such as DeepFL and Grace, all without requiring task-specific training. Furthermore, its coverage segmentation and prompt chaining strategies enhance performance, increasing Top-1 accuracy by up to 22\%. 
</p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2410.00752" title="Abstract" id="2410.00752"> arXiv:2410.00752 </a> (replaced) [<a href="/pdf/2410.00752" title="Download PDF" id="pdf-2410.00752" aria-labelledby="pdf-2410.00752">pdf</a>, <a href="https://arxiv.org/html/2410.00752v2" title="View HTML" id="html-2410.00752" aria-labelledby="html-2410.00752" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.00752" title="Other formats" id="oth-2410.00752" aria-labelledby="oth-2410.00752">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TestGenEval: A Real World Unit Test Generation and Test Completion Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jain,+K">Kush Jain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Synnaeve,+G">Gabriel Synnaeve</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rozi%C3%A8re,+B">Baptiste Rozière</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Code generation models can help improve many common software tasks ranging from code completion to defect prediction. Most of the existing benchmarks for code generation LLMs focus on code authoring or code completion. Surprisingly, there has been far less effort dedicated to benchmarking software testing, despite the strong correlation between well-tested software and effective bug detection. To address this gap, we create and release TestGenEval, a large-scale benchmark to measure test generation performance. Based on SWEBench, TestGenEval comprises 68,647 tests from 1,210 code and test file pairs across 11 well-maintained Python repositories. It covers initial tests authoring, test suite completion, and code coverage improvements. 
Test authoring simulates the process of a developer writing a test suite from scratch, while test completion mimics the scenario where a developer aims to improve the coverage of an existing test suite. We evaluate several popular models, with sizes ranging from 7B to 405B parameters. Our detailed analysis highlights TestGenEval&#39;s contribution to a comprehensive evaluation of test generation performance. In particular, models struggle to generate high-coverage test suites, with the best model, GPT-4o, achieving an average coverage of only 35.2%. This is primarily due to models struggling to reason about execution, and their frequent assertion errors when addressing complex code paths. </p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2410.14684" title="Abstract" id="2410.14684"> arXiv:2410.14684 </a> (replaced) [<a href="/pdf/2410.14684" title="Download PDF" id="pdf-2410.14684" aria-labelledby="pdf-2410.14684">pdf</a>, <a href="https://arxiv.org/html/2410.14684v2" title="View HTML" id="html-2410.14684" aria-labelledby="html-2410.14684" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14684" title="Other formats" id="oth-2410.14684" aria-labelledby="oth-2410.14684">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RepoGraph: Enhancing AI Software Engineering with Repository-level Code Graph </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ouyang,+S">Siru Ouyang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+W">Wenhao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+K">Kaixin Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiao,+Z">Zilin Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zhihan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+M">Mengzhao Jia</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Han,+J">Jiawei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+H">Hongming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+D">Dong Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) excel in code generation yet struggle with modern AI software engineering tasks. Unlike traditional function-level or file-level coding tasks, AI software engineering requires not only basic coding proficiency but also advanced skills in managing and interacting with code repositories. However, existing methods often overlook the need for repository-level code understanding, which is crucial for accurately grasping the broader context and developing effective solutions. On this basis, we present RepoGraph, a plug-in module that manages a repository-level structure for modern AI software engineering solutions. RepoGraph offers the desired guidance and serves as a repository-wide navigation for AI software engineers. We evaluate RepoGraph on the SWE-bench by plugging it into four different methods of two lines of approaches, where RepoGraph substantially boosts the performance of all systems, leading to a new state-of-the-art among open-source frameworks. Our analyses also demonstrate the extensibility and flexibility of RepoGraph by testing on another repo-level coding benchmark, CrossCodeEval. Our code is available at <a href="https://github.com/ozyyshr/RepoGraph" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2411.07718" title="Abstract" id="2411.07718"> arXiv:2411.07718 </a> (replaced) [<a href="/pdf/2411.07718" title="Download PDF" id="pdf-2411.07718" aria-labelledby="pdf-2411.07718">pdf</a>, <a href="https://arxiv.org/html/2411.07718v4" title="View HTML" id="html-2411.07718" aria-labelledby="html-2411.07718" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07718" title="Other formats" id="oth-2411.07718" aria-labelledby="oth-2411.07718">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SoliDiffy: AST Differencing for Solidity Smart Contracts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Eshghie,+M">Mojtaba Eshghie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=%C3%85ryd,+V">Viktor Åryd</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Artho,+C">Cyrille Artho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Monperrus,+M">Martin Monperrus</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Programming Languages (cs.PL) </div> <p class='mathjax'> Structured code differencing is the act of comparing the hierarchical structure of code via its abstract syntax tree (AST) to capture modifications. AST-based source code differencing enables tasks such as vulnerability detection and automated repair where traditional line-based differencing falls short. We introduce SoliDiffy, the first AST differencing tool for Solidity smart contracts with the ability to generate an edit script that soundly shows the structural differences between two smart-contracts using insert, delete, update, move operations. 
In our evaluation on 353,262 contract pairs, SoliDiffy achieved a 96.1% diffing success rate, surpassing the state-of-the-art, and produced significantly shorter edit scripts. Additional experiments on 925 real-world commits further confirmed its superiority compared to Git line-based differencing. SoliDiffy provides accurate representations of smart contract evolution even in the existence of multiple complex modifications to the source code. SoliDiffy is made publicly available at <a href="https://github.com/mojtaba-eshghie/SoliDiffy" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2411.10877" title="Abstract" id="2411.10877"> arXiv:2411.10877 </a> (replaced) [<a href="/pdf/2411.10877" title="Download PDF" id="pdf-2411.10877" aria-labelledby="pdf-2411.10877">pdf</a>, <a href="https://arxiv.org/html/2411.10877v3" title="View HTML" id="html-2411.10877" aria-labelledby="html-2411.10877" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.10877" title="Other formats" id="oth-2411.10877" aria-labelledby="oth-2411.10877">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developer Perspectives on Licensing and Copyright Issues Arising from Generative AI for Software Development </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Stalnaker,+T">Trevor Stalnaker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wintersgill,+N">Nathan Wintersgill</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chaparro,+O">Oscar Chaparro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Heymann,+L+A">Laura A. 
Heymann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Di+Penta,+M">Massimiliano Di Penta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=German,+D+M">Daniel M German</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Poshyvanyk,+D">Denys Poshyvanyk</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Despite the utility that Generative AI (GenAI) tools provide for tasks such as writing code, the use of these tools raises important legal questions and potential risks, particularly those associated with copyright law. As lawmakers and regulators engage with those questions, the views of users can provide relevant perspectives. In this paper, we provide: (1) a survey of 574 developers on the licensing and copyright aspects of GenAI for coding, as well as follow-up interviews; (2) a snapshot of developers&#39; views at a time when GenAI and perceptions of it are rapidly evolving; and (3) an analysis of developers&#39; views, yielding insights and recommendations that can inform future regulatory decisions in this evolving field. Our results show the benefits developers derive from GenAI, how they view the use of AI-generated code as similar to using other existing code, the varied opinions they have on who should own or be compensated for such code, that they are concerned about data leakage via GenAI, and much more, providing organizations and policymakers with valuable insights into how the technology is being used and what concerns stakeholders would like to see addressed. 
</p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2412.18750" title="Abstract" id="2412.18750"> arXiv:2412.18750 </a> (replaced) [<a href="/pdf/2412.18750" title="Download PDF" id="pdf-2412.18750" aria-labelledby="pdf-2412.18750">pdf</a>, <a href="https://arxiv.org/html/2412.18750v2" title="View HTML" id="html-2412.18750" aria-labelledby="html-2412.18750" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18750" title="Other formats" id="oth-2412.18750" aria-labelledby="oth-2412.18750">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Impact of Input Order Bias on Large Language Models for Software Fault Localization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rafi,+M+N">Md Nakhla Rafi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+D+J">Dong Jae Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+T">Tse-Hsun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shaowei Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have shown significant potential in software engineering tasks such as Fault Localization (FL) and Automatic Program Repair (APR). This study investigates how input order and context size influence LLM performance in FL, a crucial step for many downstream software engineering tasks. We evaluate different method orderings using Kendall Tau distances, including &#34;perfect&#34; (where ground truths appear first) and &#34;worst&#34; (where ground truths appear last), across two benchmarks containing Java and Python projects. 
Our results reveal a strong order bias: in Java projects, Top-1 FL accuracy drops from 57% to 20% when reversing the order, while in Python projects, it decreases from 38% to approximately 3%. However, segmenting inputs into smaller contexts mitigates this bias, reducing the performance gap in FL from 22% and 6% to just 1% across both benchmarks. We replaced method names with semantically meaningful alternatives to determine whether this bias is due to data leakage. The observed trends remained consistent, suggesting that the bias is not caused by memorization from training data but rather by the inherent effect of input order. Additionally, we explored ordering methods based on traditional FL techniques and metrics, finding that DepGraph&#39;s ranking achieves 48% Top-1 accuracy, outperforming simpler approaches such as CallGraph(DFS). These findings highlight the importance of structuring inputs, managing context effectively, and selecting appropriate ordering strategies to enhance LLM performance in FL and other software engineering applications. 
</p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2503.02191" title="Abstract" id="2503.02191"> arXiv:2503.02191 </a> (replaced) [<a href="/pdf/2503.02191" title="Download PDF" id="pdf-2503.02191" aria-labelledby="pdf-2503.02191">pdf</a>, <a href="/format/2503.02191" title="Other formats" id="oth-2503.02191" aria-labelledby="oth-2503.02191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding and Predicting Derailment in Toxic Conversations on GitHub </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Imran,+M+M">Mia Mohammad Imran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zita,+R">Robert Zita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Copeland,+R">Rebekah Copeland</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chatterjee,+P">Preetha Chatterjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahman,+R+R">Rahat Rizvi Rahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Damevski,+K">Kostadin Damevski</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span> </div> <p class='mathjax'> Software projects thrive on the involvement and contributions of individuals from different backgrounds. However, toxic language and negative interactions can hinder the participation and retention of contributors and alienate newcomers. Proactive moderation strategies aim to prevent toxicity from occurring by addressing conversations that have derailed from their intended purpose. This study aims to understand and predict conversational derailment leading to toxicity on GitHub. 
<br>To facilitate this research, we curate a novel dataset comprising 202 toxic conversations from GitHub with annotated derailment points, along with 696 non-toxic conversations as a baseline. Based on this dataset, we identify unique characteristics of toxic conversations and derailment points, including linguistic markers such as second-person pronouns, negation terms, and tones of Bitter Frustration and Impatience, as well as patterns in conversational dynamics between project contributors and external participants. <br>Leveraging these empirical observations, we propose a proactive moderation approach to automatically detect and address potentially harmful conversations before escalation. By utilizing modern LLMs, we develop a conversation trajectory summary technique that captures the evolution of discussions and identifies early signs of derailment. Our experiments demonstrate that LLM prompts tailored to provide summaries of GitHub conversations achieve 70% F1-Score in predicting conversational derailment, strongly improving over a set of baseline approaches. 
</p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2503.09020" title="Abstract" id="2503.09020"> arXiv:2503.09020 </a> (replaced) [<a href="/pdf/2503.09020" title="Download PDF" id="pdf-2503.09020" aria-labelledby="pdf-2503.09020">pdf</a>, <a href="https://arxiv.org/html/2503.09020v2" title="View HTML" id="html-2503.09020" aria-labelledby="html-2503.09020" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.09020" title="Other formats" id="oth-2503.09020" aria-labelledby="oth-2503.09020">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing High-Quality Code Generation in Large Language Models with Comparative Prefix-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+Y">Yuan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yujian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+L">Liang Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Treude,+C">Christoph Treude</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Su,+X">Xiaohong Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+S">Shan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+T">Tiantian Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have been widely adopted in commercial code completion engines, significantly enhancing coding efficiency and productivity. However, LLMs may generate code with quality issues that violate coding standards and best practices, such as poor code style and maintainability, even when the code is functionally correct. 
This necessitates additional effort from developers to improve the code, potentially negating the efficiency gains provided by LLMs. To address this problem, we propose a novel comparative prefix-tuning method for controllable high-quality code generation. Our method introduces a single, property-specific prefix that is prepended to the activations of the LLM, serving as a lightweight alternative to fine-tuning. Unlike existing methods that require training multiple prefixes, our approach trains only one prefix and leverages pairs of high-quality and low-quality code samples, introducing a sequence-level ranking loss to guide the model&#39;s training. This comparative approach enables the model to better understand the differences between high-quality and low-quality code, focusing on aspects that impact code quality. Additionally, we design a data construction pipeline to collect and annotate pairs of high-quality and low-quality code, facilitating effective training. Extensive experiments on the Code Llama 7B model demonstrate that our method improves code quality by over 100% in certain task categories, while maintaining functional correctness. We also conduct ablation studies and generalization experiments, confirming the effectiveness of our method&#39;s components and its strong generalization capability. 
</p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2503.12374" title="Abstract" id="2503.12374"> arXiv:2503.12374 </a> (replaced) [<a href="/pdf/2503.12374" title="Download PDF" id="pdf-2503.12374" aria-labelledby="pdf-2503.12374">pdf</a>, <a href="/format/2503.12374" title="Other formats" id="oth-2503.12374" aria-labelledby="oth-2503.12374">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling Pitfalls: Understanding Why AI-driven Code Agents Fail at GitHub Issue Resolution </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zhi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+W">Wei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+L">Lingxiao Jiang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> AI-driven software development has rapidly advanced with the emergence of software development agents that leverage large language models (LLMs) to tackle complex, repository-level software engineering tasks. These agents go beyond just generation of final code; they engage in multi-step reasoning, utilize various tools for code modification and debugging, and interact with execution environments to diagnose and iteratively resolve issues. However, most existing evaluations focus primarily on static analyses of final code outputs, yielding limited insights into the agents&#39; dynamic problem-solving processes. To fill this gap, we conduct an in-depth empirical study on 3,977 solving-phase trajectories and 3,931 testing-phase logs from 8 top-ranked agents evaluated on 500 GitHub issues in the SWE-Bench benchmark. 
Our exploratory analysis shows that Python execution errors during the issue resolution phase correlate with lower resolution rates and increased reasoning overheads. We have identified the most prevalent errors -- such as ModuleNotFoundError and TypeError -- and highlighted particularly challenging errors like OSError and database-related issues (e.g., IntegrityError) that demand significantly more debugging effort. Furthermore, we have discovered 3 bugs in the SWE-Bench platform that affect benchmark fairness and accuracy; these issues have been reported to and confirmed by the maintainers. To promote transparency and foster future research, we publicly share our datasets and analysis scripts. </p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2503.11498" title="Abstract" id="2503.11498"> arXiv:2503.11498 </a> (replaced) [<a href="/pdf/2503.11498" title="Download PDF" id="pdf-2503.11498" aria-labelledby="pdf-2503.11498">pdf</a>, <a href="https://arxiv.org/html/2503.11498v2" title="View HTML" id="html-2503.11498" aria-labelledby="html-2503.11498" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11498" title="Other formats" id="oth-2503.11498" aria-labelledby="oth-2503.11498">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cloud2BIM: An open-source automatic pipeline for efficient conversion of large-scale point clouds into IFC format </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zbirovsk%C3%BD,+S">Slávek Zbirovský</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ne%C5%BEerka,+V">Václav Nežerka</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 53 pages, 23 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Software Engineering (cs.SE) 
</div> <p class='mathjax'> Building Information Modeling (BIM) is an essential component in the sustainable reconstruction and revitalization of ageing structures. However, model creation usually relies on laborious manual transformation of the unstructured point cloud data provided by laser scans or photogrammetry. This paper presents Cloud2BIM, an open-source software tool designed to automate the conversion of point clouds into BIM models compliant with the Industry Foundation Classes (IFC) standard. Cloud2BIM integrates advanced algorithms for wall and slab segmentation, opening detection, and room zoning based on real wall surfaces, resulting in a comprehensive and fully automated workflow. Unlike existing tools, it avoids computationally- and calibration-intensive techniques such as RANSAC, supports non-orthogonal geometries, and provides unprecedented processing speed-achieving results up to seven times faster than fastest competing solutions. Systematic validation using benchmark datasets confirms that Cloud2BIM is an easy-to-use, efficient, and scalable solution for generating accurate BIM models, capable of converting extensive point cloud datasets for entire buildings into IFC format with minimal user input. 
</p> </div> </dd> </dl> <div class='paging'>Total of 34 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href="/list/cs.SE/new?skip=0&amp;show=1000" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon 
filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10