CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src="https://static.arxiv.org/MathJax-2.7.3/MathJax.js"></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;13 of 13 results for author: <span class="mathjax">Gema, A P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Gema%2C+A+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Gema, A P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gema%2C+A+P&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gema, A P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18860">arXiv:2410.18860</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18860">pdf</a>, <a href="https://arxiv.org/format/2410.18860">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeCoRe: Decoding by Contrasting Retrieval Heads to Mitigate Hallucinations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chen Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Abdulaal%2C+A">Ahmed Abdulaal</a>, <a href="/search/cs?searchtype=author&amp;query=Diethe%2C+T">Tom Diethe</a>, <a href="/search/cs?searchtype=author&amp;query=Teare%2C+P">Philip Teare</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a>, <a 
href="/search/cs?searchtype=author&amp;query=Saseendran%2C+A">Amrutha Saseendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18860v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) often hallucinate, producing unfaithful or factually incorrect outputs by misrepresenting the provided context or incorrectly recalling internal knowledge. Recent studies have identified specific attention heads within the Transformer architecture, known as retrieval heads, responsible for extracting relevant contextual information. We hypothesise that masking these re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18860v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18860v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18860v1-abstract-full" style="display: none;"> Large Language Models (LLMs) often hallucinate, producing unfaithful or factually incorrect outputs by misrepresenting the provided context or incorrectly recalling internal knowledge. Recent studies have identified specific attention heads within the Transformer architecture, known as retrieval heads, responsible for extracting relevant contextual information. We hypothesise that masking these retrieval heads can induce hallucinations and that contrasting the outputs of the base LLM and the masked LLM can reduce hallucinations. To this end, we propose Decoding by Contrasting Retrieval Heads (DeCoRe), a novel training-free decoding strategy that amplifies information found in the context and model parameters. DeCoRe mitigates potentially hallucinated responses by dynamically contrasting the outputs of the base LLM and the masked LLM, using conditional entropy as a guide. 
Our extensive experiments confirm that DeCoRe significantly improves performance on tasks requiring high contextual faithfulness, such as summarisation (XSum by 18.6%), instruction following (MemoTrap by 10.9%), and open-book question answering (NQ-Open by 2.4% and NQ-Swap by 5.5%). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18860v1-abstract-full').style.display = 'none'; document.getElementById('2410.18860v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16090">arXiv:2410.16090</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16090">pdf</a>, <a href="https://arxiv.org/format/2410.16090">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Analysing the Residual Stream of Language Models Under Knowledge Conflicts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Devoto%2C+A">Alessio Devoto</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+K">Kam-Fai Wong</a>, 
<a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16090v1-abstract-short" style="display: inline;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context. Such conflicts can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. In this work, we investigate whether LLMs can identify knowledge conflicts and whether it is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16090v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16090v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16090v1-abstract-full" style="display: none;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context. Such conflicts can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. In this work, we investigate whether LLMs can identify knowledge conflicts and whether it is possible to know which source of knowledge the model will rely on by analysing the residual stream of the LLM. Through probing tasks, we find that LLMs can internally register the signal of knowledge conflict in the residual stream, which can be accurately detected by probing the intermediate model activations. This allows us to detect conflicts within the residual stream before generating the answers without modifying the input or model parameters. 
Moreover, we find that the residual stream shows significantly different patterns when the model relies on contextual knowledge versus parametric knowledge to resolve conflicts. This pattern can be employed to estimate the behaviour of LLMs when conflict happens and prevent unexpected answers before producing the answers. Our analysis offers insights into how LLMs internally manage knowledge conflicts and provides a foundation for developing methods to control the knowledge selection processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16090v1-abstract-full').style.display = 'none'; document.getElementById('2410.16090v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Foundation Model Interventions Workshop @ NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15999">arXiv:2410.15999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15999">pdf</a>, <a href="https://arxiv.org/format/2410.15999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Steering Knowledge Selection Behaviours in LLMs via SAE-Based Representation Engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Devoto%2C+A">Alessio Devoto</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+K">Kam-Fai Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15999v2-abstract-short" style="display: inline;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context -- this phenomenon, known as \emph{context-memory knowledge conflicts}, can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. Analysing the internal activations o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15999v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15999v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15999v2-abstract-full" style="display: none;"> Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context -- this phenomenon, known as \emph{context-memory knowledge conflicts}, can lead to undesirable model behaviour, such as reliance on outdated or incorrect information. 
Analysing the internal activations of LLMs, we find that they can internally register the signals of knowledge conflict at mid-layers. Such signals allow us to detect whether a knowledge conflict occurs and use \emph{inference-time} intervention strategies to resolve it. In this work, we propose \textsc{SpARE}, a \emph{training-free} representation engineering method that uses pre-trained sparse auto-encoders (SAEs) to control the knowledge selection behaviour of LLMs. \textsc{SpARE} identifies the functional features that control the knowledge selection behaviours and applies them to edit the internal activations of LLMs at inference time. Our experimental results show that \textsc{SpARE} can effectively control the usage of either knowledge source to resolve knowledge conflict in open-domain question-answering tasks, surpassing existing representation engineering methods ($+10\%$) as well as contrastive decoding methods ($+15\%$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15999v2-abstract-full').style.display = 'none'; document.getElementById('2410.15999v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10336">arXiv:2410.10336</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10336">pdf</a>, <a href="https://arxiv.org/format/2410.10336">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Symbolic Computation">cs.SC</span> </div> </div> <p class="title is-5 mathjax"> CoMAT: Chain of Mathematically Annotated Thought Improves Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Leang%2C+J+O+J">Joshua Ong Jun Leang</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+S+B">Shay B. Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10336v1-abstract-short" style="display: inline;"> Mathematical reasoning remains a significant challenge for large language models (LLMs), despite progress in prompting techniques such as Chain-of-Thought (CoT). 
We present Chain of Mathematically Annotated Thought (CoMAT), which enhances reasoning through two stages: Symbolic Conversion (converting natural language queries into symbolic form) and Reasoning Execution (deriving answers from symboli&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10336v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10336v1-abstract-full" style="display: none;"> Mathematical reasoning remains a significant challenge for large language models (LLMs), despite progress in prompting techniques such as Chain-of-Thought (CoT). We present Chain of Mathematically Annotated Thought (CoMAT), which enhances reasoning through two stages: Symbolic Conversion (converting natural language queries into symbolic form) and Reasoning Execution (deriving answers from symbolic representations). CoMAT operates entirely with a single LLM and without external solvers. Across four LLMs, CoMAT outperforms traditional CoT on six out of seven benchmarks, achieving gains of 4.48% on MMLU-Redux (MATH) and 4.58% on GaoKao MCQ. In addition to improved performance, CoMAT ensures faithfulness and verifiability, offering a transparent reasoning process for complex mathematical tasks <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10336v1-abstract-full').style.display = 'none'; document.getElementById('2410.10336v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16593">arXiv:2407.16593</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16593">pdf</a>, <a href="https://arxiv.org/format/2407.16593">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Comparative Study on Patient Language across Therapeutic Domains for Effective Patient Voice Classification in Online Health Discussions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lysandrou%2C+G">Giorgos Lysandrou</a>, <a href="/search/cs?searchtype=author&amp;query=Owen%2C+R+E">Roma English Owen</a>, <a href="/search/cs?searchtype=author&amp;query=Popovic%2C+V">Vanja Popovic</a>, <a href="/search/cs?searchtype=author&amp;query=Brun%2C+G+L">Grant Le Brun</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a>, <a href="/search/cs?searchtype=author&amp;query=Fairley%2C+E+A+L">Elizabeth A. L. Fairley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16593v1-abstract-short" style="display: inline;"> There exists an invisible barrier between healthcare professionals&#39; perception of a patient&#39;s clinical experience and the reality. 
This barrier may be induced by the environment that hinders patients from sharing their experiences openly with healthcare professionals. As patients are observed to discuss and exchange knowledge more candidly on social media, valuable insights can be leveraged from t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16593v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16593v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16593v1-abstract-full" style="display: none;"> There exists an invisible barrier between healthcare professionals&#39; perception of a patient&#39;s clinical experience and the reality. This barrier may be induced by the environment that hinders patients from sharing their experiences openly with healthcare professionals. As patients are observed to discuss and exchange knowledge more candidly on social media, valuable insights can be leveraged from these platforms. However, the abundance of non-patient posts on social media necessitates filtering out such irrelevant content to distinguish the genuine voices of patients, a task we refer to as patient voice classification. In this study, we analyse the importance of linguistic characteristics in accurately classifying patient voices. Our findings underscore the essential role of linguistic and statistical text similarity analysis in identifying common patterns among patient groups. These results allude to even starker differences in the way patients express themselves at a disease level and across various therapeutic domains. Additionally, we fine-tuned a pre-trained Language Model on the combined datasets with similar linguistic patterns, resulting in a highly accurate automatic patient voice classification. 
Being the pioneering study on the topic, our focus on extracting authentic patient experiences from social media stands as a crucial step towards advancing healthcare standards and fostering a patient-centric approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16593v1-abstract-full').style.display = 'none'; document.getElementById('2407.16593v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 4 figures, 5 tables, funded by Talking Medicines Limited</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04127">arXiv:2406.04127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.04127">pdf</a>, <a href="https://arxiv.org/format/2406.04127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Are We Done with MMLU? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Leang%2C+J+O+J">Joshua Ong Jun Leang</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Devoto%2C+A">Alessio Devoto</a>, <a href="/search/cs?searchtype=author&amp;query=Mancino%2C+A+C+M">Alberto Carlo Maria Mancino</a>, <a href="/search/cs?searchtype=author&amp;query=Saxena%2C+R">Rohit Saxena</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&amp;query=Madani%2C+M+R+G">Mohammad Reza Ghasemi Madani</a>, <a href="/search/cs?searchtype=author&amp;query=Barale%2C+C">Claire Barale</a>, <a href="/search/cs?searchtype=author&amp;query=McHardy%2C+R">Robert McHardy</a>, <a href="/search/cs?searchtype=author&amp;query=Harris%2C+J">Joshua Harris</a>, <a href="/search/cs?searchtype=author&amp;query=Kaddour%2C+J">Jean Kaddour</a>, <a href="/search/cs?searchtype=author&amp;query=van+Krieken%2C+E">Emile van Krieken</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04127v2-abstract-short" style="display: inline;"> Maybe not. We identify and analyse errors in the popular Massive Multitask Language Understanding (MMLU) benchmark. Even though MMLU is widely adopted, our analysis demonstrates numerous ground truth errors that obscure the true capabilities of LLMs. For example, we find that 57% of the analysed questions in the Virology subset contain errors. 
To address this issue, we introduce a comprehensive fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04127v2-abstract-full').style.display = 'inline'; document.getElementById('2406.04127v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04127v2-abstract-full" style="display: none;"> Maybe not. We identify and analyse errors in the popular Massive Multitask Language Understanding (MMLU) benchmark. Even though MMLU is widely adopted, our analysis demonstrates numerous ground truth errors that obscure the true capabilities of LLMs. For example, we find that 57% of the analysed questions in the Virology subset contain errors. To address this issue, we introduce a comprehensive framework for identifying dataset errors using a novel error taxonomy. Then, we create MMLU-Redux, which is a subset of 3,000 manually re-annotated questions across 30 MMLU subjects. Using MMLU-Redux, we demonstrate significant discrepancies with the model performance metrics that were originally reported. Our results strongly advocate for revising MMLU&#39;s error-ridden questions to enhance its future utility and reliability as a benchmark. Therefore, we open up MMLU-Redux for additional annotation https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04127v2-abstract-full').style.display = 'none'; document.getElementById('2406.04127v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18028">arXiv:2405.18028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18028">pdf</a>, <a href="https://arxiv.org/format/2405.18028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Edinburgh Clinical NLP at MEDIQA-CORR 2024: Guiding Large Language Models with Hints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+C">Chaeeun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&amp;query=Daines%2C+L">Luke Daines</a>, <a href="/search/cs?searchtype=author&amp;query=Simpson%2C+T+I">T. Ian Simpson</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18028v1-abstract-short" style="display: inline;"> The MEDIQA-CORR 2024 shared task aims to assess the ability of Large Language Models (LLMs) to identify and correct medical errors in clinical notes. In this study, we evaluate the capability of general LLMs, specifically GPT-3.5 and GPT-4, to identify and correct medical errors with multiple prompting strategies. 
Recognising the limitation of LLMs in generating accurate corrections only via promp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18028v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18028v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18028v1-abstract-full" style="display: none;"> The MEDIQA-CORR 2024 shared task aims to assess the ability of Large Language Models (LLMs) to identify and correct medical errors in clinical notes. In this study, we evaluate the capability of general LLMs, specifically GPT-3.5 and GPT-4, to identify and correct medical errors with multiple prompting strategies. Recognising the limitation of LLMs in generating accurate corrections only via prompting strategies, we propose incorporating error-span predictions from a smaller, fine-tuned model in two ways: 1) by presenting it as a hint in the prompt and 2) by framing it as multiple-choice questions from which the LLM can choose the best correction. We found that our proposed prompting strategies significantly improve the LLM&#39;s ability to generate corrections. Our best-performing solution with 8-shot + CoT + hints ranked sixth in the shared task leaderboard. Additionally, our comprehensive analyses show the impact of the location of the error sentence, the prompted role, and the position of the multiple-choice option on the accuracy of the LLM. This prompts further questions about the readiness of LLM to be implemented in real-world clinical settings. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18028v1-abstract-full').style.display = 'none'; document.getElementById('2405.18028v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.05904">arXiv:2404.05904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.05904">pdf</a>, <a href="https://arxiv.org/format/2404.05904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Hallucinations Leaderboard -- An Open Effort to Measure Hallucinations in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Saxena%2C+R">Rohit Saxena</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xiaotang Du</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+P">Ping Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Perez-Beltrachini%2C+L">Laura Perez-Beltrachini</a>, <a href="/search/cs?searchtype=author&amp;query=Ryabinin%2C+M">Max Ryabinin</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuanli He</a>, <a href="/search/cs?searchtype=author&amp;query=Fourrier%2C+C">Clémentine Fourrier</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale 
Minervini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.05904v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have transformed the Natural Language Processing (NLP) landscape with their remarkable ability to understand and generate human-like text. However, these models are prone to ``hallucinations&#39;&#39; -- outputs that do not align with factual reality or the input context. This paper introduces the Hallucinations Leaderboard, an open initiative to quantitatively measure and com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05904v2-abstract-full').style.display = 'inline'; document.getElementById('2404.05904v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.05904v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have transformed the Natural Language Processing (NLP) landscape with their remarkable ability to understand and generate human-like text. However, these models are prone to ``hallucinations&#39;&#39; -- outputs that do not align with factual reality or the input context. This paper introduces the Hallucinations Leaderboard, an open initiative to quantitatively measure and compare the tendency of each model to produce hallucinations. The leaderboard uses a comprehensive set of benchmarks focusing on different aspects of hallucinations, such as factuality and faithfulness, across various tasks, including question-answering, summarisation, and reading comprehension. Our analysis provides insights into the performance of different models, guiding researchers and practitioners in choosing the most reliable models for their applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05904v2-abstract-full').style.display = 'none'; document.getElementById('2404.05904v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00484">arXiv:2404.00484</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00484">pdf</a>, <a href="https://arxiv.org/format/2404.00484">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Edinburgh Clinical NLP at SemEval-2024 Task 2: Fine-tune your model unless you have access to GPT-4 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&amp;query=Daines%2C+L">Luke Daines</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00484v1-abstract-short" style="display: inline;"> The NLI4CT task assesses Natural Language Inference systems in predicting whether hypotheses entail or contradict evidence from Clinical Trial Reports. 
In this study, we evaluate various Large Language Models (LLMs) with multiple strategies, including Chain-of-Thought, In-Context Learning, and Parameter-Efficient Fine-Tuning (PEFT). We propose a PEFT method to improve the consistency of LLMs by me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00484v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00484v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00484v1-abstract-full" style="display: none;"> The NLI4CT task assesses Natural Language Inference systems in predicting whether hypotheses entail or contradict evidence from Clinical Trial Reports. In this study, we evaluate various Large Language Models (LLMs) with multiple strategies, including Chain-of-Thought, In-Context Learning, and Parameter-Efficient Fine-Tuning (PEFT). We propose a PEFT method to improve the consistency of LLMs by merging adapters that were fine-tuned separately using triplet and language modelling objectives. We found that merging the two PEFT adapters improves the F1 score (+0.0346) and consistency (+0.152) of the LLMs. However, our novel methods did not produce more accurate results than GPT-4 in terms of faithfulness and consistency. Averaging the three metrics, GPT-4 ranks joint-first in the competition with 0.8328. Finally, our contamination analysis with GPT-4 indicates that there was no test data leakage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00484v1-abstract-full').style.display = 'none'; document.getElementById('2404.00484v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.13512">arXiv:2401.13512</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.13512">pdf</a>, <a href="https://arxiv.org/format/2401.13512">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1093/jamia/ocae132">10.1093/jamia/ocae132 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Can GPT-3.5 Generate and Code Discharge Summaries? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Falis%2C+M">Matúš Falis</a>, <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Hang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Daines%2C+L">Luke Daines</a>, <a href="/search/cs?searchtype=author&amp;query=Basetti%2C+S">Siddharth Basetti</a>, <a href="/search/cs?searchtype=author&amp;query=Holder%2C+M">Michael Holder</a>, <a href="/search/cs?searchtype=author&amp;query=Penfold%2C+R+S">Rose S Penfold</a>, <a href="/search/cs?searchtype=author&amp;query=Birch%2C+A">Alexandra Birch</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.13512v2-abstract-short" style="display: inline;"> Objective: To investigate GPT-3.5 in generating and coding medical documents with ICD-10 codes for data 
augmentation on low-resources labels. Materials and Methods: Employing GPT-3.5 we generated and coded 9,606 discharge summaries based on lists of ICD-10 code descriptions of patients with infrequent (generation) codes within the MIMIC-IV dataset. Combined with the baseline training set, this f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13512v2-abstract-full').style.display = 'inline'; document.getElementById('2401.13512v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.13512v2-abstract-full" style="display: none;"> Objective: To investigate GPT-3.5 in generating and coding medical documents with ICD-10 codes for data augmentation on low-resources labels. Materials and Methods: Employing GPT-3.5 we generated and coded 9,606 discharge summaries based on lists of ICD-10 code descriptions of patients with infrequent (generation) codes within the MIMIC-IV dataset. Combined with the baseline training set, this formed an augmented training set. Neural coding models were trained on baseline and augmented data and evaluated on a MIMIC-IV test set. We report micro- and macro-F1 scores on the full codeset, generation codes, and their families. Weak Hierarchical Confusion Matrices were employed to determine within-family and outside-of-family coding errors in the latter codesets. The coding performance of GPT-3.5 was evaluated both on prompt-guided self-generated data and real MIMIC-IV data. Clinical professionals evaluated the clinical acceptability of the generated documents. Results: Augmentation slightly hinders the overall performance of the models but improves performance for the generation candidate codes and their families, including one unseen in the baseline training data. Augmented models display lower out-of-family error rates. GPT-3.5 can identify ICD-10 codes by the prompted descriptions, but performs poorly on real data. 
Evaluators note the correctness of generated concepts while suffering in variety, supporting information, and narrative. Discussion and Conclusion: GPT-3.5 alone is unsuitable for ICD-10 coding. Augmentation positively affects generation code families but mainly benefits codes with existing examples. Augmentation reduces out-of-family errors. Discharge summaries generated by GPT-3.5 state prompted concepts correctly but lack variety, and authenticity in narratives. They are unsuitable for clinical practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13512v2-abstract-full').style.display = 'none'; document.getElementById('2401.13512v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages; 250 words in abstract; 4,152 words in main body; 4 figures (1 black and white, 3 colour); 4 tables; 34 references; Accepted and published by the Journal of the American Medical Informatics Association</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of the American Medical Informatics Association, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.03042">arXiv:2307.03042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.03042">pdf</a>, <a href="https://arxiv.org/format/2307.03042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Parameter-Efficient Fine-Tuning of LLaMA for the Clinical Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&amp;query=Daines%2C+L">Luke Daines</a>, <a href="/search/cs?searchtype=author&amp;query=Hope%2C+T">Tom Hope</a>, <a href="/search/cs?searchtype=author&amp;query=Alex%2C+B">Beatrice Alex</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.03042v3-abstract-short" style="display: inline;"> Adapting pretrained language models to novel domains, such as clinical applications, 
traditionally involves retraining their entire set of parameters. Parameter-Efficient Fine-Tuning (PEFT) techniques for fine-tuning language models significantly reduce computational requirements by selectively fine-tuning small subsets of parameters. In this study, we propose a two-step PEFT framework and evaluat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03042v3-abstract-full').style.display = 'inline'; document.getElementById('2307.03042v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.03042v3-abstract-full" style="display: none;"> Adapting pretrained language models to novel domains, such as clinical applications, traditionally involves retraining their entire set of parameters. Parameter-Efficient Fine-Tuning (PEFT) techniques for fine-tuning language models significantly reduce computational requirements by selectively fine-tuning small subsets of parameters. In this study, we propose a two-step PEFT framework and evaluate it in the clinical domain. Our approach combines a specialised PEFT adapter layer designed for clinical domain adaptation with another adapter specialised for downstream tasks. We evaluate the framework on multiple clinical outcome prediction datasets, comparing it to clinically trained language models. Our framework achieves a better AUROC score averaged across all clinical downstream tasks compared to clinical language models. In particular, we observe large improvements of 4-5% AUROC in large-scale multilabel classification tasks, such as diagnoses and procedures classification. To our knowledge, this study is the first to provide an extensive empirical analysis of the interplay between PEFT techniques and domain adaptation in an important real-world domain of clinical applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03042v3-abstract-full').style.display = 'none'; document.getElementById('2307.03042v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19979">arXiv:2305.19979</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.19979">pdf</a>, <a href="https://arxiv.org/format/2305.19979">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Knowledge Graph Embeddings in the Biomedical Domain: Are They Useful? 
A Look at Link Prediction, Rule Learning, and Downstream Polypharmacy Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Grabarczyk%2C+D">Dominik Grabarczyk</a>, <a href="/search/cs?searchtype=author&amp;query=De+Wulf%2C+W">Wolf De Wulf</a>, <a href="/search/cs?searchtype=author&amp;query=Borole%2C+P">Piyush Borole</a>, <a href="/search/cs?searchtype=author&amp;query=Alfaro%2C+J+A">Javier Antonio Alfaro</a>, <a href="/search/cs?searchtype=author&amp;query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&amp;query=Vergari%2C+A">Antonio Vergari</a>, <a href="/search/cs?searchtype=author&amp;query=Rajan%2C+A">Ajitha Rajan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19979v2-abstract-short" style="display: inline;"> Knowledge graphs are powerful tools for representing and organising complex biomedical data. Several knowledge graph embedding algorithms have been proposed to learn from and complete knowledge graphs. However, a recent study demonstrates the limited efficacy of these embedding algorithms when applied to biomedical knowledge graphs, raising the question of whether knowledge graph embeddings have l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19979v2-abstract-full').style.display = 'inline'; document.getElementById('2305.19979v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19979v2-abstract-full" style="display: none;"> Knowledge graphs are powerful tools for representing and organising complex biomedical data. 
Several knowledge graph embedding algorithms have been proposed to learn from and complete knowledge graphs. However, a recent study demonstrates the limited efficacy of these embedding algorithms when applied to biomedical knowledge graphs, raising the question of whether knowledge graph embeddings have limitations in biomedical settings. This study aims to apply state-of-the-art knowledge graph embedding models in the context of a recent biomedical knowledge graph, BioKG, and evaluate their performance and potential downstream uses. We achieve a three-fold improvement in terms of performance based on the HITS@10 score over previous work on the same biomedical knowledge graph. Additionally, we provide interpretable predictions through a rule-based method. We demonstrate that knowledge graph embedding models are applicable in practice by evaluating the best-performing model on four tasks that represent real-life polypharmacy situations. Results suggest that knowledge learnt from large biomedical knowledge graphs can be transferred to such downstream use cases. Our code is available at https://github.com/aryopg/biokge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19979v2-abstract-full').style.display = 'none'; document.getElementById('2305.19979v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.11194">arXiv:2305.11194</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.11194">pdf</a>, <a href="https://arxiv.org/format/2305.11194">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Vaxformer: Antigenicity-controlled Transformer for Vaccine Design Against SARS-CoV-2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&amp;query=Kobiela%2C+M">Michał Kobiela</a>, <a href="/search/cs?searchtype=author&amp;query=Fraisse%2C+A">Achille Fraisse</a>, <a href="/search/cs?searchtype=author&amp;query=Rajan%2C+A">Ajitha Rajan</a>, <a href="/search/cs?searchtype=author&amp;query=Oyarz%C3%BAn%2C+D+A">Diego A. Oyarzún</a>, <a href="/search/cs?searchtype=author&amp;query=Alfaro%2C+J+A">Javier Antonio Alfaro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.11194v1-abstract-short" style="display: inline;"> The SARS-CoV-2 pandemic has emphasised the importance of developing a universal vaccine that can protect against current and future variants of the virus. The present study proposes a novel conditional protein Language Model architecture, called Vaxformer, which is designed to produce natural-looking antigenicity-controlled SARS-CoV-2 spike proteins. 
We evaluate the generated protein sequences of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11194v1-abstract-full').style.display = 'inline'; document.getElementById('2305.11194v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.11194v1-abstract-full" style="display: none;"> The SARS-CoV-2 pandemic has emphasised the importance of developing a universal vaccine that can protect against current and future variants of the virus. The present study proposes a novel conditional protein Language Model architecture, called Vaxformer, which is designed to produce natural-looking antigenicity-controlled SARS-CoV-2 spike proteins. We evaluate the generated protein sequences of the Vaxformer model using DDGun protein stability measure, netMHCpan antigenicity score, and a structure fidelity score with AlphaFold to gauge its viability for vaccine development. Our results show that Vaxformer outperforms the existing state-of-the-art Conditional Variational Autoencoder model to generate antigenicity-controlled SARS-CoV-2 spike proteins. These findings suggest promising opportunities for conditional Transformer models to expand our understanding of vaccine design and their role in mitigating global health challenges. The code used in this study is available at https://github.com/aryopg/vaxformer . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11194v1-abstract-full').style.display = 'none'; document.getElementById('2305.11194v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p>
</li>
</ol>
<!-- Release note for the search application; the is-hidden-tablet class is
     what restricts it to the mobile layout. -->
<div class="is-hidden-tablet">
  <!-- feedback for mobile only -->
  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span>
</div>
</div>
</main>
<!--
  Site footer: secondary navigation laid out as two "MetaColumns", each of
  which holds two link lists.

  Every inline SVG icon below sits next to a visible text label, so the icons
  are decorative and carry aria-hidden="true" to keep their <title>/<desc>
  text out of the accessibility tree.

  Links that open a new browsing context (target="_blank") carry
  rel="noopener noreferrer" so the opened page gets no window.opener handle.
-->
<footer>
  <div class="columns is-desktop" role="navigation" aria-label="Secondary">
    <!-- MetaColumn 1 -->
    <div class="column">
      <div class="columns">
        <div class="column">
          <ul class="nav-spaced">
            <li><a href="https://info.arxiv.org/about">About</a></li>
            <li><a href="https://info.arxiv.org/help">Help</a></li>
          </ul>
        </div>
        <div class="column">
          <ul class="nav-spaced">
            <li>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>
              <a href="https://info.arxiv.org/help/contact.html"> Contact</a>
            </li>
            <li>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg>
              <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- end MetaColumn 1 -->
    <!-- MetaColumn 2 -->
    <div class="column">
      <div class="columns">
        <div class="column">
          <ul class="nav-spaced">
            <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li>
            <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li>
          </ul>
        </div>
        <div class="column sorry-app-links">
          <ul class="nav-spaced">
            <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li>
            <li>
              <!-- Whitespace inside this paragraph is significant for inline
                   layout: an icon and its label are kept adjacent exactly as
                   in the original markup. -->
              <p class="help">
                <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener noreferrer">arXiv Operational Status
                  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation" aria-hidden="true"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br>
                Get status notifications via
                <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a>
                or
                <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation" aria-hidden="true"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a>
              </p>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- end MetaColumn 2 -->
  </div>
</footer>
<!-- Loaded as the last element of <body>, after the footer markup. -->
<script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script>
</body>
</html>

Pages: 1 2 3 4 5 6 7 8 9 10