Search | arXiv e-print repository

Showing 1–50 of 62 results for author: Hou, W
Searching in archive cs.

1. arXiv:2411.10937 [cs.CV, cs.CL]
   Memory-Augmented Multimodal LLMs for Surgical VQA via Self-Contained Inquiry
   Authors: Wenjun Hou, Yi Cheng, Kaishuai Xu, Yan Hu, Wenjie Li, Jiang Liu
   Abstract: Comprehensively understanding surgical scenes in Surgical Visual Question Answering (Surgical VQA) requires reasoning over multiple objects. Previous approaches address this task using cross-modal fusion strategies to enhance reasoning ability. However, these methods often struggle with limited scene understanding and question comprehension, and some rely on external resources (e.g., pre-extracted object features), which can introduce errors and generalize poorly across diverse surgical environments. To address these challenges, we propose SCAN, a simple yet effective memory-augmented framework that leverages Multimodal LLMs to improve surgical context comprehension via Self-Contained Inquiry. SCAN operates autonomously, generating two types of memory for context augmentation: Direct Memory (DM), which provides multiple candidates (or hints) to the final answer, and Indirect Memory (IM), which consists of self-contained question-hint pairs to capture broader scene context. DM directly assists in answering the question, while IM enhances understanding of the surgical scene beyond the immediate query. Reasoning over these object-aware memories enables the model to accurately interpret images and respond to questions. Extensive experiments on three publicly available Surgical VQA datasets demonstrate that SCAN achieves state-of-the-art performance, offering improved accuracy and robustness across various surgical scenarios.
   Submitted 16 November, 2024; originally announced November 2024.
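
   The inquiry loop this abstract describes can be pictured with a short, purely illustrative sketch. It only shows the flow of the two memories into the final answer; `mllm` is a hypothetical (image, prompt) -> text callable standing in for the underlying multimodal LLM, and the prompts, parameter names, and helper name are assumptions, not taken from the paper.

       def answer_with_scan(mllm, image, question, n_hints=3, n_context_pairs=3):
           # Direct Memory: candidate answers (hints) for the question itself.
           dm_prompt = (f"Question: {question}\n"
                        f"List {n_hints} plausible candidate answers, one per line.")
           direct_memory = mllm(image, dm_prompt).splitlines()

           # Indirect Memory: self-contained question-hint pairs about the wider scene.
           im_prompt = (f"Ask and briefly answer {n_context_pairs} questions about the "
                        f"objects, tools, and actions visible in this surgical scene, "
                        f"one 'Q: ... A: ...' pair per line.")
           indirect_memory = mllm(image, im_prompt).splitlines()

           # Final answer: reason over both memories together with the original question.
           final_prompt = ("Candidate answers:\n" + "\n".join(direct_memory) + "\n\n"
                           "Scene context:\n" + "\n".join(indirect_memory) + "\n\n"
                           f"Using the hints and context above, answer: {question}")
           return mllm(image, final_prompt)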

2. arXiv:2411.10309 [cs.CV]
   Modification Takes Courage: Seamless Image Stitching via Reference-Driven Inpainting
   Authors: Ziqi Xie, Xiao Lai, Weidong Zhao, Xianhui Liu, Wenlong Hou
   Abstract: Current image stitching methods often produce noticeable seams in challenging scenarios such as uneven hue and large parallax. To tackle this problem, we propose the Reference-Driven Inpainting Stitcher (RDIStitcher), which reformulates the image fusion and rectangling as a reference-based inpainting model, incorporating a larger modification fusion area and stronger modification intensity than previous methods. Furthermore, we introduce a self-supervised model training method, which enables the implementation of RDIStitcher without requiring labeled data by fine-tuning a Text-to-Image (T2I) diffusion model. Recognizing difficulties in assessing the quality of stitched images, we present the Multimodal Large Language Models (MLLMs)-based metrics, offering a new perspective on evaluating stitched image quality. Compared to the state-of-the-art (SOTA) method, extensive experiments demonstrate that our method significantly enhances content coherence and seamless transitions in the stitched images. Especially in the zero-shot experiments, our method exhibits strong generalization capabilities.
   Code: https://github.com/yayoyo66/RDIStitcher
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: 17 pages, 10 figures

3. arXiv:2411.01178 [cs.IR]
   LLM4PR: Improving Post-Ranking in Search Engine with Large Language Models
   Authors: Yang Yan, Yihao Wang, Chi Zhang, Wenyuan Hou, Kang Pan, Xingkai Ren, Zelun Wu, Zhixin Zhai, Enyun Yu, Wenwu Ou, Yang Song
   Abstract: Alongside the rapid development of Large Language Models (LLMs), there has been a notable increase in efforts to integrate LLM techniques in information retrieval (IR) and search engines (SE). Recently, an additional post-ranking stage is suggested in SE to enhance user satisfaction in practical applications. Nevertheless, research dedicated to enhancing the post-ranking stage through LLMs remains largely unexplored.
   In this study, we introduce a novel paradigm named Large Language Models for Post-Ranking in search engine (LLM4PR), which leverages the capabilities of LLMs to accomplish the post-ranking task in SE. Concretely, a Query-Instructed Adapter (QIA) module is designed to derive the user/item representation vectors by incorporating their heterogeneous features. A feature adaptation step is further introduced to align the semantics of user/item representations with the LLM. Finally, the LLM4PR integrates a learning to post-rank step, leveraging both a main task and an auxiliary task to fine-tune the model to adapt to the post-ranking task. Experiment studies demonstrate that the proposed framework leads to significant improvements and exhibits state-of-the-art performance compared with other alternatives.
   Submitted 2 November, 2024; originally announced November 2024.

4. arXiv:2410.07230 [eess.SP, cs.HC, cs.LG] doi:10.1145/3659620
   RFBoost: Understanding and Boosting Deep WiFi Sensing via Physical Data Augmentation
   Authors: Weiying Hou, Chenshu Wu
   Abstract: Deep learning shows promising performance in wireless sensing. However, deep wireless sensing (DWS) heavily relies on large datasets. Unfortunately, building comprehensive datasets for DWS is difficult and costly, because wireless data depends on environmental factors and cannot be labeled offline. Despite recent advances in few-shot/cross-domain learning, DWS is still facing data scarcity issues. In this paper, we investigate a distinct perspective of radio data augmentation (RDA) for WiFi sensing and present a data-space solution. Our key insight is that wireless signals inherently exhibit data diversity, contributing more information to be extracted for DWS. We present RFBoost, a simple and effective RDA framework encompassing novel physical data augmentation techniques. We implement RFBoost as a plug-and-play module integrated with existing deep models and evaluate it on multiple datasets. Experimental results demonstrate that RFBoost achieves remarkable average accuracy improvements of 5.4% on existing models without additional data collection or model modifications, and the best-boosted performance outperforms 11 state-of-the-art baseline models without RDA. RFBoost pioneers the study of RDA, an important yet currently underexplored building block for DWS, which we expect to become a standard DWS component of WiFi sensing and beyond. RFBoost is released at https://github.com/aiot-lab/RFBoost.
   Submitted 3 October, 2024; originally announced October 2024.
   Comments: Proc. ACM Interact. Mob. Wearable Ubiquitous Technol. 8, 2, Article 58 (June 2024), 26 pages
   Journal ref: Proc. ACM Interact. Mob. Wearable Ubiquitous Technol. 8, 2, Article 58 (June 2024), 26 pages

5. arXiv:2410.06638 [cs.CL, cs.AI]
   Subtle Errors Matter: Preference Learning via Error-injected Self-editing
   Authors: Kaishuai Xu, Tiezheng Yu, Wenjun Hou, Yi Cheng, Chak Tou Leong, Liangyou Li, Xin Jiang, Lifeng Shang, Qun Liu, Wenjie Li
   Abstract: Large Language Models (LLMs) have exhibited strong mathematical reasoning and computational prowess, tackling tasks ranging from basic arithmetic to advanced competition-level problems. However, frequently occurring subtle errors, such as miscalculations or incorrect substitutions, limit the models' full mathematical potential. Existing studies to improve mathematical ability typically involve distilling reasoning skills from stronger LLMs or applying preference learning to step-wise response pairs. Although these methods leverage samples of varying granularity to mitigate reasoning errors, they overlook the frequently occurring subtle errors. A major reason is that sampled preference pairs involve differences unrelated to the errors, which may distract the model from focusing on subtle errors. In this work, we propose a novel preference learning framework called eRror-Injected Self-Editing (RISE), which injects predefined subtle errors into partial tokens of correct solutions to construct hard pairs for error mitigation. In detail, RISE uses the model itself to edit a small number of tokens in the solution, injecting designed subtle errors. Then, pairs composed of self-edited solutions and their corresponding correct ones, along with pairs of correct and incorrect solutions obtained through sampling, are used together for subtle error-aware DPO training. Compared with other preference learning methods, RISE further refines the training objective to focus on predefined errors and their tokens, without requiring fine-grained sampling or preference annotation. Extensive experiments validate the effectiveness of RISE, with preference learning on Qwen2-7B-Instruct yielding notable improvements of 3.0% on GSM8K and 7.9% on MATH.
   Submitted 9 October, 2024; originally announced October 2024.
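
   The pair-construction step this abstract describes (a correct solution paired against the same solution with a few tokens self-edited to contain a subtle error, mixed with ordinary sampled correct/incorrect pairs) can be sketched as follows. This is only an illustration under assumptions: `inject_subtle_error` is a hypothetical stand-in for prompting the model itself to perform the edit, and every name here is invented rather than the authors' code.

       def build_preference_pairs(problems, correct_solutions, sampled_incorrect,
                                  inject_subtle_error):
           pairs = []
           for question, solution in zip(problems, correct_solutions):
               # Rejected response = the correct solution with a predefined subtle
               # error (e.g., a miscalculation or wrong substitution) edited in.
               edited = inject_subtle_error(question, solution)
               pairs.append({"prompt": question, "chosen": solution, "rejected": edited})
           # The abstract also mixes in ordinary correct/incorrect sampled pairs
           # before running DPO training on the combined set.
           for question, solution, incorrect in sampled_incorrect:
               pairs.append({"prompt": question, "chosen": solution, "rejected": incorrect})
           return pairs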

6. arXiv:2410.01556 [cs.CL, cs.AI, cs.LG]
   Integrative Decoding: Improve Factuality via Implicit Self-consistency
   Authors: Yi Cheng, Xiao Liang, Yeyun Gong, Wen Xiao, Song Wang, Yuji Zhang, Wenjun Hou, Kaishuai Xu, Wenge Liu, Wenjie Li, Jian Jiao, Qi Chen, Peng Cheng, Wayne Xiong
   Abstract: Self-consistency-based approaches, which involve repeatedly sampling multiple outputs and selecting the most consistent one as the final response, prove to be remarkably effective in improving the factual accuracy of large language models. Nonetheless, existing methods usually have strict constraints on the task format, largely limiting their applicability. In this paper, we present Integrative Decoding (ID) to unlock the potential of self-consistency in open-ended generation tasks. ID operates by constructing a set of inputs, each prepended with a previously sampled response, and then processes them concurrently, with the next token being selected by aggregating all their corresponding predictions at each decoding step. In essence, this simple approach implicitly incorporates self-consistency in the decoding objective. Extensive evaluation shows that ID consistently enhances factuality over a wide range of language models, with substantial improvements on the TruthfulQA (+11.2%), Biographies (+15.4%) and LongFact (+8.5%) benchmarks. The performance gains amplify progressively as the number of sampled responses increases, indicating the potential of ID to scale up with repeated sampling.
   Submitted 2 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
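
   The decoding rule this abstract describes is concrete enough to sketch: k inputs, each a previously sampled response prepended to the prompt, are decoded in lockstep, and at every step the next token is chosen by aggregating the k next-token distributions. The sketch below is illustrative only; `next_token_logprobs` is a random placeholder for a real language model, and averaging log-probabilities is just one plausible aggregation choice, not necessarily the authors'.

       import numpy as np

       _rng = np.random.default_rng(0)

       def next_token_logprobs(token_ids, vocab_size=100):
           # Placeholder LM head for demonstration: returns normalized log-probs.
           logits = _rng.normal(size=vocab_size)
           return logits - np.log(np.exp(logits).sum())

       def integrative_decode(prompt_ids, sampled_responses, max_new_tokens=20, eos_id=0):
           # One input per previously sampled response, each prepended to the prompt.
           inputs = [list(response) + list(prompt_ids) for response in sampled_responses]
           generated = []
           for _ in range(max_new_tokens):
               # Aggregate the per-input predictions (here: mean of log-probs).
               aggregated = np.mean([next_token_logprobs(ids) for ids in inputs], axis=0)
               token = int(np.argmax(aggregated))
               if token == eos_id:
                   break
               generated.append(token)
               for ids in inputs:  # every input advances with the same selected token
                   ids.append(token)
           return generated

   For example, integrative_decode([1, 2, 3], [[4, 5], [6, 7]]) walks both augmented inputs forward together and returns the tokens picked by the aggregated distributions.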

7. arXiv:2408.14868 [cs.CV]
   ZeroMamba: Exploring Visual State Space Model for Zero-Shot Learning
   Authors: Wenjin Hou, Dingjie Fu, Kun Li, Shiming Chen, Hehe Fan, Yi Yang
   Abstract: Zero-shot learning (ZSL) aims to recognize unseen classes by transferring semantic knowledge from seen classes to unseen ones, guided by semantic information. To this end, existing works have demonstrated remarkable performance by utilizing global visual features from Convolutional Neural Networks (CNNs) or Vision Transformers (ViTs) for visual-semantic interactions. Due to the limited receptive fields of CNNs and the quadratic complexity of ViTs, however, these visual backbones achieve suboptimal visual-semantic interactions. In this paper, motivated by the visual state space model (i.e., Vision Mamba), which is capable of capturing long-range dependencies and modeling complex visual dynamics, we propose a parameter-efficient ZSL framework called ZeroMamba to advance ZSL. Our ZeroMamba comprises three key components: Semantic-aware Local Projection (SLP), Global Representation Learning (GRL), and Semantic Fusion (SeF). Specifically, SLP integrates semantic embeddings to map visual features to local semantic-related representations, while GRL encourages the model to learn global semantic representations. SeF combines these two semantic representations to enhance the discriminability of semantic features. We incorporate these designs into Vision Mamba, forming an end-to-end ZSL framework. As a result, the learned semantic representations are better suited for classification.
   Through extensive experiments on four prominent ZSL benchmarks, ZeroMamba demonstrates superior performance, significantly outperforming the state-of-the-art (i.e., CNN-based and ViT-based) methods under both conventional ZSL (CZSL) and generalized ZSL (GZSL) settings. Code is available at: https://anonymous.4open.science/r/ZeroMamba.
   Submitted 27 August, 2024; originally announced August 2024.

8. arXiv:2408.01998 [cs.CV]
   What Happens Without Background? Constructing Foreground-Only Data for Fine-Grained Tasks
   Authors: Yuetian Wang, Wenjin Hou, Qinmu Peng, Xinge You
   Abstract: Fine-grained recognition, a pivotal task in visual signal processing, aims to distinguish between similar subclasses based on discriminative information present in samples. However, prevailing methods often erroneously focus on background areas, neglecting the capture of genuinely effective discriminative information from the subject, thus impeding practical application. To facilitate research into the impact of background noise on models and enhance their ability to concentrate on the subject's discriminative features, we propose an engineered pipeline that leverages the capabilities of SAM and Detic to create fine-grained datasets with only foreground subjects, devoid of background.
   Extensive cross-experiments validate this approach as a preprocessing step prior to training, enhancing algorithmic performance and holding potential for further modal expansion of the data.
   Submitted 4 August, 2024; originally announced August 2024.

9. arXiv:2406.13960 [cs.CL, cs.AI]
   AutoPal: Autonomous Adaptation to Users for Personal AI Companionship
   Authors: Yi Cheng, Wenge Liu, Kaishuai Xu, Wenjun Hou, Yi Ouyang, Chak Tou Leong, Xian Wu, Yefeng Zheng
   Abstract: Previous research has demonstrated the potential of AI agents to act as companions that can provide constant emotional support for humans. In this paper, we emphasize the necessity of autonomous adaptation in personal AI companionship, an underexplored yet promising direction. Such adaptability is crucial as it can facilitate more tailored interactions with users and allow the agent to evolve in response to users' changing needs.
   However, imbuing agents with autonomous adaptability presents unique challenges, including identifying optimal adaptations to meet users' expectations and ensuring a smooth transition during the adaptation process. To address them, we devise a hierarchical framework, AutoPal, that enables controllable and authentic adjustments to the agent's persona based on user interactions. A persona-matching dataset is constructed to facilitate the learning of optimal persona adaptations. Extensive experiments demonstrate the effectiveness of AutoPal and highlight the importance of autonomous adaptability in AI companionship.
   Submitted 17 October, 2024; v1 submitted 19 June, 2024; originally announced June 2024.

10. arXiv:2406.13934 [cs.CL, cs.AI]
   Reasoning Like a Doctor: Improving Medical Dialogue Systems via Diagnostic Reasoning Process Alignment
   Authors: Kaishuai Xu, Yi Cheng, Wenjun Hou, Qiaoyu Tan, Wenjie Li
   Abstract: Medical dialogue systems have attracted significant attention for their potential to act as medical assistants.
   Enabling these medical systems to emulate clinicians' diagnostic reasoning process has been the long-standing research focus. Previous studies rudimentarily realized the simulation of clinicians' diagnostic process by fine-tuning language models on high-quality dialogue datasets. Nonetheless, they overly focus on the outcomes of the clinician's reasoning process while ignoring their internal thought processes and alignment with clinician preferences. Our work aims to build a medical dialogue system that aligns with clinicians' diagnostic reasoning processes. We propose a novel framework, Emulation, designed to generate an appropriate response that relies on abductive and deductive diagnostic reasoning analyses and aligns with clinician preferences through thought process modeling. Experimental results on two datasets confirm the efficacy of Emulation. Crucially, our framework furnishes clear explanations for the generated responses, enhancing its transparency in medical consultations.
   Submitted 19 June, 2024; originally announced June 2024.
   Comments: Accepted by ACL 2024 Findings

11. arXiv:2406.02147 [cs.CV]
   UA-Track: Uncertainty-Aware End-to-End 3D Multi-Object Tracking
   Authors: Lijun Zhou, Tao Tang, Pengkun Hao, Zihang He, Kalok Ho, Shuo Gu, Wenbo Hou, Zhihui Hao, Haiyang Sun, Kun Zhan, Peng Jia, Xianpeng Lang, Xiaodan Liang
   Abstract: 3D multiple object tracking (MOT) plays a crucial role in autonomous driving perception. Recent end-to-end query-based trackers simultaneously detect and track objects, which have shown promising potential for the 3D MOT task. However, existing methods overlook the uncertainty issue, which refers to the lack of precise confidence about the state and location of tracked objects. Uncertainty arises owing to various factors during motion observation by cameras, especially occlusions and the small size of target objects, resulting in an inaccurate estimation of the object's position, label, and identity. To this end, we propose an Uncertainty-Aware 3D MOT framework, UA-Track, which tackles the uncertainty problem from multiple aspects. Specifically, we first introduce an Uncertainty-aware Probabilistic Decoder to capture the uncertainty in object prediction with probabilistic attention. Secondly, we propose an Uncertainty-guided Query Denoising strategy to further enhance the training process. We also utilize Uncertainty-reduced Query Initialization, which leverages predicted 2D object location and depth information to reduce query uncertainty. As a result, our UA-Track achieves state-of-the-art performance on the nuScenes benchmark, i.e., 66.3% AMOTA on the test split, surpassing the previous best end-to-end solution by a significant margin of 8.9% AMOTA.
   Submitted 4 June, 2024; originally announced June 2024.

12. arXiv:2404.14808 [cs.CV]
   Visual-Augmented Dynamic Semantic Prototype for Generative Zero-Shot Learning
   Authors: Wenjin Hou, Shiming Chen, Shuhuang Chen, Ziming Hong, Yan Wang, Xuetao Feng, Salman Khan, Fahad Shahbaz Khan, Xinge You
   Abstract: Generative Zero-shot learning (ZSL) learns a generator to synthesize visual samples for unseen classes, which is an effective way to advance ZSL. However, existing generative methods rely on the conditions of Gaussian noise and the predefined semantic prototype, which limit the generator only optimized on specific seen classes rather than characterizing each visual instance, resulting in poor generalizations (e.g., overfitting to seen classes). To address this issue, we propose a novel Visual-Augmented Dynamic Semantic prototype method (termed VADS) to boost the generator to learn accurate semantic-visual mapping by fully exploiting the visual-augmented knowledge into semantic conditions. In detail, VADS consists of two modules: (1) Visual-aware Domain Knowledge Learning module (VDKL) learns the local bias and global prior of the visual features (referred to as domain visual knowledge), which replace pure Gaussian noise to provide richer prior noise information; (2) Vision-Oriented Semantic Updation module (VOSU) updates the semantic prototype according to the visual representations of the samples.
Ultimately, we concatenate their output as a dynamic semantic prototype, which serves as the condition of the generator. Extensive experiments demonstrate that our VADS achieves superior CZSL and GZSL performances on three prominent datasets and outperforms other state-of-the-art methods with averaging increases by 6.4\%, 5.9\% and 4.2\% on SUN, CUB and AWA2, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14808v1-abstract-full').style.display = 'none'; document.getElementById('2404.14808v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07713">arXiv:2404.07713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.07713">pdf</a>, <a href="https://arxiv.org/format/2404.07713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenjin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+S">Salman Khan</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+F+S">Fahad Shahbaz Khan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07713v2-abstract-short" style="display: inline;"> Zero-shot learning (ZSL) recognizes the unseen classes by conducting visual-semantic interactions to transfer semantic knowledge from seen classes to unseen ones, supported by semantic information (e.g., attributes). However, existing ZSL methods simply extract visual features using a pre-trained network backbone (i.e., CNN or ViT), which fail to learn matched visual-semantic correspondences for r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07713v2-abstract-full').style.display = 'inline'; document.getElementById('2404.07713v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07713v2-abstract-full" style="display: none;"> Zero-shot learning (ZSL) recognizes the unseen classes by conducting visual-semantic interactions to transfer semantic knowledge from seen classes to unseen ones, supported by semantic information (e.g., attributes). However, existing ZSL methods simply extract visual features using a pre-trained network backbone (i.e., CNN or ViT), which fail to learn matched visual-semantic correspondences for representing semantic-related visual features as lacking of the guidance of semantic information, resulting in undesirable visual-semantic interactions. 
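To make the conditioning scheme concrete, here is a minimal sketch of a generator conditioned on the concatenation of a visually informed noise prior and an updated semantic prototype, as the abstract describes. The module names, layer sizes, and random stand-ins for the VDKL/VOSU outputs are assumptions for illustration, not the VADS code.

```python
import torch
import torch.nn as nn

class ConditionalGenerator(nn.Module):
    """Toy conditional generator: maps [noise ; dynamic prototype] to a visual feature."""
    def __init__(self, noise_dim=64, proto_dim=85, feat_dim=2048):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(noise_dim + proto_dim, 1024), nn.LeakyReLU(0.2),
            nn.Linear(1024, feat_dim), nn.ReLU())

    def forward(self, noise, prototype):
        # The concatenated vector plays the role of the dynamic semantic prototype condition.
        return self.net(torch.cat([noise, prototype], dim=-1))

batch, noise_dim, proto_dim = 8, 64, 85
visual_noise = torch.randn(batch, noise_dim)    # stand-in for a VDKL-style noise prior
updated_proto = torch.randn(batch, proto_dim)   # stand-in for a VOSU-updated prototype
fake_features = ConditionalGenerator()(visual_noise, updated_proto)
print(fake_features.shape)  # torch.Size([8, 2048])
```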
arXiv:2404.07713 [pdf, other] (cs.CV, cs.LG)
Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning
Authors: Shiming Chen, Wenjin Hou, Salman Khan, Fahad Shahbaz Khan
Abstract: Zero-shot learning (ZSL) recognizes unseen classes by conducting visual-semantic interactions that transfer semantic knowledge from seen classes to unseen ones, supported by semantic information (e.g., attributes). However, existing ZSL methods simply extract visual features using a pre-trained network backbone (i.e., CNN or ViT), which fails to learn matched visual-semantic correspondences for representing semantic-related visual features, as it lacks the guidance of semantic information, resulting in undesirable visual-semantic interactions. To tackle this issue, we propose a progressive semantic-guided vision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly considers two properties in the whole network: i) discovering the semantic-related visual representations explicitly, and ii) discarding the semantic-unrelated visual information. Specifically, we first introduce semantic-embedded token learning to improve the visual-semantic correspondences via semantic enhancement and to discover the semantic-related visual tokens explicitly with semantic-guided token attention. Then, we fuse visual tokens with low semantic-visual correspondence to discard the semantic-unrelated visual information for visual enhancement. These two operations are integrated into various encoders to progressively learn semantic-related visual representations for accurate visual-semantic interactions in ZSL. Extensive experiments show that our ZSLViT achieves significant performance gains on three popular benchmark datasets, i.e., CUB, SUN, and AWA2. Codes are available at: https://github.com/shiming-chen/ZSLViT
Submitted 22 July, 2024; v1 submitted 11 April, 2024; originally announced April 2024.
Comments: Accepted to CVPR'24
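The following is a minimal sketch of the general idea of semantic-guided token selection and fusion: score visual tokens against a class-semantic vector, keep the high-scoring tokens, and merge the low-correspondence ones into a single fused token. The scoring rule, keep ratio, and shapes are assumptions for illustration; the authors' actual implementation is in the repository linked above.

```python
import torch
import torch.nn.functional as F

def semantic_guided_token_select(tokens, semantic, keep_ratio=0.5):
    """tokens: (N, D) visual tokens; semantic: (D,) class-semantic embedding.
    Keeps the tokens most aligned with the semantic vector and fuses the rest."""
    scores = F.cosine_similarity(tokens, semantic.unsqueeze(0), dim=-1)  # (N,)
    k = max(1, int(tokens.size(0) * keep_ratio))
    keep_idx = scores.topk(k).indices
    drop_mask = torch.ones(tokens.size(0), dtype=torch.bool)
    drop_mask[keep_idx] = False
    kept = tokens[keep_idx]
    if drop_mask.any():
        # Weighted fusion of the low-correspondence tokens into one token.
        w = torch.softmax(scores[drop_mask], dim=0).unsqueeze(-1)
        fused = (w * tokens[drop_mask]).sum(dim=0, keepdim=True)
        kept = torch.cat([kept, fused], dim=0)
    return kept

tokens = torch.randn(196, 768)   # hypothetical ViT patch tokens
semantic = torch.randn(768)      # hypothetical projected attribute vector
print(semantic_guided_token_select(tokens, semantic).shape)
```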
arXiv:2404.04617 [pdf, other] (cs.CV)
Empowering Image Recovery: A Multi-Attention Approach
Authors: Juan Wen, Yawei Li, Chao Zhang, Weiyan Hou, Radu Timofte, Luc Van Gool
Abstract: We propose Diverse Restormer (DART), a novel image restoration method that effectively integrates information from various sources (long sequences, local and global regions, feature dimensions, and positional dimensions) to address restoration challenges. While Transformer models have demonstrated excellent performance in image restoration due to their self-attention mechanism, they face limitations in complex scenarios. Leveraging recent advancements in Transformers and various attention mechanisms, our method utilizes customized attention mechanisms to enhance overall performance. DART, our novel network architecture, employs windowed attention to mimic the selective focusing mechanism of human eyes. By dynamically adjusting receptive fields, it optimally captures the fundamental features crucial for image resolution reconstruction. Efficiency and performance balance are achieved through the LongIR attention mechanism for long-sequence image restoration. Integration of attention mechanisms across feature and positional dimensions further enhances the recovery of fine details. Evaluation across five restoration tasks consistently positions DART at the forefront. Upon acceptance, we commit to providing publicly accessible code and models to ensure reproducibility and facilitate further research.
Submitted 9 April, 2024; v1 submitted 6 April, 2024; originally announced April 2024.
Comments: 12 pages, 10 figures, 12 tables
MSC Class: 68T07 (Primary), 68T45 (Secondary); ACM Class: I.4.4

arXiv:2403.00894 [pdf, other] (cs.SE, cs.AI, cs.CL, cs.PL)
Comparing large language models and human programmers for generating programming code
Authors: Wenpin Hou, Zhicheng Ji
Abstract: We systematically evaluated the performance of seven large language models in generating programming code using various prompt strategies, programming languages, and task difficulties. GPT-4 substantially outperforms other large language models, including Gemini Ultra and Claude 2. The coding performance of GPT-4 varies considerably with different prompt strategies. In most LeetCode and GeeksforGeeks coding contests evaluated in this study, GPT-4, employing the optimal prompt strategy, outperforms 85 percent of human participants. Additionally, GPT-4 demonstrates strong capabilities in translating code between different programming languages and in learning from past errors. The computational efficiency of the code generated by GPT-4 is comparable to that of human programmers. These results suggest that GPT-4 has the potential to serve as a reliable assistant in programming code generation and software development.
Submitted 4 October, 2024; v1 submitted 1 March, 2024; originally announced March 2024.
arXiv:2402.12844 [pdf, other] (cs.CV, cs.CL)
ICON: Improving Inter-Report Consistency in Radiology Report Generation via Lesion-aware Mixup Augmentation
Authors: Wenjun Hou, Yi Cheng, Kaishuai Xu, Yan Hu, Wenjie Li, Jiang Liu
Abstract: Previous research on radiology report generation has made significant progress in terms of increasing the clinical accuracy of generated reports. In this paper, we emphasize another crucial quality that it should possess, i.e., inter-report consistency, which refers to the capability of generating consistent reports for semantically equivalent radiographs. This quality is even of greater significance than the overall report accuracy in terms of ensuring the system's credibility, as a system prone to providing conflicting results would severely erode users' trust. Regrettably, existing approaches struggle to maintain inter-report consistency, exhibiting biases towards common patterns and susceptibility to lesion variants. To address this issue, we propose ICON, which improves the inter-report consistency of radiology report generation. Aiming to enhance the system's ability to capture similarities in semantically equivalent lesions, our approach first involves extracting lesions from input images and examining their characteristics. Then, we introduce a lesion-aware mixup technique to ensure that the representations of the semantically equivalent lesions align with the same attributes, achieved through a linear combination during the training phase. Extensive experiments on three publicly available chest X-ray datasets verify the effectiveness of our approach, both in terms of improving the consistency and accuracy of the generated reports.
Submitted 26 September, 2024; v1 submitted 20 February, 2024; originally announced February 2024.
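The "linear combination during the training phase" is the standard mixup recipe. Here is a generic sketch of mixing two lesion representations and their attribute targets with a Beta-sampled coefficient; the dimensions, attribute vectors, and alpha value are assumptions, and this is not the exact ICON formulation.

```python
import torch

def mixup_representations(h_a, h_b, y_a, y_b, alpha=0.4):
    """Generic mixup: convex combination of two lesion representations and
    their attribute targets, with lam ~ Beta(alpha, alpha)."""
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    h_mixed = lam * h_a + (1.0 - lam) * h_b
    y_mixed = lam * y_a + (1.0 - lam) * y_b
    return h_mixed, y_mixed, lam

h_a, h_b = torch.randn(256), torch.randn(256)   # representations of two
y_a = torch.tensor([1.0, 0.0, 0.0])             # semantically equivalent lesions
y_b = torch.tensor([1.0, 0.0, 0.0])             # (hypothetical attribute vectors)
h_m, y_m, lam = mixup_representations(h_a, h_b, y_a, y_b)
print(lam, h_m.shape, y_m)
```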
arXiv:2401.06541 [pdf, other] (cs.CL, cs.AI)
Medical Dialogue Generation via Intuitive-then-Analytical Differential Diagnosis
Authors: Kaishuai Xu, Wenjun Hou, Yi Cheng, Jian Wang, Wenjie Li
Abstract: Medical dialogue systems have attracted growing research attention as they have the potential to provide rapid diagnoses, treatment plans, and health consultations. In medical dialogues, a proper diagnosis is crucial as it establishes the foundation for future consultations. Clinicians typically employ both intuitive and analytic reasoning to formulate a differential diagnosis. This reasoning process hypothesizes and verifies a variety of possible diseases and strives to generate a comprehensive and rigorous diagnosis. However, recent studies on medical dialogue generation have overlooked the significance of modeling a differential diagnosis, which hinders the practical application of these systems. To address the above issue, we propose a medical dialogue generation framework with Intuitive-then-Analytic Differential Diagnosis (IADDx). Our method starts with a differential diagnosis via retrieval-based intuitive association and subsequently refines it through a graph-enhanced analytic procedure. The resulting differential diagnosis is then used to retrieve medical knowledge and guide response generation. Experimental results on two datasets validate the efficacy of our method. Besides, we demonstrate how our framework assists both clinicians and patients in understanding the diagnostic process, for instance, by producing intermediate results and graph-based diagnosis paths.
Submitted 12 January, 2024; originally announced January 2024.
Comments: Work in progress

arXiv:2312.11111 [pdf, other] (cs.AI, cs.CL, cs.HC)
The Good, The Bad, and Why: Unveiling Emotions in Generative AI
Authors: Cheng Li, Jindong Wang, Yixuan Zhang, Kaijie Zhu, Xinyi Wang, Wenxin Hou, Jianxun Lian, Fang Luo, Qiang Yang, Xing Xie
Abstract: Emotion significantly impacts our daily behaviors and interactions. While recent generative AI models, such as large language models, have shown impressive performance in various tasks, it remains unclear whether they truly comprehend emotions. This paper aims to address this gap by incorporating psychological theories to gain a holistic understanding of emotions in generative AI models. Specifically, we propose three approaches: 1) EmotionPrompt to enhance AI model performance, 2) EmotionAttack to impair AI model performance, and 3) EmotionDecode to explain the effects of emotional stimuli, both benign and malignant. Through extensive experiments involving language and multi-modal models on semantic understanding, logical reasoning, and generation tasks, we demonstrate that both textual and visual EmotionPrompt can boost the performance of AI models while EmotionAttack can hinder it. Additionally, EmotionDecode reveals that AI models can comprehend emotional stimuli akin to the mechanism of dopamine in the human brain. Our work heralds a novel avenue for exploring psychology to enhance our understanding of generative AI models.
Submitted 7 June, 2024; v1 submitted 18 December, 2023; originally announced December 2023.
Comments: International Conference on Machine Learning (ICML) 2024; an extension to EmotionPrompt (arXiv:2307.11760)

arXiv:2311.16832 [pdf, other] (cs.CL, cs.AI)
CharacterGLM: Customizing Chinese Conversational AI Characters with Large Language Models
Authors: Jinfeng Zhou, Zhuang Chen, Dazhen Wan, Bosi Wen, Yi Song, Jifan Yu, Yongkang Huang, Libiao Peng, Jiaming Yang, Xiyao Xiao, Sahand Sabour, Xiaohan Zhang, Wenjing Hou, Yijia Zhang, Yuxiao Dong, Jie Tang, Minlie Huang
Abstract: In this paper, we present CharacterGLM, a series of models built upon ChatGLM, with model sizes ranging from 6B to 66B parameters. Our CharacterGLM is designed for generating Character-based Dialogues (CharacterDial), which aims to equip a conversational AI system with character customization for satisfying people's inherent social desires and emotional needs. On top of CharacterGLM, we can customize various AI characters or social agents by configuring their attributes (identities, interests, viewpoints, experiences, achievements, social relationships, etc.) and behaviors (linguistic features, emotional expressions, interaction patterns, etc.). Our model outperforms most mainstream closed-source large language models, including the GPT series, especially in terms of consistency, human-likeness, and engagement according to manual evaluations. We will release our 6B version of CharacterGLM and a subset of training data to facilitate further research development in the direction of character-based dialogue generation.
Submitted 28 November, 2023; originally announced November 2023.
Comments: Work in progress

arXiv:2311.05050 [pdf, other] (cs.LG, quant-ph)
Quantum Generative Modeling of Sequential Data with Trainable Token Embedding
Authors: Wanda Hou, Miao Li, Yi-Zhuang You
Abstract: Generative models are a class of machine learning models that aim to learn the underlying probability distribution of data. Unlike discriminative models, generative models focus on capturing the data's inherent structure, allowing them to generate new samples that resemble the original data. To fully exploit the potential of modeling probability distributions using quantum physics, a quantum-inspired generative model known as the Born machine has shown great advancement in learning classical and quantum data over the matrix product state (MPS) framework. Born machines support tractable log-likelihood, autoregressive and mask sampling, and have shown outstanding performance in various unsupervised learning tasks. However, much of the current research has been centered on improving the expressive power of MPS, predominantly embedding each token directly by a corresponding tensor index. In this study, we generalize the embedding method to trainable quantum measurement operators that can be honed simultaneously with the MPS. Our study indicates that, combined with trainable embedding, Born machines can exhibit better performance and learn deeper correlations from the dataset.
Submitted 8 November, 2023; originally announced November 2023.
Comments: 5 pages, 4 figures
arXiv:2310.13864 [pdf, other] (cs.CL)
RECAP: Towards Precise Radiology Report Generation via Dynamic Disease Progression Reasoning
Authors: Wenjun Hou, Yi Cheng, Kaishuai Xu, Wenjie Li, Jiang Liu
Abstract: Automating radiology report generation can significantly alleviate radiologists' workloads. Previous research has primarily focused on realizing highly concise observations while neglecting the precise attributes that determine the severity of diseases (e.g., small pleural effusion). Since incorrect attributes will lead to imprecise radiology reports, strengthening the generation process with precise attribute modeling becomes necessary. Additionally, the temporal information contained in historical records, which is crucial in evaluating a patient's current condition (e.g., heart size is unchanged), has also been largely disregarded. To address these issues, we propose RECAP, which generates precise and accurate radiology reports via dynamic disease progression reasoning. Specifically, RECAP first predicts the observations and progressions (i.e., spatiotemporal information) given two consecutive radiographs. It then combines the historical records, spatiotemporal information, and radiographs for report generation, where a disease progression graph and a dynamic progression reasoning mechanism are devised to accurately select the attributes of each observation and progression. Extensive experiments on two publicly available datasets demonstrate the effectiveness of our model.
Submitted 20 October, 2023; originally announced October 2023.
Comments: Accepted by Findings of EMNLP 2023

arXiv:2309.02182 [pdf, other] (cs.SE)
DOI: 10.1109/ICSME55016.2022.00080
Using a Nearest-Neighbour, BERT-Based Approach for Scalable Clone Detection
Authors: Muslim Chochlov, Gul Aftab Ahmed, James Vincent Patten, Guoxian Lu, Wei Hou, David Gregg, Jim Buckley
Abstract: Code clones can detrimentally impact software maintenance, and manually detecting them in very large codebases is impractical. Additionally, automated approaches find detection of Type 3 and Type 4 (inexact) clones very challenging. While the most recent artificial deep neural networks (for example, BERT-based artificial neural networks) seem to be highly effective in detecting such clones, their pairwise comparison of every code pair in the target system(s) is inefficient and scales poorly on large codebases. We therefore introduce SSCD, a BERT-based clone detection approach that targets high recall of Type 3 and Type 4 clones at scale (in line with our industrial partner's requirements). It does so by computing a representative embedding for each code fragment and finding similar fragments using a nearest-neighbour search. SSCD thus avoids the pairwise-comparison bottleneck of other neural network approaches while also using parallel, GPU-accelerated search to tackle scalability. This paper details the approach and an empirical assessment of configuring and evaluating that approach in an industrial setting. The configuration analysis suggests that shorter input lengths and text-only based neural network models demonstrate better efficiency in SSCD, while only slightly decreasing effectiveness. The evaluation results suggest that SSCD is more effective than state-of-the-art approaches like SAGA and SourcererCC. It is also highly efficient: in its optimal setting, SSCD effectively locates clones in the entire 320 million LOC BigCloneBench (a standard clone detection benchmark) in just under three hours.
Submitted 5 September, 2023; originally announced September 2023.
Comments: 10 pages, 2 figures, 38th IEEE International Conference on Software Maintenance and Evolution
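The embed-then-search pattern described in the SSCD abstract can be sketched as follows: encode each fragment with a BERT-style model and query a nearest-neighbour index instead of comparing all pairs. The checkpoint, pooling choice, and use of scikit-learn's exact search are stand-ins for illustration, not SSCD's configuration (which relies on GPU-accelerated search).

```python
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.neighbors import NearestNeighbors

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")  # any BERT-style encoder
encoder = AutoModel.from_pretrained("microsoft/codebert-base")

def embed(fragments, max_len=128):
    """One mean-pooled vector per code fragment."""
    enc = tokenizer(fragments, padding=True, truncation=True,
                    max_length=max_len, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**enc).last_hidden_state          # (B, T, H)
    mask = enc["attention_mask"].unsqueeze(-1)
    return ((hidden * mask).sum(1) / mask.sum(1)).numpy()  # (B, H)

fragments = [
    "int add(int a, int b) { return a + b; }",
    "int sum(int x, int y) { return x + y; }",
    "void log(const char *msg) { printf(\"%s\", msg); }",
]
X = embed(fragments)
# Nearest-neighbour search over embeddings instead of O(n^2) pairwise comparison.
nn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(X)
dist, idx = nn.kneighbors(X)
for i, (d, j) in enumerate(zip(dist[:, 1], idx[:, 1])):
    print(f"fragment {i} ~ fragment {j} (cosine distance {d:.3f})")
```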
arXiv:2308.09915 [pdf, other] (cs.CV, cs.LG)
EGANS: Evolutionary Generative Adversarial Network Search for Zero-Shot Learning
Authors: Shiming Chen, Shihuang Chen, Wenjin Hou, Weiping Ding, Xinge You
Abstract: Zero-shot learning (ZSL) aims to recognize novel classes for which no samples can be collected to train a prediction model. Accordingly, generative models (e.g., the generative adversarial network (GAN)) are typically used to synthesize visual samples conditioned on class semantic vectors, and they have achieved remarkable progress for ZSL. However, existing GAN-based generative ZSL methods are based on hand-crafted models, which cannot adapt to various datasets/scenarios and fail to address model instability. To alleviate these challenges, we propose evolutionary generative adversarial network search (termed EGANS) to automatically design a generative network with good adaptation and stability, enabling reliable visual feature sample synthesis for advancing ZSL. Specifically, we adopt cooperative dual evolution to conduct a neural architecture search for both the generator and the discriminator under a unified evolutionary adversarial framework. EGANS is learned in two stages: evolution generator architecture search and evolution discriminator architecture search. During the evolution generator architecture search, we adopt a many-to-one adversarial training strategy to evolutionarily search for the optimal generator. Then the optimal generator is further applied to search for the optimal discriminator in the evolution discriminator architecture search with a similar evolutionary search algorithm. Once the optimal generator and discriminator are found, we plug them into various generative ZSL baselines for ZSL classification. Extensive experiments show that EGANS consistently improves existing generative ZSL methods on the standard CUB, SUN, AWA2 and FLO datasets. The significant performance gains indicate that evolutionary neural architecture search explores a virgin field in ZSL.
Submitted 19 August, 2023; originally announced August 2023.
Comments: Accepted to TEVC
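For readers unfamiliar with evolutionary architecture search, the loop below sketches the general mutate-evaluate-select skeleton that such a search implies. The genome encoding, operation set, and fitness function are placeholders; EGANS's actual fitness comes from adversarial training of each candidate, which is not reproduced here.

```python
import random

OPS = ["linear", "conv", "residual", "attention"]

def mutate(genome):
    """Flip one architecture choice at random."""
    child = list(genome)
    child[random.randrange(len(child))] = random.choice(OPS)
    return child

def fitness(genome):
    """Placeholder fitness; in practice this would train and evaluate the
    candidate generator (or discriminator) adversarially."""
    return -abs(len(set(genome)) - 3) + random.random() * 0.1

def evolve(pop_size=8, genome_len=5, generations=10):
    population = [[random.choice(OPS) for _ in range(genome_len)]
                  for _ in range(pop_size)]
    for _ in range(generations):
        scored = sorted(population, key=fitness, reverse=True)
        parents = scored[: pop_size // 2]                       # selection
        children = [mutate(random.choice(parents))
                    for _ in range(pop_size - len(parents))]    # variation
        population = parents + children
    return max(population, key=fitness)

print(evolve())  # best architecture genome found under the toy fitness
```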
arXiv:2308.05421 [pdf, other] (cs.CV, cs.MM)
Progressive Spatio-temporal Perception for Audio-Visual Question Answering
Authors: Guangyao Li, Wenxuan Hou, Di Hu
Abstract: The Audio-Visual Question Answering (AVQA) task aims to answer questions about different visual objects, sounds, and their associations in videos. Such naturally multi-modal videos are composed of rich and complex dynamic audio-visual components, most of which could be unrelated to the given question or may even act as interference when answering the content of interest. Conversely, focusing only on the question-aware audio-visual content removes this influence while enabling the model to answer more efficiently. In this paper, we propose a Progressive Spatio-Temporal Perception Network (PSTP-Net), which contains three modules that progressively identify key spatio-temporal regions w.r.t. questions. Specifically, a temporal segment selection module is first introduced to select the audio-visual segments most relevant to the given question. Then, a spatial region selection module is utilized to choose, from the selected temporal segments, the regions most relevant to the question. To further refine the selection of features, an audio-guided visual attention module is employed to perceive the association between audio and the selected spatial regions. Finally, the spatio-temporal features from these modules are integrated to answer the question. Extensive experimental results on the public MUSIC-AVQA and AVQA datasets provide compelling evidence of the effectiveness and efficiency of PSTP-Net. Code is available at: https://github.com/GeWu-Lab/PSTP-Net
Submitted 10 August, 2023; originally announced August 2023.
Comments: Accepted by ACM MM 2023
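As a minimal illustration of question-guided temporal segment selection, the sketch below scores per-segment features against a pooled question embedding and keeps the top-k segments. The feature shapes and cosine-similarity scoring are assumptions for illustration; the authors' implementation is in the repository linked above.

```python
import torch
import torch.nn.functional as F

def select_topk_segments(segment_feats, question_feat, k=3):
    """Pick the k audio-visual segments most similar to the question embedding.
    segment_feats: (T, D); question_feat: (D,). Returns indices and features."""
    sims = F.cosine_similarity(segment_feats, question_feat.unsqueeze(0), dim=-1)  # (T,)
    topk = sims.topk(k)
    return topk.indices, segment_feats[topk.indices]

segments = torch.randn(60, 512)      # hypothetical per-segment fused A/V features
question = torch.randn(512)          # hypothetical pooled question embedding
idx, selected = select_topk_segments(segments, question, k=5)
print(idx.tolist(), selected.shape)  # indices of the 5 most question-relevant segments
```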
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04370v1-abstract-full').style.display = 'none'; document.getElementById('2308.04370v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages with 8 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.11760">arXiv:2307.11760</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.11760">pdf</a>, <a href="https://arxiv.org/format/2307.11760">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models Understand and Can be Enhanced by Emotional Stimuli </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jindong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yixuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kaijie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenxin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+J">Jianxun Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+F">Fang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Q">Qiang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xing Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.11760v7-abstract-short" style="display: inline;"> Emotional intelligence significantly impacts our daily behaviors and interactions. Although Large Language Models (LLMs) are increasingly viewed as a stride toward artificial general intelligence, exhibiting impressive performance in numerous tasks, it is still uncertain if LLMs can genuinely grasp psychological emotional stimuli. Understanding and responding to emotional cues gives humans a disti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11760v7-abstract-full').style.display = 'inline'; document.getElementById('2307.11760v7-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.11760v7-abstract-full" style="display: none;"> Emotional intelligence significantly impacts our daily behaviors and interactions. 
Although Large Language Models (LLMs) are increasingly viewed as a stride toward artificial general intelligence, exhibiting impressive performance in numerous tasks, it is still uncertain if LLMs can genuinely grasp psychological emotional stimuli. Understanding and responding to emotional cues gives humans a distinct advantage in problem-solving. In this paper, we take the first step towards exploring the ability of LLMs to understand emotional stimuli. To this end, we first conduct automatic experiments on 45 tasks using various LLMs, including Flan-T5-Large, Vicuna, Llama 2, BLOOM, ChatGPT, and GPT-4. Our tasks span deterministic and generative applications that represent comprehensive evaluation scenarios. Our automatic experiments show that LLMs have a grasp of emotional intelligence, and their performance can be improved with emotional prompts (which we call &#34;EmotionPrompt&#34; that combines the original prompt with emotional stimuli), e.g., 8.00% relative performance improvement in Instruction Induction and 115% in BIG-Bench. In addition to those deterministic tasks that can be automatically evaluated using existing metrics, we conducted a human study with 106 participants to assess the quality of generative tasks using both vanilla and emotional prompts. Our human study results demonstrate that EmotionPrompt significantly boosts the performance of generative tasks (10.9% average improvement in terms of performance, truthfulness, and responsibility metrics). We provide an in-depth discussion regarding why EmotionPrompt works for LLMs and the factors that may influence its performance. We posit that EmotionPrompt heralds a novel avenue for exploring interdisciplinary knowledge for human-LLMs interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11760v7-abstract-full').style.display = 'none'; document.getElementById('2307.11760v7-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report; updated the std error for human study; short version (v1) was accepted by LLM@IJCAI&#39;23; 32 pages; more work: https://llm-enhance.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04427">arXiv:2307.04427</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.04427">pdf</a>, <a href="https://arxiv.org/format/2307.04427">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="High Energy Astrophysical Phenomena">astro-ph.HE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1126/science.adc9818">10.1126/science.adc9818 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Observation of high-energy neutrinos from the Galactic plane </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abbasi%2C+R">R. Abbasi</a>, <a href="/search/cs?searchtype=author&amp;query=Ackermann%2C+M">M. Ackermann</a>, <a href="/search/cs?searchtype=author&amp;query=Adams%2C+J">J. Adams</a>, <a href="/search/cs?searchtype=author&amp;query=Aguilar%2C+J+A">J. A. Aguilar</a>, <a href="/search/cs?searchtype=author&amp;query=Ahlers%2C+M">M. Ahlers</a>, <a href="/search/cs?searchtype=author&amp;query=Ahrens%2C+M">M. Ahrens</a>, <a href="/search/cs?searchtype=author&amp;query=Alameddine%2C+J+M">J. M. Alameddine</a>, <a href="/search/cs?searchtype=author&amp;query=Alves%2C+A+A">A. A. Alves Jr.</a>, <a href="/search/cs?searchtype=author&amp;query=Amin%2C+N+M">N. M. Amin</a>, <a href="/search/cs?searchtype=author&amp;query=Andeen%2C+K">K. Andeen</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+T">T. Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Anton%2C+G">G. Anton</a>, <a href="/search/cs?searchtype=author&amp;query=Arg%C3%BCelles%2C+C">C. Arg眉elles</a>, <a href="/search/cs?searchtype=author&amp;query=Ashida%2C+Y">Y. Ashida</a>, <a href="/search/cs?searchtype=author&amp;query=Athanasiadou%2C+S">S. Athanasiadou</a>, <a href="/search/cs?searchtype=author&amp;query=Axani%2C+S">S. Axani</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+X">X. Bai</a>, <a href="/search/cs?searchtype=author&amp;query=V.%2C+A+B">A. Balagopal V.</a>, <a href="/search/cs?searchtype=author&amp;query=Barwick%2C+S+W">S. W. Barwick</a>, <a href="/search/cs?searchtype=author&amp;query=Basu%2C+V">V. Basu</a>, <a href="/search/cs?searchtype=author&amp;query=Baur%2C+S">S. Baur</a>, <a href="/search/cs?searchtype=author&amp;query=Bay%2C+R">R. Bay</a>, <a href="/search/cs?searchtype=author&amp;query=Beatty%2C+J+J">J. J. Beatty</a>, <a href="/search/cs?searchtype=author&amp;query=Becker%2C+K+-">K. -H. Becker</a>, <a href="/search/cs?searchtype=author&amp;query=Tjus%2C+J+B">J. Becker Tjus</a> , et al. 
(364 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04427v1-abstract-short" style="display: inline;"> The origin of high-energy cosmic rays, atomic nuclei that continuously impact Earth&#39;s atmosphere, has been a mystery for over a century. Due to deflection in interstellar magnetic fields, cosmic rays from the Milky Way arrive at Earth from random directions. However, near their sources and during propagation, cosmic rays interact with matter and produce high-energy neutrinos. We search for neutrin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04427v1-abstract-full').style.display = 'inline'; document.getElementById('2307.04427v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04427v1-abstract-full" style="display: none;"> The origin of high-energy cosmic rays, atomic nuclei that continuously impact Earth&#39;s atmosphere, has been a mystery for over a century. Due to deflection in interstellar magnetic fields, cosmic rays from the Milky Way arrive at Earth from random directions. However, near their sources and during propagation, cosmic rays interact with matter and produce high-energy neutrinos. We search for neutrino emission using machine learning techniques applied to ten years of data from the IceCube Neutrino Observatory. We identify neutrino emission from the Galactic plane at the 4.5$σ$ level of significance, by comparing diffuse emission models to a background-only hypothesis. The signal is consistent with modeled diffuse emission from the Galactic plane, but could also arise from a population of unresolved point sources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04427v1-abstract-full').style.display = 'none'; document.getElementById('2307.04427v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted on May 12th, 2022; Accepted on May 4th, 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Science 380, 6652, 1338-1343 (2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.09431">arXiv:2306.09431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.09431">pdf</a>, <a href="https://arxiv.org/format/2306.09431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Towards Long Form Audio-visual Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenxuan Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guangyao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yapeng Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+D">Di Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.09431v1-abstract-short" style="display: inline;"> We live in a world filled with never-ending streams of multimodal information. As a more natural recording of the real scenario, long form audio-visual videos are expected as an important bridge for better exploring and understanding the world. In this paper, we propose the multisensory temporal event localization task in long form videos and strive to tackle the associated challenges. To facilita&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09431v1-abstract-full').style.display = 'inline'; document.getElementById('2306.09431v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.09431v1-abstract-full" style="display: none;"> We live in a world filled with never-ending streams of multimodal information. As a more natural recording of the real scenario, long form audio-visual videos are expected as an important bridge for better exploring and understanding the world. In this paper, we propose the multisensory temporal event localization task in long form videos and strive to tackle the associated challenges. To facilitate this study, we first collect a large-scale Long Form Audio-visual Video (LFAV) dataset with 5,175 videos and an average video length of 210 seconds. Each of the collected videos is elaborately annotated with diversified modality-aware events, in a long-range temporal sequence. We then propose an event-centric framework for localizing multisensory events as well as understanding their relations in long form videos. It includes three phases in different levels: snippet prediction phase to learn snippet features, event extraction phase to extract event-level features, and event interaction phase to study event relations. Experiments demonstrate that the proposed method, utilizing the new LFAV dataset, exhibits considerable effectiveness in localizing multiple modality-aware events within long form videos. 
Project website: http://gewu-lab.github.io/LFAV/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09431v1-abstract-full').style.display = 'none'; document.getElementById('2306.09431v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06931">arXiv:2306.06931</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.06931">pdf</a>, <a href="https://arxiv.org/format/2306.06931">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evolving Semantic Prototype Improves Generative Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenjin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+Z">Ziming Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+X">Xiaohan Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yibing Song</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+X">Xinge You</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06931v1-abstract-short" style="display: inline;"> In zero-shot learning (ZSL), generative methods synthesize class-related sample features based on predefined semantic prototypes. They advance the ZSL performance by synthesizing unseen class sample features for better training the classifier. We observe that each class&#39;s predefined semantic prototype (also referred to as semantic embedding or condition) does not accurately match its real semantic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06931v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06931v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06931v1-abstract-full" style="display: none;"> In zero-shot learning (ZSL), generative methods synthesize class-related sample features based on predefined semantic prototypes. They advance the ZSL performance by synthesizing unseen class sample features for better training the classifier. We observe that each class&#39;s predefined semantic prototype (also referred to as semantic embedding or condition) does not accurately match its real semantic prototype. So the synthesized visual sample features do not faithfully represent the real sample features, limiting the classifier training and existing ZSL performance. In this paper, we formulate this mismatch phenomenon as the visual-semantic domain shift problem. 
We propose a dynamic semantic prototype evolving (DSP) method to align the empirically predefined semantic prototypes and the real prototypes for class-related feature synthesis. The alignment is learned by refining sample features and semantic prototypes in a unified framework and making the synthesized visual sample features approach real sample features. After alignment, synthesized sample features from unseen classes are closer to the real sample features and benefit DSP to improve existing generative ZSL methods by 8.5%, 8.0%, and 9.7% on the standard CUB, SUN, and AWA2 datasets. This significant performance improvement indicates that the evolving semantic prototype explores a virgin field in ZSL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06931v1-abstract-full').style.display = 'none'; document.getElementById('2306.06931v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICML&#39;23</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06466">arXiv:2306.06466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.06466">pdf</a>, <a href="https://arxiv.org/format/2306.06466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ORGAN: Observation-Guided Radiology Report Generation via Tree Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenjun Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kaishuai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06466v1-abstract-short" style="display: inline;"> This paper explores the task of radiology report generation, which aims at generating free-text descriptions for a set of radiographs. One significant challenge of this task is how to correctly maintain the consistency between the images and the lengthy report. Previous research explored solving this issue through planning-based methods, which generate reports only based on high-level plans. Howev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06466v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06466v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06466v1-abstract-full" style="display: none;"> This paper explores the task of radiology report generation, which aims at generating free-text descriptions for a set of radiographs.
One significant challenge of this task is how to correctly maintain the consistency between the images and the lengthy report. Previous research explored solving this issue through planning-based methods, which generate reports only based on high-level plans. However, these plans usually only contain the major observations from the radiographs (e.g., lung opacity), lacking much necessary information, such as the observation characteristics and preliminary clinical diagnoses. To address this problem, the system should also take the image information into account together with the textual plan and perform stronger reasoning during the generation process. In this paper, we propose an observation-guided radiology report generation framework (ORGAN). It first produces an observation plan and then feeds both the plan and radiographs for report generation, where an observation graph and a tree reasoning mechanism are adopted to precisely enrich the plan information by capturing the multi-formats of each observation. Experimental results demonstrate that our framework outperforms previous state-of-the-art methods regarding text quality and clinical efficacy <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06466v1-abstract-full').style.display = 'none'; document.getElementById('2306.06466v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.04893">arXiv:2306.04893</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.04893">pdf</a>, <a href="https://arxiv.org/format/2306.04893">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Coping with Change: Learning Invariant and Minimum Sufficient Representations for Fine-Grained Visual Categorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shuo Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shujian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenjin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+X">Xinge You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.04893v3-abstract-short" style="display: inline;"> Fine-grained visual categorization (FGVC) is a challenging task due to similar visual appearances between various species. Previous studies always implicitly assume that the training and test data have the same underlying distributions, and that features extracted by modern backbone architectures remain discriminative and generalize well to unseen test data. 
However, we empirically justify that th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04893v3-abstract-full').style.display = 'inline'; document.getElementById('2306.04893v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.04893v3-abstract-full" style="display: none;"> Fine-grained visual categorization (FGVC) is a challenging task due to similar visual appearances between various species. Previous studies always implicitly assume that the training and test data have the same underlying distributions, and that features extracted by modern backbone architectures remain discriminative and generalize well to unseen test data. However, we empirically justify that these conditions are not always true on benchmark datasets. To this end, we combine the merits of invariant risk minimization (IRM) and information bottleneck (IB) principle to learn invariant and minimum sufficient (IMS) representations for FGVC, such that the overall model can always discover the most succinct and consistent fine-grained features. We apply the matrix-based Rényi&#39;s $α$-order entropy to simplify and stabilize the training of IB; we also design a &#34;soft&#34; environment partition scheme to make IRM applicable to FGVC task. To the best of our knowledge, we are the first to address the problem of FGVC from a generalization perspective and develop a new information-theoretic solution accordingly. Extensive experiments demonstrate the consistent performance gain offered by our IMS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04893v3-abstract-full').style.display = 'none'; document.getElementById('2306.04893v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Manuscript accepted by CVIU, code is available at Github</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18109">arXiv:2305.18109</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.18109">pdf</a>, <a href="https://arxiv.org/format/2305.18109">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Medical Dialogue Generation via Dual Flow Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kaishuai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenjun Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenjie Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18109v1-abstract-short" style="display: inline;"> Medical dialogue systems (MDS) aim to provide patients with medical services, such as diagnosis and prescription. Since most patients cannot precisely describe their symptoms, dialogue understanding is challenging for MDS. Previous studies mainly addressed this by extracting the mentioned medical entities as critical dialogue history information. In this work, we argue that it is also essential to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18109v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18109v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18109v1-abstract-full" style="display: none;"> Medical dialogue systems (MDS) aim to provide patients with medical services, such as diagnosis and prescription. Since most patients cannot precisely describe their symptoms, dialogue understanding is challenging for MDS. Previous studies mainly addressed this by extracting the mentioned medical entities as critical dialogue history information. In this work, we argue that it is also essential to capture the transitions of the medical entities and the doctor&#39;s dialogue acts in each turn, as they help the understanding of how the dialogue flows and enhance the prediction of the entities and dialogue acts to be adopted in the following turn. Correspondingly, we propose a Dual Flow enhanced Medical (DFMed) dialogue generation framework. It extracts the medical entities and dialogue acts used in the dialogue history and models their transitions with an entity-centric graph flow and a sequential act flow, respectively. We employ two sequential models to encode them and devise an interweaving component to enhance their interactions. Experiments on two datasets demonstrate that our method exceeds baselines in both automatic and manual evaluations. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18109v1-abstract-full').style.display = 'none'; document.getElementById('2305.18109v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as Findings of ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.12095">arXiv:2302.12095</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.12095">pdf</a>, <a href="https://arxiv.org/format/2302.12095">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On the Robustness of ChatGPT: An Adversarial and Out-of-distribution Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jindong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xixu Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenxin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+R">Runkai Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Haojun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+W">Wei Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+X">Xiubo Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+B">Binxin Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xing Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.12095v5-abstract-short" style="display: inline;"> ChatGPT is a recent chatbot service released by OpenAI and is receiving increasing attention over the past few months. While evaluations of various aspects of ChatGPT have been done, its robustness, i.e., the performance to unexpected inputs, is still unclear to the public. Robustness is of particular concern in responsible AI, especially for safety-critical applications. 
In this paper, we conduct&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12095v5-abstract-full').style.display = 'inline'; document.getElementById('2302.12095v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.12095v5-abstract-full" style="display: none;"> ChatGPT is a recent chatbot service released by OpenAI and is receiving increasing attention over the past few months. While evaluations of various aspects of ChatGPT have been done, its robustness, i.e., the performance to unexpected inputs, is still unclear to the public. Robustness is of particular concern in responsible AI, especially for safety-critical applications. In this paper, we conduct a thorough evaluation of the robustness of ChatGPT from the adversarial and out-of-distribution (OOD) perspective. To do so, we employ the AdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart review and DDXPlus medical diagnosis datasets for OOD evaluation. We select several popular foundation models as baselines. Results show that ChatGPT shows consistent advantages on most adversarial and OOD classification and translation tasks. However, the absolute performance is far from perfection, which suggests that adversarial and OOD robustness remains a significant threat to foundation models. Moreover, ChatGPT shows astounding performance in understanding dialogue-related texts and we find that it tends to provide informal suggestions for medical tasks instead of definitive answers. Finally, we present in-depth discussions of possible research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12095v5-abstract-full').style.display = 'none'; document.getElementById('2302.12095v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable Large-Scale Machine Learning Models; code is at: https://github.com/microsoft/robustlearn; more works: https://llm-eval.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.03042">arXiv:2209.03042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.03042">pdf</a>, <a href="https://arxiv.org/format/2209.03042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="High Energy Physics - Experiment">hep-ex</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Detectors">physics.ins-det</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1088/1748-0221/17/11/P11003">10.1088/1748-0221/17/11/P11003 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Graph Neural Networks for Low-Energy Event Classification &amp; Reconstruction in IceCube </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abbasi%2C+R">R. Abbasi</a>, <a href="/search/cs?searchtype=author&amp;query=Ackermann%2C+M">M. Ackermann</a>, <a href="/search/cs?searchtype=author&amp;query=Adams%2C+J">J. Adams</a>, <a href="/search/cs?searchtype=author&amp;query=Aggarwal%2C+N">N. Aggarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Aguilar%2C+J+A">J. A. Aguilar</a>, <a href="/search/cs?searchtype=author&amp;query=Ahlers%2C+M">M. Ahlers</a>, <a href="/search/cs?searchtype=author&amp;query=Ahrens%2C+M">M. Ahrens</a>, <a href="/search/cs?searchtype=author&amp;query=Alameddine%2C+J+M">J. M. Alameddine</a>, <a href="/search/cs?searchtype=author&amp;query=Alves%2C+A+A">A. A. Alves Jr.</a>, <a href="/search/cs?searchtype=author&amp;query=Amin%2C+N+M">N. M. Amin</a>, <a href="/search/cs?searchtype=author&amp;query=Andeen%2C+K">K. Andeen</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+T">T. Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Anton%2C+G">G. Anton</a>, <a href="/search/cs?searchtype=author&amp;query=Arg%C3%BCelles%2C+C">C. Arg眉elles</a>, <a href="/search/cs?searchtype=author&amp;query=Ashida%2C+Y">Y. Ashida</a>, <a href="/search/cs?searchtype=author&amp;query=Athanasiadou%2C+S">S. Athanasiadou</a>, <a href="/search/cs?searchtype=author&amp;query=Axani%2C+S">S. Axani</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+X">X. Bai</a>, <a href="/search/cs?searchtype=author&amp;query=V.%2C+A+B">A. Balagopal V.</a>, <a href="/search/cs?searchtype=author&amp;query=Baricevic%2C+M">M. Baricevic</a>, <a href="/search/cs?searchtype=author&amp;query=Barwick%2C+S+W">S. W. 
Barwick</a>, <a href="/search/cs?searchtype=author&amp;query=Basu%2C+V">V. Basu</a>, <a href="/search/cs?searchtype=author&amp;query=Bay%2C+R">R. Bay</a>, <a href="/search/cs?searchtype=author&amp;query=Beatty%2C+J+J">J. J. Beatty</a>, <a href="/search/cs?searchtype=author&amp;query=Becker%2C+K+-">K. -H. Becker</a> , et al. (359 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.03042v3-abstract-short" style="display: inline;"> IceCube, a cubic-kilometer array of optical sensors built to detect atmospheric and astrophysical neutrinos between 1 GeV and 1 PeV, is deployed 1.45 km to 2.45 km below the surface of the ice sheet at the South Pole. The classification and reconstruction of events from the in-ice detectors play a central role in the analysis of data from IceCube. Reconstructing and classifying events is a challen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03042v3-abstract-full').style.display = 'inline'; document.getElementById('2209.03042v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.03042v3-abstract-full" style="display: none;"> IceCube, a cubic-kilometer array of optical sensors built to detect atmospheric and astrophysical neutrinos between 1 GeV and 1 PeV, is deployed 1.45 km to 2.45 km below the surface of the ice sheet at the South Pole. The classification and reconstruction of events from the in-ice detectors play a central role in the analysis of data from IceCube. Reconstructing and classifying events is a challenge due to the irregular detector geometry, inhomogeneous scattering and absorption of light in the ice and, below 100 GeV, the relatively low number of signal photons produced per event. To address this challenge, it is possible to represent IceCube events as point cloud graphs and use a Graph Neural Network (GNN) as the classification and reconstruction method. The GNN is capable of distinguishing neutrino events from cosmic-ray backgrounds, classifying different neutrino event types, and reconstructing the deposited energy, direction and interaction vertex. Based on simulation, we provide a comparison in the 1-100 GeV energy range to the current state-of-the-art maximum likelihood techniques used in current IceCube analyses, including the effects of known systematic uncertainties. For neutrino event classification, the GNN increases the signal efficiency by 18% at a fixed false positive rate (FPR), compared to current IceCube methods. Alternatively, the GNN offers a reduction of the FPR by over a factor 8 (to below half a percent) at a fixed signal efficiency. For the reconstruction of energy, direction, and interaction vertex, the resolution improves by an average of 13%-20% compared to current maximum likelihood techniques in the energy range of 1-30 GeV. The GNN, when run on a GPU, is capable of processing IceCube events at a rate nearly double of the median IceCube trigger rate of 2.7 kHz, which opens the possibility of using low energy neutrinos in online searches for transient events. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03042v3-abstract-full').style.display = 'none'; document.getElementById('2209.03042v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Prepared for submission to JINST</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.10833">arXiv:2208.10833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.10833">pdf</a>, <a href="https://arxiv.org/format/2208.10833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LogLG: Weakly Supervised Log Anomaly Detection via Log-Event Graph Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Hongcheng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yuhui Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Renjie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhoujun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+T">Tieqiao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Weichao Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Liangfan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.10833v5-abstract-short" style="display: inline;"> Fully supervised log anomaly detection methods suffer the heavy burden of annotating massive unlabeled log data. Recently, many semi-supervised methods have been proposed to reduce annotation costs with the help of parsed templates. However, these methods consider each keyword independently, which disregards the correlation between keywords and the contextual relationships among log sequences. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.10833v5-abstract-full').style.display = 'inline'; document.getElementById('2208.10833v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.10833v5-abstract-full" style="display: none;"> Fully supervised log anomaly detection methods suffer the heavy burden of annotating massive unlabeled log data. 
Recently, many semi-supervised methods have been proposed to reduce annotation costs with the help of parsed templates. However, these methods consider each keyword independently, which disregards the correlation between keywords and the contextual relationships among log sequences. In this paper, we propose a novel weakly supervised log anomaly detection framework, named LogLG, to explore the semantic connections among keywords from sequences. Specifically, we design an end-to-end iterative process, where the keywords of unlabeled logs are first extracted to construct a log-event graph. Then, we build a subgraph annotator to generate pseudo labels for unlabeled log sequences. To ameliorate the annotation quality, we adopt a self-supervised task to pre-train a subgraph annotator. After that, a detection model is trained with the generated pseudo labels. Conditioned on the classification results, we re-extract the keywords from the log sequences and update the log-event graph for the next iteration. Experiments on five benchmarks validate the effectiveness of LogLG for detecting anomalies on unlabeled log data and demonstrate that LogLG, as the state-of-the-art weakly supervised method, achieves significant performance improvements compared to existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.10833v5-abstract-full').style.display = 'none'; document.getElementById('2208.10833v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.08280">arXiv:2208.08280</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.08280">pdf</a>, <a href="https://arxiv.org/format/2208.08280">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Unlabeled Data for Target-Oriented Opinion Words Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Ao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenxin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jindong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shinozaki%2C+T">Takahiro Shinozaki</a>, <a href="/search/cs?searchtype=author&amp;query=Okumura%2C+M">Manabu Okumura</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.08280v1-abstract-short" style="display: inline;"> Target-oriented Opinion Words Extraction (TOWE) is a fine-grained sentiment analysis task that aims to extract the corresponding opinion words of a given opinion target from the sentence. Recently, deep learning approaches have made remarkable progress on this task. Nevertheless, the TOWE task still suffers from the scarcity of training data due to the expensive data annotation process. Limited la&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.08280v1-abstract-full').style.display = 'inline'; document.getElementById('2208.08280v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.08280v1-abstract-full" style="display: none;"> Target-oriented Opinion Words Extraction (TOWE) is a fine-grained sentiment analysis task that aims to extract the corresponding opinion words of a given opinion target from the sentence. Recently, deep learning approaches have made remarkable progress on this task. Nevertheless, the TOWE task still suffers from the scarcity of training data due to the expensive data annotation process. Limited labeled data increase the risk of distribution shift between test data and training data. In this paper, we propose exploiting massive unlabeled data to reduce the risk by increasing the exposure of the model to varying distribution shifts. Specifically, we propose a novel Multi-Grained Consistency Regularization (MGCR) method to make use of unlabeled data and design two filters specifically for TOWE to filter noisy data at different granularity. Extensive experimental results on four TOWE benchmark datasets indicate the superiority of MGCR compared with current state-of-the-art methods. 
The in-depth analysis also demonstrates the effectiveness of the different-granularity filters. Our codes are available at https://github.com/TOWESSL/TOWESSL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.08280v1-abstract-full').style.display = 'none'; document.getElementById('2208.08280v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by COLING 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.07204">arXiv:2208.07204</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.07204">pdf</a>, <a href="https://arxiv.org/format/2208.07204">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> USB: A Unified Semi-supervised Learning Benchmark for Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yue Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+R">Ran Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenxin Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Renjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Lan-Zhe Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H">Heli Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yu-Feng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Nakamura%2C+S">Satoshi Nakamura</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+W">Wei Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Savvides%2C+M">Marios Savvides</a>, <a href="/search/cs?searchtype=author&amp;query=Raj%2C+B">Bhiksha Raj</a>, <a href="/search/cs?searchtype=author&amp;query=Shinozaki%2C+T">Takahiro Shinozaki</a>, <a href="/search/cs?searchtype=author&amp;query=Schiele%2C+B">Bernt Schiele</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jindong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.07204v2-abstract-short" style="display: inline;"> 
Semi-supervised learning (SSL) improves model generalization by leveraging massive unlabeled data to augment limited labeled samples. However, currently, popular SSL evaluation protocols are often constrained to computer vision (CV) tasks. In addition, previous work typically trains deep neural networks from scratch, which is time-consuming and environmentally unfriendly. To address the above issu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07204v2-abstract-full').style.display = 'inline'; document.getElementById('2208.07204v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.07204v2-abstract-full" style="display: none;"> Semi-supervised learning (SSL) improves model generalization by leveraging massive unlabeled data to augment limited labeled samples. However, currently, popular SSL evaluation protocols are often constrained to computer vision (CV) tasks. In addition, previous work typically trains deep neural networks from scratch, which is time-consuming and environmentally unfriendly. To address the above issues, we construct a Unified SSL Benchmark (USB) for classification by selecting 15 diverse, challenging, and comprehensive tasks from CV, natural language processing (NLP), and audio processing (Audio), on which we systematically evaluate the dominant SSL methods, and also open-source a modular and extensible codebase for fair evaluation of these SSL methods. We further provide the pre-trained versions of the state-of-the-art neural models for CV tasks to make the cost affordable for further tuning. USB enables the evaluation of a single SSL algorithm on more tasks from multiple domains but with less cost. Specifically, on a single NVIDIA V100, only 39 GPU days are required to evaluate FixMatch on 15 tasks in USB while 335 GPU days (279 GPU days on 4 CV datasets except for ImageNet) are needed on 5 CV tasks with TorchSSL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07204v2-abstract-full').style.display = 'none'; document.getElementById('2208.07204v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS&#39;22 dataset and benchmark track; code at https://github.com/microsoft/Semi-supervised-learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.12169">arXiv:2206.12169</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.12169">pdf</a>, <a href="https://arxiv.org/format/2206.12169">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AdAUC: End-to-end Adversarial AUC Optimization Against Long-tail Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hou%2C+W">Wenzheng Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qianqian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhiyong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+S">Shilong Bao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuan He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Q">Qingming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.12169v1-abstract-short" style="display: inline;"> It is well-known that deep learning models are vulnerable to adversarial examples. Existing studies of adversarial training have made great progress against this challenge. As a typical trait, they often assume that the class distribution is overall balanced. However, long-tail datasets are ubiquitous in a wide spectrum of applications, where the amount of head class instances is larger than the t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12169v1-abstract-full').style.display = 'inline'; document.getElementById('2206.12169v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.12169v1-abstract-full" style="display: none;"> It is well-known that deep learning models are vulnerable to adversarial examples. Existing studies of adversarial training have made great progress against this challenge. As a typical trait, they often assume that the class distribution is overall balanced. However, long-tail datasets are ubiquitous in a wide spectrum of applications, where the amount of head class instances is larger than the tail classes. Under such a scenario, AUC is a much more reasonable metric than accuracy since it is insensitive toward class distribution. Motivated by this, we present an early trial to explore adversarial training methods to optimize AUC. The main challenge lies in that the positive and negative examples are tightly coupled in the objective function. As a direct result, one cannot generate adversarial examples without a full scan of the dataset. To address this issue, based on a concavity regularization scheme, we reformulate the AUC optimization problem as a saddle point problem, where the objective becomes an instance-wise function. This leads to an end-to-end training protocol. 
Furthermore, we provide a convergence guarantee for the proposed algorithm. Our analysis differs from existing studies since the algorithm is asked to generate adversarial examples by calculating the gradient of a min-max problem. Finally, extensive experimental results show the performance and robustness of our algorithm on three long-tailed datasets.
Submitted 24 June, 2022; originally announced June 2022.
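As context for the saddle-point reformulation this abstract alludes to, below is a hedged sketch of the classical instance-wise min-max surrogate for the squared-loss AUC, which decouples positive and negative examples. The function name, the use of {0, 1} labels, and the assumption that the positive-class prior p is known are mine; AdAUC's concavity regularization and adversarial example generation are not reproduced here.

```python
import torch

def auc_minmax_loss(scores, labels, a, b, alpha, p):
    """Instance-wise squared-loss AUC surrogate in saddle-point (min-max) form.

    scores: model outputs h(x) for a mini-batch, shape (N,)
    labels: binary labels in {0, 1}, shape (N,)
    a, b, alpha: scalar auxiliary variables (tensors requiring grad)
    p: prior probability of the positive class (assumed known or estimated)
    Minimize over (model, a, b); maximize over alpha.
    """
    pos = (labels == 1).float()
    neg = (labels == 0).float()
    loss = ((1 - p) * (scores - a) ** 2 * pos
            + p * (scores - b) ** 2 * neg
            + 2 * (1 + alpha) * (p * scores * neg - (1 - p) * scores * pos)
            - p * (1 - p) * alpha ** 2)
    return loss.mean()
```

In training one would take descent steps on the model parameters together with a and b while ascending on alpha; because the objective is instance-wise, per-sample adversarial perturbations can be computed without scanning all positive-negative pairs, which is the property the abstract emphasizes.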
arXiv:2206.09783 [eess.AS, cs.CL, cs.SD]
Boosting Cross-Domain Speech Recognition with Self-Supervision
Authors: Han Zhu, Gaofeng Cheng, Jindong Wang, Wenxin Hou, Pengyuan Zhang, Yonghong Yan
Abstract: The cross-domain performance of automatic speech recognition (ASR) could be severely hampered due to the mismatch between training and testing distributions. Since the target domain usually lacks labeled data, and domain shifts exist at acoustic and linguistic levels, it is challenging to perform unsupervised domain adaptation (UDA) for ASR. Previous work has shown that self-supervised learning (SSL) or pseudo-labeling (PL) is effective in UDA by exploiting the self-supervisions of unlabeled data. However, these self-supervisions also face performance degradation in mismatched domain distributions, which previous work fails to address. This work presents a systematic UDA framework to fully utilize the unlabeled data with self-supervision in the pre-training and fine-tuning paradigm. On the one hand, we apply continued pre-training and data replay techniques to mitigate the domain mismatch of the SSL pre-trained model. On the other hand, we propose a domain-adaptive fine-tuning approach based on the PL technique with three unique modifications: Firstly, we design a dual-branch PL method to decrease the sensitivity to the erroneous pseudo-labels; Secondly, we devise an uncertainty-aware confidence filtering strategy to improve pseudo-label correctness; Thirdly, we introduce a two-step PL approach to incorporate target domain linguistic knowledge, thus generating more accurate target domain pseudo-labels. Experimental results on various cross-domain scenarios demonstrate that the proposed approach effectively boosts the cross-domain performance and significantly outperforms previous approaches.
Submitted 30 July, 2023; v1 submitted 20 June, 2022; originally announced June 2022.
Comments: Accepted by IEEE/ACM Transactions on Audio, Speech and Language Processing (TASLP), 2023
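The fine-tuning stage described above hinges on discarding unreliable pseudo-labels. The sketch below is a minimal, generic confidence filter (length-normalized log-probability against a fixed threshold); the `Hypothesis` container and the threshold value are illustrative assumptions, and the paper's dual-branch and uncertainty-aware schemes are more elaborate than this.

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Hypothesis:
    tokens: List[str]   # decoded token sequence for one unlabeled utterance
    log_prob: float     # total log-probability assigned by the seed model

def filter_pseudo_labels(hyps: List[Hypothesis], threshold: float = -0.5):
    """Keep pseudo-labeled utterances whose length-normalized confidence
    exceeds a threshold; the rest are dropped from the fine-tuning set."""
    kept = []
    for h in hyps:
        if not h.tokens:
            continue
        norm_conf = h.log_prob / len(h.tokens)  # length-normalized log-prob
        if norm_conf >= threshold:
            kept.append(h)
    return kept
```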
arXiv:2205.07246 [cs.LG, cs.CV]
FreeMatch: Self-adaptive Thresholding for Semi-supervised Learning
Authors: Yidong Wang, Hao Chen, Qiang Heng, Wenxin Hou, Yue Fan, Zhen Wu, Jindong Wang, Marios Savvides, Takahiro Shinozaki, Bhiksha Raj, Bernt Schiele, Xing Xie
Abstract: Semi-supervised Learning (SSL) has witnessed great success owing to the impressive performances brought by various methods based on pseudo labeling and consistency regularization. However, we argue that existing methods might fail to utilize the unlabeled data more effectively since they either use a pre-defined / fixed threshold or an ad-hoc threshold adjusting scheme, resulting in inferior performance and slow convergence. We first analyze a motivating example to obtain intuitions on the relationship between the desirable threshold and the model's learning status. Based on the analysis, we propose FreeMatch to adjust the confidence threshold in a self-adaptive manner according to the model's learning status. We further introduce a self-adaptive class fairness regularization penalty to encourage the model to make diverse predictions during the early training stage. Extensive experiments indicate the superiority of FreeMatch, especially when the labeled data are extremely rare. FreeMatch achieves 5.78%, 13.59%, and 1.28% error rate reduction over the latest state-of-the-art method FlexMatch on CIFAR-10 with 1 label per class, STL-10 with 4 labels per class, and ImageNet with 100 labels per class, respectively. Moreover, FreeMatch can also boost the performance of imbalanced SSL. The codes can be found at https://github.com/microsoft/Semi-supervised-learning.
Submitted 31 January, 2023; v1 submitted 15 May, 2022; originally announced May 2022.
Comments: Accepted by ICLR 2023. Code: https://github.com/microsoft/Semi-supervised-learning
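A minimal sketch of the self-adaptive thresholding idea described above: an EMA of the model's confidence on unlabeled data defines a global threshold, which is modulated per class by an EMA of class-wise confidence. The class names, momentum value, and exact update rules here are assumptions for illustration, and the fairness regularization term is omitted.

```python
import torch

class SelfAdaptiveThreshold:
    """EMA-based global threshold modulated by per-class confidence,
    in the spirit of self-adaptive thresholding for SSL."""

    def __init__(self, num_classes: int, momentum: float = 0.999):
        self.m = momentum
        self.global_t = torch.tensor(1.0 / num_classes)               # global threshold
        self.local_p = torch.full((num_classes,), 1.0 / num_classes)  # per-class confidence

    @torch.no_grad()
    def update(self, probs: torch.Tensor):
        """probs: (B, C) softmax predictions on an unlabeled batch."""
        max_conf, _ = probs.max(dim=1)
        self.global_t = self.m * self.global_t + (1 - self.m) * max_conf.mean()
        self.local_p = self.m * self.local_p + (1 - self.m) * probs.mean(dim=0)

    def mask(self, probs: torch.Tensor) -> torch.Tensor:
        """Return a 0/1 mask selecting confident unlabeled samples."""
        max_conf, pred = probs.max(dim=1)
        class_t = self.local_p / self.local_p.max() * self.global_t
        return (max_conf >= class_t[pred]).float()
```

The returned mask would weight the unlabeled consistency loss, so the effective threshold rises as the model becomes more confident.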
arXiv:2205.04121 [cs.CV, cs.HC]
Identifying Fixation and Saccades in Virtual Reality
Authors: Xiao-lin Chen, Wen-jun Hou
Abstract: Gaze recognition can significantly reduce the amount of eye movement data for a better understanding of cognitive and visual processing. Gaze recognition is an essential precondition for eye-based interaction applications in virtual reality. However, the three-dimensional characteristics of virtual reality environments also pose new challenges to existing recognition algorithms. Based on seven evaluation metrics and the Overall score (the mean of the seven normalized metric values), we obtain optimal parameters of three existing recognition algorithms (Velocity-Threshold Identification, Dispersion-Threshold Identification, and Velocity & Dispersion-Threshold Identification) and our modified Velocity & Dispersion-Threshold Identification algorithm. We compare the performance of these four algorithms with optimal parameters. The results show that our modified Velocity & Dispersion-Threshold Identification performs the best. The impact of interface complexity on classification results is also preliminarily explored. The results show that the algorithms are not sensitive to interface complexity.
Submitted 9 May, 2022; originally announced May 2022.
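For reference, the classic Velocity-Threshold Identification (I-VT) step evaluated above can be sketched as follows for VR gaze data: the angular velocity between consecutive gaze direction vectors is compared against a threshold, with slow samples labeled fixation and fast ones saccade. The 30 deg/s threshold is illustrative, and the authors' modified Velocity & Dispersion-Threshold variant adds a dispersion check not shown here.

```python
import numpy as np

def ivt_classify(gaze_dirs: np.ndarray, timestamps: np.ndarray,
                 velocity_threshold_deg: float = 30.0) -> np.ndarray:
    """Classify VR gaze samples as fixation (0) or saccade (1) with I-VT.

    gaze_dirs:  (N, 3) unit gaze direction vectors in head/world space
    timestamps: (N,) sample times in seconds
    """
    # Angle between consecutive gaze directions, in degrees.
    dots = np.clip(np.sum(gaze_dirs[1:] * gaze_dirs[:-1], axis=1), -1.0, 1.0)
    angles = np.degrees(np.arccos(dots))
    dt = np.diff(timestamps)
    velocity = angles / np.maximum(dt, 1e-9)          # angular velocity in deg/s
    labels = (velocity > velocity_threshold_deg).astype(int)
    return np.concatenate([[labels[0]], labels])      # pad the first sample
```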
arXiv:2112.08643 [cs.CV, cs.AI]
TransZero++: Cross Attribute-Guided Transformer for Zero-Shot Learning
Authors: Shiming Chen, Ziming Hong, Wenjin Hou, Guo-Sen Xie, Yibing Song, Jian Zhao, Xinge You, Shuicheng Yan, Ling Shao
Abstract: Zero-shot learning (ZSL) tackles the novel class recognition problem by transferring semantic knowledge from seen classes to unseen ones. Existing attention-based models tend to learn inferior region features in a single image by solely using unidirectional attention, which ignores the transferability and discriminative attribute localization of visual features. In this paper, we propose a cross attribute-guided Transformer network, termed TransZero++, to refine visual features and learn accurate attribute localization for semantic-augmented visual embedding representations in ZSL. TransZero++ consists of an attribute→visual Transformer sub-net (AVT) and a visual→attribute Transformer sub-net (VAT). Specifically, AVT first takes a feature augmentation encoder to alleviate the cross-dataset problem, and improves the transferability of visual features by reducing the entangled relative geometry relationships among region features.
Then, an attribute→visual decoder is employed to localize the image regions most relevant to each attribute in a given image for attribute-based visual feature representations. Analogously, VAT uses a similar feature augmentation encoder to refine the visual features, which are further applied in the visual→attribute decoder to learn visual-based attribute features. By further introducing semantical collaborative losses, the two attribute-guided transformers teach each other to learn semantic-augmented visual embeddings via semantical collaborative learning. Extensive experiments show that TransZero++ achieves new state-of-the-art results on three challenging ZSL benchmarks. The codes are available at https://github.com/shiming-chen/TransZero_pp.
Submitted 13 December, 2022; v1 submitted 16 December, 2021; originally announced December 2021.
Comments: This is an extension of the AAAI'22 paper (TransZero). Accepted to TPAMI. arXiv admin note: substantial text overlap with arXiv:2112.01683
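To illustrate the attribute-guided localization mechanism described above, here is a generic attribute→visual cross-attention block in which attribute embeddings act as queries over region features. The dimensions, projection layers, and module name are assumptions; this is not the paper's AVT/VAT sub-nets or feature augmentation encoder.

```python
import torch
import torch.nn as nn

class AttributeVisualAttention(nn.Module):
    """Cross-attention that localizes, for each attribute, the most relevant
    image regions and returns attribute-conditioned visual features."""

    def __init__(self, num_attributes: int, attr_dim: int, region_dim: int,
                 embed_dim: int = 256, num_heads: int = 4):
        super().__init__()
        self.attr_emb = nn.Parameter(torch.randn(num_attributes, attr_dim))
        self.q_proj = nn.Linear(attr_dim, embed_dim)
        self.kv_proj = nn.Linear(region_dim, embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

    def forward(self, regions: torch.Tensor):
        """regions: (B, R, region_dim) region features from a CNN/ViT backbone."""
        B = regions.size(0)
        q = self.q_proj(self.attr_emb).unsqueeze(0).expand(B, -1, -1)  # (B, A, E)
        kv = self.kv_proj(regions)                                     # (B, R, E)
        attr_feats, attn_weights = self.attn(q, kv, kv)
        return attr_feats, attn_weights  # (B, A, E), (B, A, R) attention maps
```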
arXiv:2112.07225 [cs.CV, cs.AI, cs.LG]
Margin Calibration for Long-Tailed Visual Recognition
Authors: Yidong Wang, Bowen Zhang, Wenxin Hou, Zhen Wu, Jindong Wang, Takahiro Shinozaki
Abstract: The long-tailed class distribution in visual recognition tasks poses great challenges for neural networks on how to handle the biased predictions between head and tail classes, i.e., the model tends to classify tail classes as head classes. While existing research focused on data resampling and loss function engineering, in this paper, we take a different perspective: the classification margins. We study the relationship between the margins and logits (classification scores) and empirically observe that the biased margins and the biased logits are positively correlated. We propose MARC, a simple yet effective MARgin Calibration function to dynamically calibrate the biased margins for unbiased logits. We validate MARC through extensive experiments on common long-tailed benchmarks including CIFAR-LT, ImageNet-LT, Places-LT, and iNaturalist-LT. Experimental results demonstrate that our MARC achieves favorable results on these benchmarks. In addition, MARC is extremely easy to implement with just three lines of code. We hope this simple method will motivate people to rethink the biased margins and biased logits in long-tailed visual recognition.
Submitted 7 October, 2022; v1 submitted 14 December, 2021; originally announced December 2021.
Comments: Accepted by Asian Conference on Machine Learning (ACML) 2022; 16 pages
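The abstract reports that the margin calibration amounts to a few lines of code; one plausible reading, sketched below under assumptions of mine, is a learnable per-class scale and shift applied to the logits of a frozen long-tailed model and trained with standard cross-entropy. MARC's exact parameterization may differ.

```python
import torch
import torch.nn as nn

class MarginCalibration(nn.Module):
    """Per-class affine calibration of logits from a frozen long-tailed model.
    Only `scale` and `shift` are trained; the backbone and classifier stay fixed."""

    def __init__(self, num_classes: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(num_classes))
        self.shift = nn.Parameter(torch.zeros(num_classes))

    def forward(self, logits: torch.Tensor) -> torch.Tensor:
        # logits: (B, C) outputs of the frozen classifier
        return self.scale * logits + self.shift
```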
arXiv:2110.08263 [cs.LG, cs.CV]
FlexMatch: Boosting Semi-Supervised Learning with Curriculum Pseudo Labeling
Authors: Bowen Zhang, Yidong Wang, Wenxin Hou, Hao Wu, Jindong Wang, Manabu Okumura, Takahiro Shinozaki
Abstract: The recently proposed FixMatch achieved state-of-the-art results on most semi-supervised learning (SSL) benchmarks. However, like other modern SSL algorithms, FixMatch uses a pre-defined constant threshold for all classes to select unlabeled data that contribute to the training, thus failing to consider the different learning status and learning difficulties of different classes. To address this issue, we propose Curriculum Pseudo Labeling (CPL), a curriculum learning approach to leverage unlabeled data according to the model's learning status. The core of CPL is to flexibly adjust thresholds for different classes at each time step to let pass informative unlabeled data and their pseudo labels. CPL does not introduce additional parameters or computations (forward or backward propagation). We apply CPL to FixMatch and call our improved algorithm FlexMatch. FlexMatch achieves state-of-the-art performance on a variety of SSL benchmarks, with especially strong performances when the labeled data are extremely limited or when the task is challenging. For example, FlexMatch achieves 13.96% and 18.96% error rate reduction over FixMatch on the CIFAR-100 and STL-10 datasets respectively, when there are only 4 labels per class. CPL also significantly boosts the convergence speed, e.g., FlexMatch can use only 1/5 of the training time of FixMatch to achieve even better performance. Furthermore, we show that CPL can be easily adapted to other SSL algorithms and remarkably improve their performances. We open-source our code at https://github.com/TorchSSL/TorchSSL.
Submitted 28 January, 2022; v1 submitted 14 October, 2021; originally announced October 2021.
Comments: NeurIPS 2021; camera-ready version; 16 pages with appendix; code: https://github.com/TorchSSL/TorchSSL
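A hedged sketch of the Curriculum Pseudo Labeling idea described above: classes that currently pass the fixed base threshold less often are treated as harder and receive lower thresholds. The warm-up and normalization details of the paper are simplified away, and the mapping from normalized counts to thresholds is taken to be linear for illustration.

```python
import torch

def flexible_thresholds(probs: torch.Tensor, base_threshold: float = 0.95):
    """Compute per-class thresholds from the model's current predictions.

    probs: (N, C) softmax outputs on (a large sample of) unlabeled data.
    Classes that pass the base threshold less often are judged harder and
    get a lower threshold, letting more of their pseudo-labels through.
    """
    conf, pred = probs.max(dim=1)
    num_classes = probs.size(1)
    # sigma[c]: how many unlabeled samples are confidently predicted as class c
    sigma = torch.zeros(num_classes)
    for c in range(num_classes):
        sigma[c] = ((pred == c) & (conf >= base_threshold)).sum()
    beta = sigma / sigma.max().clamp(min=1)   # normalized "learning effect" per class
    return beta * base_threshold              # flexible per-class thresholds

def pseudo_label_mask(probs: torch.Tensor, class_thresholds: torch.Tensor):
    """Select unlabeled samples whose confidence exceeds their class threshold."""
    conf, pred = probs.max(dim=1)
    return conf >= class_thresholds[pred]
```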
arXiv:2105.11905 [cs.CL, cs.SD, eess.AS]
Exploiting Adapters for Cross-lingual Low-resource Speech Recognition
Authors: Wenxin Hou, Han Zhu, Yidong Wang, Jindong Wang, Tao Qin, Renjun Xu, Takahiro Shinozaki
Abstract: Cross-lingual speech adaptation aims to solve the problem of leveraging multiple rich-resource languages to build models for a low-resource target language. Since the low-resource language has limited training data, speech recognition models can easily overfit. In this paper, we investigate the performance of multiple adapter-based approaches for parameter-efficient cross-lingual speech adaptation. Based on our previous MetaAdapter, which implicitly leverages adapters, we propose a novel algorithm called SimAdapter for explicitly learning knowledge from adapters. Both algorithms leverage adapters, which can be easily integrated into the Transformer structure. MetaAdapter leverages meta-learning to transfer the general knowledge from training data to the test language. SimAdapter aims to learn the similarities between the source and target languages during fine-tuning using the adapters. We conduct extensive experiments on five low-resource languages in the Common Voice dataset. Results demonstrate that our MetaAdapter and SimAdapter methods can reduce WER by 2.98% and 2.55% with only 2.5% and 15.5% of trainable parameters compared to the strong full-model fine-tuning baseline. Moreover, we also show that these two novel algorithms can be integrated for better performance, with up to 3.55% relative WER reduction.
Submitted 17 December, 2021; v1 submitted 18 May, 2021; originally announced May 2021.
Comments: Accepted by IEEE Transactions on Audio, Speech, and Language Processing (TASLP) as a full paper; 12 pages; code at https://github.com/jindongwang/transferlearning/tree/master/code/ASR/Adapter
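Both methods above build on the standard bottleneck adapter inserted into a frozen pre-trained Transformer; a minimal sketch follows (the bottleneck width is an assumption). SimAdapter's attention-based fusion over source-language adapters and MetaAdapter's meta-learned initialization sit on top of this block and are not shown.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter: LayerNorm -> down-projection -> ReLU -> up-projection,
    added residually after a Transformer sub-layer. Only the adapter weights are
    trained for the target language; the pre-trained backbone stays frozen."""

    def __init__(self, d_model: int, bottleneck: int = 64):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.down = nn.Linear(d_model, bottleneck)
        self.up = nn.Linear(bottleneck, d_model)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        return hidden + self.up(torch.relu(self.down(self.norm(hidden))))
```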
arXiv:2104.07491 [cs.SD, cs.LG, eess.AS]
Cross-domain Speech Recognition with Unsupervised Character-level Distribution Matching
Authors: Wenxin Hou, Jindong Wang, Xu Tan, Tao Qin, Takahiro Shinozaki
Abstract: End-to-end automatic speech recognition (ASR) can achieve promising performance with large-scale training data. However, it is known that domain mismatch between training and testing data often leads to a degradation of recognition accuracy. In this work, we focus on the unsupervised domain adaptation for ASR and propose CMatch, a Character-level distribution matching method to perform fine-grained adaptation between each character in two domains. First, to obtain labels for the features belonging to each character, we achieve frame-level label assignment using the Connectionist Temporal Classification (CTC) pseudo labels. Then, we match the character-level distributions using Maximum Mean Discrepancy. We train our algorithm using the self-training technique. Experiments on the Libri-Adapt dataset show that our proposed approach achieves 14.39% and 16.50% relative Word Error Rate (WER) reduction on cross-device and cross-environment ASR, respectively. We also comprehensively analyze the different strategies for frame-level label assignment and Transformer adaptations.
Submitted 8 June, 2021; v1 submitted 15 April, 2021; originally announced April 2021.
Comments: Accepted to INTERSPEECH 2021; code available at https://github.com/jindongwang/transferlearning/tree/master/code/ASR/CMatch
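The matching step described above reduces to a Maximum Mean Discrepancy term between source- and target-domain frame features assigned to the same character via CTC pseudo-labels. Below is a standard RBF-kernel MMD estimator as a hedged sketch; the kernel and its bandwidth are assumptions, and summing the term over characters would give the adaptation loss added to the ASR objective.

```python
import torch

def gaussian_mmd(x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    """Squared MMD between two feature sets with an RBF kernel.

    x: (n, d) source-domain frame features assigned to one character
    y: (m, d) target-domain frame features assigned to the same character
    """
    def rbf(a, b):
        d2 = torch.cdist(a, b) ** 2
        return torch.exp(-d2 / (2 * sigma ** 2))

    return rbf(x, x).mean() + rbf(y, y).mean() - 2 * rbf(x, y).mean()
```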
arXiv:2104.05752 [cs.CL, cs.LG, cs.SD, eess.AS]
Speak or Chat with Me: End-to-End Spoken Language Understanding System with Flexible Inputs
Authors: Sujeong Cha, Wangrui Hou, Hyun Jung, My Phung, Michael Picheny, Hong-Kwang Kuo, Samuel Thomas, Edmilson Morais
Abstract: A major focus of recent research in spoken language understanding (SLU) has been on the end-to-end approach where a single model can predict intents directly from speech inputs without intermediate transcripts. However, this approach presents some challenges. First, since speech can be considered as personally identifiable information, in some cases only automatic speech recognition (ASR) transcripts are accessible. Second, intent-labeled speech data is scarce. To address the first challenge, we propose a novel system that can predict intents from flexible types of inputs: speech, ASR transcripts, or both. We demonstrate strong performance for either modality separately, and when both speech and ASR transcripts are available, through system combination, we achieve better results than using a single input modality. To address the second challenge, we leverage a semantically robust pre-trained BERT model and adopt a cross-modal system that co-trains text embeddings and acoustic embeddings in a shared latent space. We further enhance this system by utilizing an acoustic module pre-trained on LibriSpeech and domain-adapting the text module on our target datasets. Our experiments show significant advantages for these pre-training and fine-tuning strategies, resulting in a system that achieves competitive intent-classification performance on the Snips SLU and Fluent Speech Commands datasets.
Submitted 14 June, 2021; v1 submitted 7 April, 2021; originally announced April 2021.
Comments: Accepted to Interspeech 2021

arXiv:2012.03729 [cs.LG]
TRACE: Early Detection of Chronic Kidney Disease Onset with Transformer-Enhanced Feature Embedding
Authors: Yu Wang, Ziqiao Guan, Wei Hou, Fusheng Wang
Abstract: Chronic kidney disease (CKD) has a poor prognosis due to excessive risk factors and comorbidities associated with it. The early detection of CKD faces challenges of insufficient medical histories of positive patients and complicated risk factors. In this paper, we propose the TRACE (Transformer-RNN Autoencoder-enhanced CKD Detector) framework, an end-to-end prediction model using patients' medical history data, to deal with these challenges. TRACE presents a comprehensive medical history representation with a novel key component: a Transformer-RNN autoencoder. The autoencoder jointly learns a medical concept embedding via Transformer for each hospital visit, and a latent representation which summarizes a patient's medical history across all the visits. We compared TRACE with multiple state-of-the-art methods on a dataset derived from real-world medical records. Our model has achieved 0.5708 AUPRC with a 2.31% relative improvement over the best-performing method. We also validated the clinical meaning of the learned embeddings through visualizations and a case study, showing the potential of TRACE to serve as a general disease prediction model.
Submitted 2 December, 2020; originally announced December 2020.
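A minimal sketch, under assumed layer sizes and a simplified reconstruction target, of the Transformer-RNN autoencoder idea described above: a Transformer encoder embeds the medical codes within each visit, a GRU summarizes the visit sequence into a latent history, and a decoder reconstructs the visit embeddings.

```python
import torch
import torch.nn as nn

class TransformerRNNAutoencoder(nn.Module):
    """Visit-level Transformer encoder + patient-level GRU, with a simple
    decoder that reconstructs the sequence of visit embeddings."""

    def __init__(self, vocab_size: int, d_model: int = 128, latent: int = 64):
        super().__init__()
        self.code_emb = nn.Embedding(vocab_size, d_model, padding_idx=0)
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.visit_encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.rnn = nn.GRU(d_model, latent, batch_first=True)
        self.decoder = nn.GRU(latent, d_model, batch_first=True)

    def forward(self, codes: torch.Tensor):
        """codes: (B, V, L) integer medical codes, V visits of up to L codes each."""
        B, V, L = codes.shape
        x = self.code_emb(codes.view(B * V, L))        # (B*V, L, d_model)
        visit = self.visit_encoder(x).mean(dim=1)      # mean-pool codes per visit
        visit = visit.view(B, V, -1)                   # (B, V, d_model)
        _, h = self.rnn(visit)                         # latent history: (1, B, latent)
        dec_in = h.transpose(0, 1).expand(B, V, -1).contiguous()
        recon, _ = self.decoder(dec_in)                # reconstructed visit embeddings
        return recon, visit, h.squeeze(0)              # compare recon vs. visit (e.g. MSE)
```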
arXiv:2012.02006 [cs.DB, cs.LG, cs.SI, stat.ML]
AugSplicing: Synchronized Behavior Detection in Streaming Tensors
Authors: Jiabao Zhang, Shenghua Liu, Wenting Hou, Siddharth Bhatia, Huawei Shen, Wenjian Yu, Xueqi Cheng
Abstract: How can we track synchronized behavior in a stream of time-stamped tuples, such as mobile devices installing and uninstalling applications in lockstep to boost their ranks in the app store? We model such tuples as entries in a streaming tensor, which augments attribute sizes in its modes over time. Synchronized behavior tends to form dense blocks (i.e. subtensors) in such a tensor, signaling anomalous behavior or interesting communities. However, existing dense block detection methods are either based on a static tensor or lack an efficient algorithm in a streaming setting. Therefore, we propose a fast streaming algorithm, AugSplicing, which can detect the top dense blocks by incrementally splicing the previous detection with the incoming ones in new tuples, avoiding re-runs over all the history data at every tracking time step. AugSplicing is based on a splicing condition that guides the algorithm (Section 4). Compared to the state-of-the-art methods, our method is (1) effective to detect fraudulent behavior in installing data of real-world apps and find a synchronized group of students with interesting features in campus Wi-Fi data; (2) robust with splicing theory for dense block detection; (3) streaming and faster than the existing streaming algorithm, with closely comparable accuracy.
Submitted 30 March, 2021; v1 submitted 3 December, 2020; originally announced December 2020.
Comments: AAAI Conference on Artificial Intelligence (AAAI), 2021

arXiv:2010.04589 [cs.LG, cs.CY, stat.ML]
Identifying Risk of Opioid Use Disorder for Patients Taking Opioid Medications with Deep Learning
Authors: Xinyu Dong, Jianyuan Deng, Sina Rashidian, Kayley Abell-Hart, Wei Hou, Richard N Rosenthal, Mary Saltz, Joel Saltz, Fusheng Wang
Abstract: The United States is experiencing an opioid epidemic, with more than 10 million opioid misusers aged 12 or older each year. Identifying patients at high risk of Opioid Use Disorder (OUD) can help to make early clinical interventions to reduce the risk of OUD. Our goal is to predict OUD patients among opioid prescription users through analyzing electronic health records with machine learning and deep learning methods. This will help us to better understand the diagnoses of OUD, providing new insights on the opioid epidemic. Electronic health records of patients who had been prescribed medications containing active opioid ingredients were extracted from the Cerner Health Facts database between January 1, 2008 and December 31, 2017. Long Short-Term Memory (LSTM) models were applied to predict future opioid use disorder risk based on the five most recent encounters, and compared to Logistic Regression, Random Forest, Decision Tree and Dense Neural Network models. Prediction performance was assessed using F1 score, precision, recall, and AUROC. Our temporal deep learning model provided promising prediction results which outperformed the other methods, with an F1 score of 0.8023 and an AUROC of 0.9369. The model can identify OUD-related medications and vital signs as important features for the prediction. An LSTM-based temporal deep learning model is effective at predicting opioid use disorder using a patient's past history of electronic health records, with minimal domain knowledge. It has the potential to improve clinical decision support for early intervention and prevention to combat the opioid epidemic.
Submitted 9 October, 2020; originally announced October 2020.
Comments: 20 pages, 6 figures
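As a rough illustration of the temporal model described above (shapes and sizes are assumptions), an LSTM reads the feature vectors of a patient's most recent encounters and outputs a sigmoid OUD risk score; the feature engineering and comparison baselines from the paper are not reproduced.

```python
import torch
import torch.nn as nn

class OUDRiskLSTM(nn.Module):
    """LSTM over a patient's recent encounters -> probability of future OUD."""

    def __init__(self, num_features: int, hidden: int = 128):
        super().__init__()
        self.lstm = nn.LSTM(num_features, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, encounters: torch.Tensor) -> torch.Tensor:
        """encounters: (B, T, num_features), e.g. T = 5 most recent encounters."""
        _, (h, _) = self.lstm(encounters)
        return torch.sigmoid(self.head(h[-1])).squeeze(-1)  # (B,) risk scores
```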
