
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 96 results for author: <span class="mathjax">Niu, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Niu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Niu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Niu%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Niu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Niu%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Niu%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Niu%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07611">arXiv:2411.07611</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07611">pdf</a>, <a href="https://arxiv.org/format/2411.07611">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Clinical Reasoning through Knowledge-augmented Rationale Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuai Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Liang Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhihua Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yida Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yunya Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07611v1-abstract-short" style="display: inline;"> Clinical rationales play a pivotal role in accurate disease diagnosis; however, many models predominantly use discriminative methods and overlook the importance of generating supportive rationales. Rationale distillation is a process that transfers knowledge from large language models (LLMs) to smaller language models (SLMs), thereby enhancing the latter&#39;s ability to break down complex tasks. 
Desp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07611v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07611v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07611v1-abstract-full" style="display: none;"> Clinical rationales play a pivotal role in accurate disease diagnosis; however, many models predominantly use discriminative methods and overlook the importance of generating supportive rationales. Rationale distillation is a process that transfers knowledge from large language models (LLMs) to smaller language models (SLMs), thereby enhancing the latter&#39;s ability to break down complex tasks. Despite its benefits, rationale distillation alone is inadequate for addressing domain knowledge limitations in tasks requiring specialized expertise, such as disease diagnosis. Effectively embedding domain knowledge in SLMs poses a significant challenge. While current LLMs are primarily geared toward processing textual data, multimodal LLMs that incorporate time series data, especially electronic health records (EHRs), are still evolving. To tackle these limitations, we introduce ClinRaGen, an SLM optimized for multimodal rationale generation in disease diagnosis. ClinRaGen incorporates a unique knowledge-augmented attention mechanism to merge domain knowledge with time series EHR data, utilizing a stepwise rationale distillation strategy to produce both textual and time series-based clinical rationales. Our evaluations show that ClinRaGen markedly improves the SLM&#39;s capability to interpret multimodal EHR data and generate accurate clinical rationales, supporting more reliable disease diagnosis, advancing LLM applications in healthcare, and narrowing the performance divide between LLMs and SLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07611v1-abstract-full').style.display = 'none'; document.getElementById('2411.07611v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages. 
4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06667">arXiv:2411.06667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06667">pdf</a>, <a href="https://arxiv.org/format/2411.06667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DCF-DS: Deep Cascade Fusion of Diarization and Separation for Speech Recognition under Realistic Single-Channel Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shu-Tong Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruo-Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+G">Gao-Bin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06667v1-abstract-short" style="display: inline;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06667v1-abstract-full" style="display: none;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively. Then, to complement DCF-DS training, we introduce a window-level decoding scheme that allows the DCF-DS framework to handle the sparse data convergence instability (SDCI) problem. We also explore using an NSD system trained on real datasets to provide more accurate speaker boundaries during decoding. Additionally, we incorporate an optional multi-input multi-output speech enhancement module (MIMO-SE) within the DCF-DS framework, which offers further performance gains. Finally, we enhance diarization results by re-clustering DCF-DS outputs, improving ASR accuracy. By incorporating the DCF-DS method, we achieved first place in the realistic single-channel track of the CHiME-8 NOTSOFAR-1 challenge. 
We also perform the evaluation on the open LibriCSS dataset, achieving a new state-of-the-art performance on single-channel speech recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v1-abstract-full').style.display = 'none'; document.getElementById('2411.06667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01833">arXiv:2411.01833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01833">pdf</a>, <a href="https://arxiv.org/format/2411.01833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> OwMatch: Conditional Self-Labeling with Consistency for Open-World Semi-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shengjie Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lifan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jian Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01833v1-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) offers a robust framework for harnessing the potential of unannotated data. Traditionally, SSL mandates that all classes possess labeled instances. However, the emergence of open-world SSL (OwSSL) introduces a more practical challenge, wherein unlabeled data may encompass samples from unseen classes. This scenario leads to misclassification of unseen classes as known&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01833v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01833v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01833v1-abstract-full" style="display: none;"> Semi-supervised learning (SSL) offers a robust framework for harnessing the potential of unannotated data. Traditionally, SSL mandates that all classes possess labeled instances. However, the emergence of open-world SSL (OwSSL) introduces a more practical challenge, wherein unlabeled data may encompass samples from unseen classes. This scenario leads to misclassification of unseen classes as known ones, consequently undermining classification accuracy. To overcome this challenge, this study revisits two methodologies from self-supervised and semi-supervised learning, self-labeling and consistency, tailoring them to address the OwSSL problem. 
   Specifically, we propose an effective framework called OwMatch, combining conditional self-labeling and open-world hierarchical thresholding. Theoretically, we analyze the estimation of class distribution on unlabeled data through rigorous statistical analysis, thus demonstrating that OwMatch can reliably ensure the unbiasedness of the self-label assignment estimator. Comprehensive empirical analyses demonstrate that our method yields substantial performance enhancements across both known and unknown classes in comparison to previous studies. Code is available at https://github.com/niusj03/OwMatch.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: NeurIPS 2024 camera-ready (10 pages, 4 figures) with the appendices (10 pages, 7 figures)
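   The self-label assignment this abstract analyzes can be illustrated compactly. Below is a minimal, generic sketch of distribution-constrained soft self-labeling via Sinkhorn-style normalization; the uniform prior, iteration count, and function names are illustrative assumptions, not details taken from the paper.

   ```python
   import torch

   def self_label(logits, class_prior, n_iters=3):
       """Distribution-constrained pseudo-labeling (generic sketch).

       Assigns soft self-labels to unlabeled samples while nudging the
       column marginals toward an estimated class prior -- the quantity
       whose unbiased estimation the abstract discusses. `class_prior`
       is a (C,) tensor summing to 1 (uniform here, as an assumption).
       """
       Q = torch.softmax(logits, dim=1)          # (N, C) initial soft assignment
       for _ in range(n_iters):
           Q = Q * (class_prior / Q.sum(dim=0))  # match per-class marginals
           Q = Q / Q.sum(dim=1, keepdim=True)    # each sample sums to 1
       return Q

   # Usage: a batch of unlabeled samples over 5 known + 3 novel classes
   logits = torch.randn(128, 8)
   soft_labels = self_label(logits, torch.full((8,), 1 / 8))
   ```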
4. arXiv:2410.22350 [pdf, other] (cs.MM, cs.SD, eess.AS)
   Quality-Aware End-to-End Audio-Visual Neural Speaker Diarization
   Authors: Mao-Kui He, Jun Du, Shu-Tong Niu, Qing-Feng Liu, Chin-Hui Lee
   Abstract: In this paper, we propose a quality-aware end-to-end audio-visual neural speaker diarization framework, which comprises three key techniques. First, our audio-visual model takes both audio and visual features as inputs, utilizing a series of binary classification output layers to simultaneously identify the activities of all speakers. This end-to-end framework is meticulously designed to effectively handle situations of overlapping speech, providing accurate discrimination between speech and non-speech segments through the utilization of multi-modal information. Next, we employ a quality-aware audio-visual fusion structure to address signal quality issues for both audio degradations, such as noise, reverberation and other distortions, and video degradations, such as occlusions, off-screen speakers, or unreliable detection. Finally, a cross attention mechanism applied to multi-speaker embedding empowers the network to handle scenarios with varying numbers of speakers. Our experimental results, obtained from various data sets, demonstrate the robustness of our proposed techniques in diverse acoustic environments. Even in scenarios with severely degraded video quality, our system attains performance levels comparable to the best available audio-visual systems.
   Submitted 15 October, 2024; originally announced October 2024.

5. arXiv:2410.15029 [pdf, other] (cs.CL, cs.AI)
   Enhancing Multimodal Sentiment Analysis for Missing Modality through Self-Distillation and Unified Modality Cross-Attention
   Authors: Yuzhe Weng, Haotian Wang, Tian Gao, Kewei Li, Shutong Niu, Jun Du
   Abstract: In multimodal sentiment analysis, collecting text data is often more challenging than video or audio due to higher annotation costs and inconsistent automatic speech recognition (ASR) quality. To address this challenge, our study has developed a robust model that effectively integrates multimodal sentiment information, even in the absence of text modality. Specifically, we have developed a Double-Flow Self-Distillation Framework, including Unified Modality Cross-Attention (UMCA) and Modality Imagination Autoencoder (MIA), which excels at processing both scenarios with complete modalities and those with missing text modality. In detail, when the text modality is missing, our framework uses the LLM-based model to simulate the text representation from the audio modality, while the MIA module supplements information from the other two modalities to make the simulated text representation similar to the real text representation. To further align the simulated and real representations, and to enable the model to capture the continuous nature of sample orders in sentiment valence regression tasks, we have also introduced the Rank-N Contrast (RNC) loss function. When testing on the CMU-MOSEI, our model achieved outstanding performance on MAE and significantly outperformed other models when text modality is missing.
   The code is available at: https://github.com/WarmCongee/SDUMC
   Submitted 19 October, 2024; originally announced October 2024.
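   The Rank-N Contrast (RNC) loss named above contrasts each anchor-positive pair against all samples at least as distant in label space, so embeddings end up ordered by sentiment valence. A minimal sketch of one common formulation follows; the temperature value and the O(N^2) loops are chosen for clarity, and this is an illustration rather than the authors' implementation.

   ```python
   import torch
   import torch.nn.functional as F

   def rnc_loss(features, labels, temperature=2.0):
       """Rank-N-Contrast-style loss for regression embeddings (sketch).

       For anchor i and positive j, the denominator ranges over every
       sample whose label distance from i is at least |y_i - y_j|.
       """
       z = F.normalize(features, dim=1)
       sim = z @ z.T / temperature                       # (N, N) similarities
       dist = (labels[:, None] - labels[None, :]).abs()  # label-space distances
       n = z.size(0)
       not_self = ~torch.eye(n, dtype=torch.bool, device=z.device)
       loss = 0.0
       for i in range(n):
           for j in range(n):
               if i == j:
                   continue
               # "negatives" for pair (i, j): samples at least as far as j
               hard = (dist[i] >= dist[i, j]) & not_self[i]
               loss = loss + torch.logsumexp(sim[i][hard], 0) - sim[i, j]
       return loss / (n * (n - 1))
   ```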
6. arXiv:2410.12788 [pdf, other] (cs.CL)
   Meta-Chunking: Learning Efficient Text Segmentation via Logical Perception
   Authors: Jihao Zhao, Zhiyuan Ji, Pengnian Qi, Simin Niu, Bo Tang, Feiyu Xiong, Zhiyu Li
   Abstract: Retrieval-Augmented Generation (RAG), while serving as a viable complement to large language models (LLMs), often overlooks the crucial aspect of text chunking within its pipeline, which impacts the quality of knowledge-intensive tasks. This paper introduces the concept of Meta-Chunking, which refers to a granularity between sentences and paragraphs, consisting of a collection of sentences within a paragraph that have deep linguistic logical connections. To implement Meta-Chunking, we designed two strategies based on LLMs: Margin Sampling Chunking and Perplexity Chunking. The former employs LLMs to perform binary classification on whether consecutive sentences need to be segmented, making decisions based on the probability difference obtained from margin sampling. The latter precisely identifies text chunk boundaries by analyzing the characteristics of perplexity distribution. Additionally, considering the inherent complexity of different texts, we propose a strategy that combines Meta-Chunking with dynamic merging to achieve a balance between fine-grained and coarse-grained text chunking. Experiments conducted on eleven datasets demonstrate that Meta-Chunking can more efficiently improve the performance of single-hop and multi-hop question answering based on RAG. For instance, on the 2WikiMultihopQA dataset, it outperforms similarity chunking by 1.32 while only consuming 45.8% of the time. Our code is available at https://github.com/IAAR-Shanghai/Meta-Chunking.
   Submitted 16 October, 2024; originally announced October 2024.
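   Of the two strategies, Perplexity Chunking lends itself to a short sketch: score each sentence's perplexity given the text so far and open a new chunk where the score suggests a weak logical connection. The model choice (gpt2 as a stand-in), the pre-split sentence list, and the simple jump rule below are assumptions; the paper itself derives boundaries from characteristics of the perplexity distribution.

   ```python
   import torch
   from transformers import AutoModelForCausalLM, AutoTokenizer

   tok = AutoTokenizer.from_pretrained("gpt2")            # stand-in small LM
   lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

   def sentence_ppl(context: str, sentence: str) -> float:
       """Perplexity of `sentence` given the running chunk `context`."""
       ctx = tok(context, return_tensors="pt").input_ids if context else None
       sent = tok(sentence, return_tensors="pt").input_ids
       ids = torch.cat([ctx, sent], dim=1) if ctx is not None else sent
       labels = ids.clone()
       if ctx is not None:
           labels[:, : ctx.size(1)] = -100                # score only the new sentence
       with torch.no_grad():
           loss = lm(ids, labels=labels).loss             # mean NLL over scored tokens
       return float(torch.exp(loss))

   def perplexity_chunk(sentences, jump=1.2):
       """Start a new chunk when a sentence's PPL jumps above `jump`x the previous."""
       chunks, cur, prev = [], [], None
       for s in sentences:
           p = sentence_ppl(" ".join(cur), s)
           if prev is not None and p > jump * prev:       # weak logical connection
               chunks.append(" ".join(cur)); cur = []
           cur.append(s); prev = p
       if cur:
           chunks.append(" ".join(cur))
       return chunks
   ```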
7. arXiv:2410.07738 [pdf, other] (cs.LG, cs.AI)
   Enhancing Federated Domain Adaptation with Multi-Domain Prototype-Based Federated Fine-Tuning
   Authors: Jingyuan Zhang, Yiyang Duan, Shuaicheng Niu, Yang Cao, Wei Yang Bryan Lim
   Abstract: Federated Domain Adaptation (FDA) is a Federated Learning (FL) scenario where models are trained across multiple clients with unique data domains but a shared category space, without transmitting private data. The primary challenge in FDA is data heterogeneity, which causes significant divergences in gradient updates when using conventional averaging-based aggregation methods, reducing the efficacy of the global model. This further undermines both in-domain and out-of-domain performance (within the same federated system but outside the local client). To address this, we propose a novel framework called Multi-domain Prototype-based Federated Fine-Tuning (MPFT). MPFT fine-tunes a pre-trained model using multi-domain prototypes, i.e., pretrained representations enriched with domain-specific information from category-specific local data. This enables supervised learning on the server to derive a globally optimized adapter that is subsequently distributed to local clients, without intruding on data privacy. Empirical results show that MPFT significantly improves both in-domain and out-of-domain accuracy over conventional methods, enhancing knowledge preservation and adaptation in FDA. Notably, MPFT achieves convergence within a single communication round, greatly reducing computation and communication costs. To ensure privacy, MPFT applies differential privacy to protect the prototypes. Additionally, we develop a prototype-based feature space hijacking attack to evaluate robustness, confirming that raw data samples remain unrecoverable even after extensive training epochs. The complete implementation of MPFT is available at https://anonymous.4open.science/r/DomainFL/.
   Submitted 10 October, 2024; originally announced October 2024.
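   A prototype-based pipeline of this kind can be outlined in a few lines: clients share only per-class feature means, and the server fits an adapter on the pooled prototypes in one round. The frozen encoder, linear adapter shape, and training loop below are assumptions made for illustration, not the paper's exact design.

   ```python
   import torch
   import torch.nn as nn

   def client_prototypes(encoder, loader, num_classes):
       """Client side: average frozen-encoder features per class -> (C, D).

       Only these prototypes (optionally with DP noise, as the abstract
       notes) leave the client; raw samples never do.
       """
       sums, counts = None, torch.zeros(num_classes)
       with torch.no_grad():
           for x, y in loader:
               feats = encoder(x)                               # (B, D)
               if sums is None:
                   sums = torch.zeros(num_classes, feats.size(1))
               sums.index_add_(0, y, feats)
               counts.index_add_(0, y, torch.ones_like(y, dtype=torch.float))
       return sums / counts.clamp(min=1).unsqueeze(1)

   def server_fit_adapter(prototypes, labels, dim, num_classes, epochs=100):
       """Server side: supervised fit of a linear adapter on pooled prototypes."""
       adapter = nn.Linear(dim, num_classes)
       opt = torch.optim.Adam(adapter.parameters(), lr=1e-3)
       for _ in range(epochs):
           opt.zero_grad()
           loss = nn.functional.cross_entropy(adapter(prototypes), labels)
           loss.backward()
           opt.step()
       return adapter  # broadcast back to clients in a single round
   ```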
8. arXiv:2410.05986 [pdf, other] (eess.AS, cs.SD)
   The USTC-NERCSLIP Systems for the CHiME-8 MMCSG Challenge
   Authors: Ya Jiang, Hongbo Lan, Jun Du, Qing Wang, Shutong Niu
   Abstract: In the two-person conversation scenario with one participant wearing smart glasses, transcribing and displaying the speaker's content in real time is an intriguing application, providing a priori information for subsequent tasks such as translation and comprehension. Meanwhile, multi-modal data captured from the smart glasses is scarce. Therefore, we propose utilizing simulation data with multiple overlap rates and a one-to-one matching training strategy to narrow the gap between real and simulated data during model training. In addition, combining IMU data in the model can help the audio achieve better real-time speech recognition performance.
   Submitted 8 October, 2024; originally announced October 2024.
9. arXiv:2410.05262 [pdf, other] (cs.CL)
   TurtleBench: Evaluating Top Language Models via Real-World Yes/No Puzzles
   Authors: Qingchen Yu, Shichao Song, Ke Fang, Yunfeng Shi, Zifan Zheng, Hanyu Wang, Simin Niu, Zhiyu Li
   Abstract: As the application of Large Language Models (LLMs) expands, the demand for reliable evaluations increases. Existing LLM evaluation benchmarks primarily rely on static datasets, making it challenging to assess model performance in dynamic interactions with users. Moreover, these benchmarks often depend on specific background knowledge, complicating the measurement of a model's logical reasoning capabilities. Other dynamic evaluation methods based on strong models or manual efforts may introduce biases and incur high costs and time demands, hindering large-scale application. To address these issues, we propose TurtleBench. TurtleBench collects real user guesses from our online Turtle Soup Puzzle platform that we developed. This approach allows for the relatively dynamic generation of evaluation datasets, mitigating the risk of model cheating while aligning assessments more closely with genuine user needs for reasoning capabilities, thus enhancing the reliability of evaluations. TurtleBench includes 1,532 user guesses along with the correctness of guesses after annotation. Using this dataset, we thoroughly evaluated nine of the most advanced LLMs available today. Notably, the OpenAI o1 series models did not achieve leading results in these evaluations. We propose several hypotheses for further research, such as "the latent reasoning of o1 utilizes trivial Chain-of-Thought (CoT) techniques" and "increasing CoT length not only provides reasoning benefits but also incurs noise costs."
   Submitted 7 October, 2024; originally announced October 2024.
   Comments: 22 pages
10. arXiv:2409.20434 [pdf, other] (cs.CL)
    QAEncoder: Towards Aligned Representation Learning in Question Answering System
    Authors: Zhengren Wang, Qinhan Yu, Shida Wei, Zhiyu Li, Feiyu Xiong, Xiaoxing Wang, Simin Niu, Hao Liang, Wentao Zhang
    Abstract: Modern QA systems entail retrieval-augmented generation (RAG) for accurate and trustworthy responses. However, the inherent gap between user queries and relevant documents hinders precise matching. Motivated by our conical distribution hypothesis, which posits that potential queries and documents form a cone-like structure in the embedding space, we introduce QAEncoder, a training-free approach to bridge this gap. Specifically, QAEncoder estimates the expectation of potential queries in the embedding space as a robust surrogate for the document embedding, and attaches document fingerprints to effectively distinguish these embeddings. Extensive experiments on fourteen embedding models across six languages and eight datasets validate QAEncoder's alignment capability, which offers a plug-and-play solution that seamlessly integrates with existing RAG architectures and training-based methods.
    Submitted 30 September, 2024; originally announced September 2024.
    Report number: v00
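    The query-expectation idea admits a small sketch: embed LLM-generated pseudo-queries for a document and index a mix of their mean with the document embedding as a Monte-Carlo estimate of the query expectation. The encoder choice, the mixing weight `alpha`, and the omission of the paper's document-fingerprint step are assumptions.

    ```python
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")  # stand-in encoder

    def qa_encode(document: str, pseudo_queries: list[str], alpha: float = 0.5):
        """Index-time embedding: blend the document vector with the mean of
        its pseudo-query vectors, so stored vectors sit closer to the
        queries users will actually issue. `alpha` is an assumed knob.
        """
        doc_vec = model.encode(document, normalize_embeddings=True)
        q_vecs = model.encode(pseudo_queries, normalize_embeddings=True)
        mixed = alpha * doc_vec + (1 - alpha) * q_vecs.mean(axis=0)
        return mixed / np.linalg.norm(mixed)

    # `pseudo_queries` would come from an LLM prompted to ask questions
    # the document answers, generated offline before indexing.
    ```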
11. arXiv:2409.16803 [pdf, other] (eess.AS, cs.SD)
    Incorporating Spatial Cues in Modular Speaker Diarization for Multi-channel Multi-party Meetings
    Authors: Ruoyu Wang, Shutong Niu, Gaobin Yang, Jun Du, Shuangqing Qian, Tian Gao, Jia Pan
    Abstract: Although fully end-to-end speaker diarization systems have made significant progress in recent years, modular systems often achieve superior results in real-world scenarios due to their greater adaptability and robustness. Historically, modular speaker diarization methods have seldom discussed how to leverage spatial cues from multi-channel speech. This paper proposes a three-stage modular system to enhance single-channel neural speaker diarization systems and recognition performance by utilizing spatial cues from multi-channel speech to provide more accurate initialization for each stage of neural speaker diarization (NSD) decoding: (1) Overlap detection and continuous speech separation (CSS) on multi-channel speech are used to obtain cleaner single-speaker speech segments for clustering, followed by the first NSD decoding pass. (2) The results from the first pass initialize a complex Angular Central Gaussian Mixture Model (cACGMM) to estimate speaker-wise masks on multi-channel speech, and through overlap-add and mask-to-VAD conversion, achieve initialization with lower speaker error (SpkErr), followed by the second NSD decoding pass. (3) The second decoding results are used for guided source separation (GSS), recognizing and filtering short segments containing less than one word to obtain cleaner speech segments, followed by re-clustering and the final NSD decoding pass. We present the progressively explored evaluation results from the CHiME-8 NOTSOFAR-1 (Natural Office Talkers in Settings Of Far-field Audio Recordings) challenge, demonstrating the effectiveness of our system and its contribution to improving recognition performance. Our final system achieved first place in the challenge.
    Submitted 25 September, 2024; originally announced September 2024.
    Comments: 5 pages, submitted to ICASSP 2025
The primary difficulty of this challenge is the dataset recorded across various conference rooms, which captures real-world complexities such as high overlap rates, background noise, a variable number of speakers, and natural conversation styles. To address these issues, we optimized the system in several aspects: for front-end speech signal processing, we introduced a data-driven joint training method for diarization and separation (JDS) to enhance audio quality; additionally, we integrated traditional guided source separation (GSS) for the multi-channel track to provide complementary information to the JDS. For back-end speech recognition, we enhanced Whisper with WavLM, ConvNeXt, and Transformer innovations, applying multi-task training and noise-KLD augmentation to significantly advance ASR robustness and accuracy. Our system attained a Time-Constrained minimum Permutation Word Error Rate (tcpWER) of 14.265% and 22.989% on the CHiME-8 NOTSOFAR-1 Dev-set-2 multi-channel and single-channel tracks, respectively.

Submitted 24 October, 2024; v1 submitted 3 September, 2024; originally announced September 2024.

arXiv:2408.12599 (https://arxiv.org/abs/2408.12599) [cs.CL (Computation and Language)]
Controllable Text Generation for Large Language Models: A Survey
Authors: Xun Liang, Hanyu Wang, Yezhaohui Wang, Shichao Song, Jiawei Yang, Simin Niu, Jie Hu, Dan Liu, Shunyu Yao, Feiyu Xiong, Zhiyu Li
Abstract: In Natural Language Processing (NLP), Large Language Models (LLMs) have demonstrated high text generation quality. However, in real-world applications, LLMs must meet increasingly complex requirements. Beyond avoiding misleading or inappropriate content, LLMs are also expected to cater to specific user needs, such as imitating particular writing styles or generating text with poetic richness. These varied demands have driven the development of Controllable Text Generation (CTG) techniques, which ensure that outputs adhere to predefined control conditions, such as safety, sentiment, thematic consistency, and linguistic style, while maintaining high standards of helpfulness, fluency, and diversity. This paper systematically reviews the latest advancements in CTG for LLMs, offering a comprehensive definition of its core concepts and clarifying the requirements for control conditions and text quality. We categorize CTG tasks into two primary types: content control and attribute control. The key methods are discussed, including model retraining, fine-tuning, reinforcement learning, prompt engineering, latent space manipulation, and decoding-time intervention. We analyze each method's characteristics, advantages, and limitations, providing nuanced insights for achieving generation control. Additionally, we review CTG evaluation methods, summarize its applications across domains, and address key challenges in current research, including reduced fluency and practicality. We also propose several appeals, such as placing greater emphasis on real-world applications in future research. This paper aims to offer valuable guidance to researchers and developers in the field. Our reference list and Chinese version are open-sourced at https://github.com/IAAR-Shanghai/CTGSurvey.

Submitted 22 August, 2024; originally announced August 2024.
Comments: 52 pages, 11 figures, 7 tables, 11 equations
ACM Class: A.2; I.2.7
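
Of the method families the survey covers, decoding-time intervention is the easiest to picture in code. The sketch below is a toy logit-biasing sampler, not any specific method from the survey; the vocabulary, bias value, and token sets are invented for illustration.

```python
import numpy as np

def biased_sample(logits, vocab, banned=(), boosted=(), bias=5.0, seed=0):
    """Decoding-time intervention: shift token logits before sampling so
    the output respects a control condition (e.g. safety or style)."""
    rng = np.random.default_rng(seed)
    logits = np.asarray(logits, dtype=float).copy()
    for i, tok in enumerate(vocab):
        if tok in banned:
            logits[i] -= bias            # suppress disallowed tokens
        if tok in boosted:
            logits[i] += bias            # favour on-style tokens
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    return vocab[rng.choice(len(vocab), p=probs)]

vocab = ["great", "terrible", "fine", "awful"]
print(biased_sample([0.2, 0.1, 0.0, 0.3], vocab, banned={"terrible", "awful"}))
```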
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">52 pages, 11 figures, 7 tables, 11 equations</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> A.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05705">arXiv:2408.05705</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.05705">pdf</a>, <a href="https://arxiv.org/format/2408.05705">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TC-KANRecon: High-Quality and Accelerated MRI Reconstruction via Adaptive KAN Mechanisms and Intelligent Feature Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ge%2C+R">Ruiquan Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xiao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yifei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+F">Fan Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shenghao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guanyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yiyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+D">Dong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Changmiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qiegen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shanzhou Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05705v1-abstract-short" style="display: inline;"> Magnetic Resonance Imaging (MRI) has become essential in clinical diagnosis due to its high resolution and multiple contrast mechanisms. However, the relatively long acquisition time limits its broader application. To address this issue, this study presents an innovative conditional guided diffusion model, named as TC-KANRecon, which incorporates the Multi-Free U-KAN (MF-UKAN) module and a dynamic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05705v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05705v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05705v1-abstract-full" style="display: none;"> Magnetic Resonance Imaging (MRI) has become essential in clinical diagnosis due to its high resolution and multiple contrast mechanisms. However, the relatively long acquisition time limits its broader application. 
To address this issue, this study presents an innovative conditional guided diffusion model, named TC-KANRecon, which incorporates the Multi-Free U-KAN (MF-UKAN) module and a dynamic clipping strategy. The TC-KANRecon model aims to accelerate MRI reconstruction through deep learning while maintaining the quality of the reconstructed images. The MF-UKAN module can effectively balance the tradeoff between image denoising and structure preservation. Specifically, it introduces multi-head attention mechanisms and scalar modulation factors, which significantly enhance the model's robustness and structure-preservation capabilities in complex noise environments. Moreover, the dynamic clipping strategy in TC-KANRecon adjusts the cropping interval according to the sampling steps, thereby mitigating the image-detail loss typically caused by traditional cropping methods and enriching the visual features of the images. Furthermore, the MC-Model module incorporates full-sampling k-space information, realizing efficient fusion of conditional information, enhancing the model's ability to process complex data, and improving the realism and detail richness of reconstructed images. Experimental results demonstrate that the proposed method outperforms other MRI reconstruction methods in both qualitative and quantitative evaluations. Notably, TC-KANRecon exhibits excellent reconstruction results when processing high-noise, low-sampling-rate MRI data. Our source code is available at https://github.com/lcbkmm/TC-KANRecon.

Submitted 11 August, 2024; originally announced August 2024.
Comments: 10 pages, 3 figures
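
The dynamic clipping idea, adjusting the clipping interval with the sampling step instead of using one fixed crop, can be sketched as follows. The schedule here (wide early, tight late) is an assumption for illustration; the paper's actual schedule may differ.

```python
import torch

def dynamic_clip(x, step, total_steps, base=1.0, widen=2.0):
    """Clip a denoised sample to an interval that depends on the sampling
    step: wide while the sample is still noisy, tightening toward
    [-base, base] as sampling finishes (schedule is illustrative)."""
    progress = step / max(total_steps - 1, 1)     # 0 at start, 1 at end
    bound = base + widen * (1.0 - progress)
    return x.clamp(-bound, bound)

x = torch.randn(2, 1, 8, 8) * 3
for t in (0, 500, 999):
    print(t, dynamic_clip(x, t, 1000).abs().max().item())
```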
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00512">arXiv:2408.00512</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00512">pdf</a>, <a href="https://arxiv.org/format/2408.00512">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3678884.3681875">10.1145/3678884.3681875 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FlowGPT: Exploring Domains, Output Modalities, and Goals of Community-Generated AI Chatbots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yuanning Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Di Liu</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+P">Pengcheng An</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuo Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00512v1-abstract-short" style="display: inline;"> The advent of Generative AI and Large Language Models has not only enhanced the intelligence of interactive applications but also catalyzed the formation of communities passionate about customizing these AI capabilities. FlowGPT, an emerging platform for sharing AI prompts and use cases, exemplifies this trend, attracting many creators who develop and share chatbots with a broader community. Despi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00512v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00512v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00512v1-abstract-full" style="display: none;"> The advent of Generative AI and Large Language Models has not only enhanced the intelligence of interactive applications but also catalyzed the formation of communities passionate about customizing these AI capabilities. FlowGPT, an emerging platform for sharing AI prompts and use cases, exemplifies this trend, attracting many creators who develop and share chatbots with a broader community. Despite its growing popularity, there remains a significant gap in understanding the types and purposes of the AI tools created and shared by community members. In this study, we delve into FlowGPT and present our preliminary findings on the domain, output modality, and goals of chatbots. We aim to highlight common types of AI applications and identify future directions for research in AI-sharing communities. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00512v1-abstract-full').style.display = 'none'; document.getElementById('2408.00512v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at CSCW Companion &#39;24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00668">arXiv:2407.00668</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00668">pdf</a>, <a href="https://arxiv.org/format/2407.00668">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HRDE: Retrieval-Augmented Large Language Models for Chinese Health Rumor Detection and Explainability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanfang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Ding Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Shichao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Simin Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hanyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zeyun Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+F">Feiyu Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiyu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00668v2-abstract-short" style="display: inline;"> As people increasingly prioritize their health, the speed and breadth of health information dissemination on the internet have also grown. At the same time, the presence of false health information (health rumors) intermingled with genuine content poses a significant potential threat to public health. However, current research on Chinese health rumors still lacks a large-scale, public, and open-so&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00668v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00668v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00668v2-abstract-full" style="display: none;"> As people increasingly prioritize their health, the speed and breadth of health information dissemination on the internet have also grown. At the same time, the presence of false health information (health rumors) intermingled with genuine content poses a significant potential threat to public health. However, current research on Chinese health rumors still lacks a large-scale, public, and open-source dataset of health rumor information, as well as effective and reliable rumor detection methods. 

arXiv:2406.19528 (https://arxiv.org/abs/2406.19528) [cs.HC (Human-Computer Interaction), cs.AI (Artificial Intelligence), cs.CY (Computers and Society)] DOI: 10.1145/3678884.3681850
Harnessing LLMs for Automated Video Content Analysis: An Exploratory Workflow of Short Videos on Depression
Authors: Jiaying Lizzy Liu, Yunlong Wang, Yao Lyu, Yiheng Su, Shuo Niu, Xuhai Orson Xu, Yan Zhang
Abstract: Despite the growing interest in leveraging Large Language Models (LLMs) for content analysis, current studies
have primarily focused on text-based content. In the present work, we explored the potential of LLMs in assisting video content analysis by conducting a case study that followed a new workflow of LLM-assisted multimodal content analysis. The workflow encompasses codebook design, prompt engineering, LLM processing, and human evaluation. We strategically crafted annotation prompts to obtain LLM annotations in structured form and explanation prompts to generate LLM explanations for a better understanding of LLM reasoning and transparency. To test the LLM's video annotation capabilities, we analyzed 203 keyframes extracted from 25 YouTube short videos about depression. We compared the LLM annotations with those of two human coders and found that the LLM achieved higher accuracy in object and activity annotations than in emotion and genre annotations. Moreover, we identified the potential and limitations of the LLM's capabilities in annotating videos. Based on these findings, we explore opportunities and challenges for future research and improvements to the workflow. We also discuss ethical concerns surrounding future studies based on LLM-assisted video analysis.

Submitted 29 July, 2024; v1 submitted 27 June, 2024; originally announced June 2024.
Comments: 7 pages, 2 figures, accepted by CSCW 24
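
The codebook-to-prompt step of such a workflow might look like the sketch below. The codebook fields and wording are invented stand-ins, not the authors' instrument.

```python
CODEBOOK = {  # invented fields, standing in for the study's codebook
    "objects": "salient objects visible in the frame",
    "activity": "main activity depicted",
    "emotion": "dominant emotion conveyed",
    "genre": "apparent video genre",
}

def annotation_prompt(frame_description):
    """Build a structured annotation prompt for one keyframe, asking for
    a JSON object keyed by the codebook fields."""
    fields = "\n".join("- " + k + ": " + v for k, v in CODEBOOK.items())
    return (
        "Annotate this keyframe using the fields below. "
        "Reply with a JSON object keyed by field name.\n"
        + fields
        + "\nKeyframe: " + frame_description
    )

print(annotation_prompt("a person sitting alone by a window at dusk"))
```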
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures, accepted by CSCW 24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09684">arXiv:2406.09684</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.09684">pdf</a>, <a href="https://arxiv.org/format/2406.09684">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Explainable AI for Comparative Analysis of Intrusion Detection Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Corea%2C+P+M">Pap M. Corea</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yongxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuteng Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+H">Houbing Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09684v2-abstract-short" style="display: inline;"> Explainable Artificial Intelligence (XAI) has become a widely discussed topic, the related technologies facilitate better understanding of conventional black-box models like Random Forest, Neural Networks and etc. However, domain-specific applications of XAI are still insufficient. To fill this gap, this research analyzes various machine learning models to the tasks of binary and multi-class class&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09684v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09684v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09684v2-abstract-full" style="display: none;"> Explainable Artificial Intelligence (XAI) has become a widely discussed topic, the related technologies facilitate better understanding of conventional black-box models like Random Forest, Neural Networks and etc. However, domain-specific applications of XAI are still insufficient. To fill this gap, this research analyzes various machine learning models to the tasks of binary and multi-class classification for intrusion detection from network traffic on the same dataset using occlusion sensitivity. The models evaluated include Linear Regression, Logistic Regression, Linear Support Vector Machine (SVM), K-Nearest Neighbors (KNN), Random Forest, Decision Trees, and Multi-Layer Perceptrons (MLP). We trained all models to the accuracy of 90\% on the UNSW-NB15 Dataset. We found that most classifiers leverage only less than three critical features to achieve such accuracies, indicating that effective feature engineering could actually be far more important for intrusion detection than applying complicated models. 
We also discovered that Random Forest provides the best performance in terms of accuracy, time efficiency, and robustness. Data and code are available at https://github.com/pcwhy/XML-IntrusionDetection.git.

Submitted 3 July, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: Submitted to IEEE MeditCom 2024 - WS-05
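
Occlusion sensitivity on tabular network-traffic features reduces to a simple loop: occlude one feature at a time and measure the accuracy drop. A minimal sketch follows, assuming an sklearn-style classifier; details such as the occlusion value (the column mean here) are illustrative choices, not necessarily the paper's.

```python
import numpy as np

def occlusion_sensitivity(model, X, y):
    """Per-feature importance via occlusion: replace one feature at a
    time with its column mean and measure the accuracy drop."""
    base_acc = (model.predict(X) == y).mean()
    col_means = X.mean(axis=0)
    drops = []
    for j in range(X.shape[1]):
        X_occ = X.copy()
        X_occ[:, j] = col_means[j]         # occlude feature j
        drops.append(base_acc - (model.predict(X_occ) == y).mean())
    return np.array(drops)                 # larger drop = more critical
```

Any classifier exposing `predict` (e.g. a RandomForestClassifier trained on UNSW-NB15 features) can be passed in; features whose occlusion barely moves accuracy are the ones the model effectively ignores.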

arXiv:2405.19682 (https://arxiv.org/abs/2405.19682) [cs.CV (Computer Vision and Pattern Recognition)]
Fully Test-Time Adaptation for Monocular 3D Object Detection
Authors: Hongbin Lin, Yifan Zhang, Shuaicheng Niu, Shuguang Cui, Zhen Li
Abstract: Monocular 3D object detection (Mono 3Det) aims to identify 3D objects from a single RGB image. However, existing methods often assume that training and test data follow the same distribution, which may not hold in real-world test scenarios. To address the out-of-distribution (OOD) problem, we explore a new adaptation paradigm for Mono 3Det, termed Fully Test-time Adaptation. It aims to adapt a well-trained model to unlabeled test data by handling potential data distribution shifts at test time, without access to training data or test labels. However, applying this paradigm to Mono 3Det poses significant challenges: OOD test data cause a remarkable decline in object detection scores, which conflicts with the predefined score thresholds of existing detection methods and leads to severe object omissions (i.e., rare positive detections and many false negatives). Consequently, the limited positive detections and plentiful noisy predictions cause test-time adaptation to fail in Mono 3Det. To handle this problem, we propose a novel Monocular Test-Time Adaptation (MonoTTA) method based on two new strategies. 1) Reliability-driven adaptation: we empirically find that high-score objects are still reliable and that optimizing high-score objects can enhance confidence across all detections; we therefore devise a self-adaptive strategy to identify reliable objects for model adaptation, which discovers potential objects and alleviates omissions. 2) Noise-guard adaptation: since high-score objects may be scarce, we develop a negative regularization term to exploit the numerous low-score objects via negative learning, preventing overfitting to noise and trivial solutions. Experimental results show that MonoTTA brings significant performance gains for Mono 3Det models in OOD test scenarios: approximately 190% gains on average on KITTI and 198% on nuScenes.

Submitted 30 May, 2024; originally announced May 2024.
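
The two strategies can be compressed into an illustrative test-time objective: entropy minimization on high-score (reliable) detections plus a negative-learning term on the plentiful low-score ones. The sketch below is in that spirit only; the thresholds and exact loss form are assumptions, not the paper's implementation.

```python
import torch
import torch.nn.functional as F

def monotta_style_loss(logits, scores, high=0.5, low=0.2):
    """Illustrative two-part objective: entropy minimization on reliable
    (high-score) detections plus negative learning on low-score ones.
    Thresholds are placeholders, not the paper's values."""
    probs = F.softmax(logits, dim=-1)
    log_probs = probs.clamp_min(1e-8).log()
    entropy = -(probs * log_probs).sum(dim=-1)

    loss = logits.new_zeros(())
    reliable = scores >= high
    if reliable.any():                  # sharpen trusted detections
        loss = loss + entropy[reliable].mean()
    noisy = scores <= low
    if noisy.any():                     # "probably not the top class"
        p_top = probs[noisy].max(dim=-1).values
        loss = loss - (1.0 - p_top).clamp_min(1e-8).log().mean()
    return loss

logits = torch.randn(6, 3, requires_grad=True)
scores = torch.tensor([0.9, 0.8, 0.1, 0.05, 0.3, 0.6])
monotta_style_loss(logits, scores).backward()
print(logits.grad.shape)
```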

arXiv:2405.16933 (https://arxiv.org/abs/2405.16933) [cs.CL (Computation and Language), cs.IR (Information Retrieval)]
Empowering Large Language Models to Set up a Knowledge Retrieval Indexer via Self-Learning
Authors: Xun Liang, Simin Niu, Zhiyu Li, Sensen Zhang, Shichao Song, Hanyu Wang, Jiawei Yang, Feiyu Xiong, Bo Tang, Chenyang Xi
Abstract: Retrieval-Augmented Generation (RAG) offers a cost-effective approach to injecting real-time knowledge into large language models (LLMs). Nevertheless, constructing and validating high-quality knowledge repositories require considerable effort. We propose a pre-retrieval framework named Pseudo-Graph Retrieval-Augmented Generation (PG-RAG), which conceptualizes LLMs as students, providing them with abundant raw reading materials and encouraging them to engage in autonomous reading so as to record factual information in their own words. The resulting concise, well-organized mental indices are interconnected through common topics or complementary facts to form a pseudo-graph database. During the retrieval phase, PG-RAG mimics the human behavior of flipping through notes, identifying fact paths and subsequently exploring the related contexts. Adhering to the principle that the path taken by many is the best, it integrates highly corroborated fact paths to provide a structured and refined sub-graph to assist LLMs. We validated PG-RAG on three specialized question-answering datasets. In single-document tasks, PG-RAG significantly outperformed the current best baseline, KGP-LLaMA, across all key evaluation metrics, with an average overall performance improvement of 11.6%. Specifically, its BLEU score increased by approximately 14.3% and the QE-F1 metric improved by 23.7%. In multi-document scenarios, the average metrics of PG-RAG were at least 2.35% higher than the best baseline. Notably, the BLEU score and QE-F1 metric showed stable improvements of around 7.55% and 12.75%, respectively. Our code: https://github.com/IAAR-Shanghai/PGRAG.

Submitted 27 May, 2024; originally announced May 2024.
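
The "path taken by many" idea can be mimicked with a toy pseudo-graph in which an edge's corroboration is the number of notes asserting it, and a fact path is scored by its weakest edge. Everything below (the notes and the scoring rule) is an invented illustration, not the PG-RAG code.

```python
from collections import Counter
from itertools import combinations

# Toy "pseudo-graph": each note links the topics it mentions; an edge's
# corroboration is how many notes assert it.
notes = [
    ["aspirin", "blood thinning", "stroke prevention"],
    ["aspirin", "blood thinning"],
    ["aspirin", "stomach irritation"],
]
edge_support = Counter(
    frozenset(pair) for note in notes for pair in combinations(note, 2)
)

def path_score(path):
    """Score a fact path by its weakest (least corroborated) edge."""
    return min(edge_support[frozenset(edge)] for edge in zip(path, path[1:]))

print(path_score(["aspirin", "blood thinning", "stroke prevention"]))  # 1
print(path_score(["aspirin", "blood thinning"]))                       # 2
```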

arXiv:2405.00711 (https://arxiv.org/abs/2405.00711) [cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.CY (Computers and Society)]
Fake Artificial Intelligence Generated Contents (FAIGC): A Survey of Theories, Detection Methods, and Opportunities
Authors: Xiaomin Yu, Yezhaohui Wang, Yanfang Chen, Zhen Tao, Dinghao Xi, Shichao Song, Simin Niu, Zhiyu Li
Abstract: In recent years, generative artificial intelligence models, represented by Large Language Models (LLMs) and Diffusion Models (DMs), have revolutionized content production methods. Such artificial intelligence-generated content (AIGC) has become deeply embedded in various aspects of daily life and work.
However, these technologies have also led to the emergence of Fake Artificial Intelligence Generated Content (FAIGC), posing new challenges in distinguishing genuine information. It is crucial to recognize that AIGC technology is akin to a double-edged sword; its potent generative capabilities, while beneficial, also pose risks for the creation and dissemination of FAIGC. In this survey, we propose a new taxonomy that provides a more comprehensive breakdown of the space of FAIGC methods today. Next, we explore the modalities and generative technologies of FAIGC. We introduce FAIGC detection methods and summarize related benchmarks from various perspectives. Finally, we discuss outstanding challenges and promising areas for future research.

Submitted 3 May, 2024; v1 submitted 25 April, 2024; originally announced May 2024.

arXiv:2404.01650 (https://arxiv.org/abs/2404.01650) [cs.LG (Machine Learning)]
Test-Time Model Adaptation with Only Forward Passes
Authors: Shuaicheng Niu, Chunyan Miao, Guohao Chen, Pengcheng Wu, Peilin Zhao
Abstract: Test-time adaptation has proven effective in adapting a given trained model to unseen test samples with potential distribution shifts.
However, in real-world scenarios, models are usually deployed on resource-limited devices, e.g., FPGAs, and are often quantized and hard-coded with non-modifiable parameters for acceleration. In light of this, existing methods are often infeasible, since they heavily depend on computation-intensive backpropagation for model updating, which may not be supported. To address this, we propose a test-time Forward-Optimization Adaptation (FOA) method. In FOA, we seek to solely learn a newly added prompt (as the model's input) via a derivative-free covariance matrix adaptation evolution strategy. To make this strategy work stably under our online unsupervised setting, we devise a novel fitness function that measures the test-training statistic discrepancy and the model's prediction entropy. Moreover, we design an activation shifting scheme that directly tunes the model activations for shifted test samples, aligning them with the source training domain and thereby further enhancing adaptation performance. Without using any backpropagation or altering the model weights, FOA running on a quantized 8-bit ViT outperforms gradient-based TENT on a full-precision 32-bit ViT, while achieving an up to 24-fold memory reduction on ImageNet-C.

Submitted 29 May, 2024; v1 submitted 2 April, 2024; originally announced April 2024.
Comments: 18 pages, 4 figures, 17 tables, accepted by International Conference on Machine Learning
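
A derivative-free adaptation loop of this flavor can be sketched with a simple (1+lambda) evolution strategy standing in for CMA-ES: the fitness combines a test-versus-source feature-statistic discrepancy with prediction entropy, and only the prompt vector is searched. `forward` is a placeholder for the frozen model; all details here are assumptions, not FOA's implementation.

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def fitness(forward, x, prompt, source_mean):
    """Lower is better: source/test feature-statistic gap + entropy."""
    feats, logits = forward(x, prompt)
    p = softmax(logits)
    ent = -(p * np.log(p + 1e-12)).sum(axis=-1).mean()
    return np.abs(feats.mean(axis=0) - source_mean).mean() + ent

def adapt_prompt(forward, x, source_mean, dim, iters=50, pop=16, sigma=0.1):
    """(1+lambda) evolution strategy over a prompt vector (no gradients)."""
    rng = np.random.default_rng(0)
    best = np.zeros(dim)
    best_fit = fitness(forward, x, best, source_mean)
    for _ in range(iters):
        for cand in best + sigma * rng.standard_normal((pop, dim)):
            f = fitness(forward, x, cand, source_mean)
            if f < best_fit:
                best, best_fit = cand, f
    return best

# Toy demo: the "model" adds the prompt to shifted features.
rng = np.random.default_rng(1)
W = rng.standard_normal((4, 3))
x = rng.standard_normal((32, 4)) + 1.0                   # distribution shift
forward = lambda x, p: (x + p, (x + p) @ W)
print(adapt_prompt(forward, x, source_mean=np.zeros(4), dim=4).round(2))
```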
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 4 figures, 17 tables, accepted by International Conference on Machine Learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11491">arXiv:2403.11491</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11491">pdf</a>, <a href="https://arxiv.org/format/2403.11491">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Calibrated Test-Time Model Adaptation without Forgetting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+M">Mingkui Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guohao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jiaxiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yaofo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuaicheng Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11491v1-abstract-short" style="display: inline;"> Test-time adaptation (TTA) seeks to tackle potential distribution shifts between training and test data by adapting a given model w.r.t. any test sample. Although recent TTA has shown promising performance, we still face two key challenges: 1) prior methods perform backpropagation for each test sample, resulting in unbearable optimization costs to many applications; 2) while existing TTA can signi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11491v1-abstract-full').style.display = 'inline'; document.getElementById('2403.11491v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11491v1-abstract-full" style="display: none;"> Test-time adaptation (TTA) seeks to tackle potential distribution shifts between training and test data by adapting a given model w.r.t. any test sample. Although recent TTA has shown promising performance, we still face two key challenges: 1) prior methods perform backpropagation for each test sample, resulting in unbearable optimization costs to many applications; 2) while existing TTA can significantly improve the test performance on out-of-distribution data, they often suffer from severe performance degradation on in-distribution data after TTA (known as forgetting). To this end, we have proposed an Efficient Anti-Forgetting Test-Time Adaptation (EATA) method which develops an active sample selection criterion to identify reliable and non-redundant samples for test-time entropy minimization. To alleviate forgetting, EATA introduces a Fisher regularizer estimated from test samples to constrain important model parameters from drastic changes. However, in EATA, the adopted entropy loss consistently assigns higher confidence to predictions even for samples that are underlying uncertain, leading to overconfident predictions. 
To tackle this, we further propose EATA with Calibration (EATA-C), which separately exploits the reducible model uncertainty and the inherent data uncertainty for calibrated TTA. Specifically, we measure the model uncertainty by the divergence between predictions from the full network and its sub-networks, on which we propose a divergence loss to encourage consistent predictions instead of overconfident ones. To further recalibrate prediction confidence, we utilize the disagreement among predicted labels as an indicator of data uncertainty, and then devise a min-max entropy regularizer to selectively increase or decrease prediction confidence for different samples. Experiments on image classification and semantic segmentation verify the effectiveness of our methods.

Submitted 18 March, 2024; originally announced March 2024.
Comments: 20 pages, 14 tables, 11 figures. arXiv admin note: substantial text overlap with arXiv:2204.02610
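
The two calibration ingredients, a divergence loss between full-network and sub-network predictions and a min-max entropy regularizer keyed to label disagreement, can be sketched as below. This is an illustrative reading of the abstract; the exact divergence and weighting in the paper may differ.

```python
import torch
import torch.nn.functional as F

def eata_c_style_terms(p_full, p_sub):
    """Illustrative calibration terms: p_full / p_sub are softmax outputs
    of the full network and a sub-network on the same batch, shape (N, C)."""
    log_full = p_full.clamp_min(1e-8).log()
    log_sub = p_sub.clamp_min(1e-8).log()
    # Divergence loss: symmetric KL pushes the two heads to agree,
    # discouraging overconfident, inconsistent predictions.
    div = 0.5 * (F.kl_div(log_sub, p_full, reduction="batchmean")
                 + F.kl_div(log_full, p_sub, reduction="batchmean"))
    # Min-max entropy: sharpen samples whose predicted labels agree
    # (model-uncertain only), flatten those that disagree (data-uncertain).
    entropy = -(p_full * log_full).sum(dim=-1)
    agree = p_full.argmax(dim=-1) == p_sub.argmax(dim=-1)
    minmax = (entropy[agree].sum() - entropy[~agree].sum()) / len(entropy)
    return div, minmax

p_full = F.softmax(torch.randn(8, 10), dim=-1)
p_sub = F.softmax(torch.randn(8, 10), dim=-1)
print(eata_c_style_terms(p_full, p_sub))
```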

arXiv:2403.10206 (https://arxiv.org/abs/2403.10206) [astro-ph.IM (Instrumentation and Methods for Astrophysics), astro-ph.SR (Solar and Stellar Astrophysics), cs.CV (Computer Vision and Pattern Recognition), physics.ins-det (Instrumentation and Detectors), physics.optics (Optics)]
A Data-Driven Approach for Mitigating Dark Current Noise and Bad Pixels in Complementary Metal Oxide Semiconductor Cameras for Space-based Telescopes
Authors: Peng Jia, Chao Lv, Yushan Li, Yongyang Sun, Shu Niu, Zhuoxiao Wang
Abstract: In recent years, there has been a gradual increase in the performance of Complementary Metal Oxide Semiconductor (CMOS) cameras. These cameras have gained popularity as a viable alternative to charge-coupled device (CCD) cameras in a wide range of applications. One particular application is the CMOS camera installed in small space telescopes. However, the limited power and spatial resources available on satellites present challenges in maintaining ideal observation conditions, including temperature and radiation environment. Consequently, images captured by CMOS cameras are susceptible to issues such as dark current noise and defective pixels. In this paper, we introduce a data-driven framework for mitigating dark current noise and bad pixels in CMOS cameras. Our approach involves two key steps: pixel clustering and function fitting. During the pixel clustering step, we identify and group pixels exhibiting similar dark current noise properties. Subsequently, in the function fitting step, we formulate functions that capture the relationship between dark current and temperature, as dictated by the Arrhenius law. Our framework leverages ground-based test data to establish distinct temperature-dark current relations for pixels within different clusters. The clustering results can then be utilized to estimate the dark current noise level and to detect bad pixels in real observational data. To assess the effectiveness of our approach, we conducted tests using real observation data obtained from the Yangwang-1 satellite, equipped with a near-ultraviolet telescope and an optical telescope. The results show a considerable improvement in the detection efficiency of space-based telescopes.

Submitted 15 March, 2024; originally announced March 2024.
Comments: Accepted by the AJ; comments are welcome. The complete code can be downloaded from DOI: 10.12149/101387
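
The function-fitting step follows the Arrhenius law, D(T) = D0 * exp(-Ea / (kB * T)), which can be fit per pixel cluster in a few lines with scipy. The temperatures and dark-current values below are made-up stand-ins for ground-test data, not numbers from the paper.

```python
import numpy as np
from scipy.optimize import curve_fit

K_B = 8.617333262e-5                                  # Boltzmann constant, eV/K

def arrhenius(T, d0, e_a):
    """Arrhenius-law dark current: D(T) = D0 * exp(-Ea / (kB * T))."""
    return d0 * np.exp(-e_a / (K_B * T))

# Made-up ground-test data for one pixel cluster (illustrative only).
T = np.array([253.0, 263.0, 273.0, 283.0, 293.0])     # sensor temperature, K
dark = np.array([0.8, 2.1, 5.6, 13.9, 32.5])          # dark current, e-/pixel/s

(d0, e_a), _ = curve_fit(arrhenius, T, dark, p0=(1e12, 0.6))
print(f"Ea ~ {e_a:.2f} eV; predicted dark current at 300 K: "
      f"{arrhenius(300.0, d0, e_a):.1f} e-/s")
```

Pixels whose measured dark current deviates strongly from their cluster's fitted curve are natural bad-pixel candidates, which matches the detection use of the fits described above.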

arXiv:2403.06039 (https://arxiv.org/abs/2403.06039) [cs.HC (Human-Computer Interaction), cs.AI (Artificial Intelligence)] DOI: 10.1145/3613905.3651057
A Preliminary Exploration of YouTubers' Use of Generative-AI in Content Creation
Authors: Yao Lyu, He Zhang, Shuo Niu, Jie Cai
Abstract: Content creators increasingly utilize generative artificial intelligence (Gen-AI) on platforms such as YouTube, TikTok, Instagram, and various blogging sites to produce imaginative images, AI-generated videos, and articles using Large Language Models (LLMs). Despite its growing popularity, there remains an underexplored area concerning the specific domains where AI-generated content is being applied and the methodologies content creators employ with Gen-AI tools during the creation process. This study initially explores this emerging area through a qualitative analysis of 68 YouTube videos demonstrating Gen-AI usage. Our research focuses on identifying the content domains, the variety of tools used, the activities performed, and the nature of the final products generated by Gen-AI in the context of user-generated content.
arXiv:2402.17316 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Towards Robust and Efficient Cloud-Edge Elastic Model Adaptation via Selective Entropy Distillation
Authors: Yaofo Chen, Shuaicheng Niu, Yaowei Wang, Shoukai Xu, Hengjie Song, Mingkui Tan
Abstract: The conventional deep learning paradigm often involves training a deep model on a server and then deploying the model or its distilled ones to resource-limited edge devices. Usually, the models shall remain fixed once deployed (at least for some period) due to the potential high cost of model adaptation for both the server and edge sides. However, in many real-world scenarios, the test environments may change dynamically (known as distribution shifts), which often results in degraded performance. Thus, one has to adapt the edge models promptly to attain promising performance. Moreover, with the increasing data collected at the edge, this paradigm also fails to further adapt the cloud model for better performance. To address these, we encounter two primary challenges: 1) the edge model has limited computation power and may only support forward propagation; 2) the data transmission budget between cloud and edge devices is limited in latency-sensitive scenarios. In this paper, we establish a Cloud-Edge Elastic Model Adaptation (CEMA) paradigm in which the edge models only need to perform forward propagation and the edge models can be adapted online. In our CEMA, to reduce the communication burden, we devise two criteria to exclude unnecessary samples from uploading to the cloud, i.e., dynamic unreliable and low-informative sample exclusion. Based on the uploaded samples, we update and distribute the affine parameters of normalization layers by distilling from the stronger foundation model to the edge model with a sample replay strategy. Extensive experimental results on ImageNet-C and ImageNet-R verify the effectiveness of our CEMA.
Submitted 6 June, 2024; v1 submitted 27 February, 2024; originally announced February 2024.
Comments: Published in ICLR 2024
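The two exclusion criteria lend themselves to a small sketch. This is a hedged illustration of the kind of entropy-based sample filtering the abstract describes; the function name and both thresholds are assumptions, not the paper's exact criteria.

```python
import torch
import torch.nn.functional as F

def should_upload(logits, e_max, low_info_frac=0.2):
    """Decide which test samples are worth uploading from edge to cloud.

    Excludes (1) unreliable samples whose prediction entropy exceeds e_max,
    and (2) low-informative samples whose entropy is very small, since they
    contribute little to adaptation. Both thresholds are illustrative.
    logits: (batch, n_classes) edge-model outputs from forward propagation.
    """
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-12)).sum(dim=-1)
    return (entropy < e_max) & (entropy > low_info_frac * e_max)
```

Only the samples passing this filter would consume the limited cloud-edge transmission budget; the cloud then returns updated normalization-layer affine parameters rather than full weights.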
arXiv:2401.17043 [pdf, other]
Categories: cs.CL (Computation and Language)
Title: CRUD-RAG: A Comprehensive Chinese Benchmark for Retrieval-Augmented Generation of Large Language Models
Authors: Yuanjie Lyu, Zhiyu Li, Simin Niu, Feiyu Xiong, Bo Tang, Wenjin Wang, Hao Wu, Huanyong Liu, Tong Xu, Enhong Chen
Abstract: Retrieval-Augmented Generation (RAG) is a technique that enhances the capabilities of large language models (LLMs) by incorporating external knowledge sources. This method addresses common LLM limitations, including outdated information and the tendency to produce inaccurate "hallucinated" content. However, the evaluation of RAG systems is challenging, as existing benchmarks are limited in scope and diversity. Most of the current benchmarks predominantly assess question-answering applications, overlooking the broader spectrum of situations where RAG could prove advantageous. Moreover, they only evaluate the performance of the LLM component of the RAG pipeline in the experiments, and neglect the influence of the retrieval component and the external knowledge database. To address these issues, this paper constructs a large-scale and more comprehensive benchmark, and evaluates all the components of RAG systems in various RAG application scenarios. Specifically, we have categorized the range of RAG applications into four distinct types: Create, Read, Update, and Delete (CRUD), each representing a unique use case. "Create" refers to scenarios requiring the generation of original, varied content. "Read" involves responding to intricate questions in knowledge-intensive situations. "Update" focuses on revising and rectifying inaccuracies or inconsistencies in pre-existing texts. "Delete" pertains to the task of summarizing extensive texts into more concise forms. For each of these CRUD categories, we have developed comprehensive datasets to evaluate the performance of RAG systems. We also analyze the effects of various components of the RAG system, such as the retriever, the context length, the knowledge base construction, and the LLM. Finally, we provide useful insights for optimizing the RAG technology for different scenarios.
Submitted 15 July, 2024; v1 submitted 30 January, 2024; originally announced January 2024.
Comments: 40 Pages
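Evaluating "all the components" of a RAG system implies a harness in which the retriever, the knowledge base, and the LLM are each swappable. A minimal sketch under that assumption follows; the names and interfaces are hypothetical, not the benchmark's API.

```python
from dataclasses import dataclass
from typing import Callable, List, Tuple

@dataclass
class RAGPipeline:
    # Each component is a plain callable so any retriever / LLM can be plugged in
    retrieve: Callable[[str, int], List[str]]   # (query, top_k) -> passages
    generate: Callable[[str, List[str]], str]   # (query, passages) -> answer

def evaluate(pipeline: RAGPipeline,
             dataset: List[Tuple[str, str]],
             metric: Callable[[str, str], float],
             top_k: int = 5) -> float:
    """Score a pipeline on (query, reference) pairs with a task-specific metric.
    Varying one component at a time isolates its influence on the end score."""
    scores = []
    for query, reference in dataset:
        passages = pipeline.retrieve(query, top_k)
        answer = pipeline.generate(query, passages)
        scores.append(metric(answer, reference))
    return sum(scores) / len(scores)
```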
arXiv:2401.11671 [pdf, other]
Categories: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: RTA-Former: Reverse Transformer Attention for Polyp Segmentation
Authors: Zhikai Li, Murong Yi, Ali Uneri, Sihan Niu, Craig Jones
Abstract: Polyp segmentation is a key aspect of colorectal cancer prevention, enabling early detection and guiding subsequent treatments. Intelligent diagnostic tools, including deep learning solutions, are widely explored to streamline and potentially automate this process. However, even with many powerful network architectures, producing accurate edge segmentation remains a challenge. In this paper, we introduce a novel network, RTA-Former, that employs a transformer model as the encoder backbone and innovatively adapts Reverse Attention (RA) with a transformer stage in the decoder for enhanced edge segmentation. The results of the experiments illustrate that RTA-Former achieves state-of-the-art (SOTA) performance on five polyp segmentation datasets. The strong capability of RTA-Former holds promise in improving the accuracy of Transformer-based polyp segmentation, potentially leading to better clinical decisions and patient outcomes. Our code is publicly available on GitHub.
Submitted 28 April, 2024; v1 submitted 21 January, 2024; originally announced January 2024.
Comments: The paper has been accepted by EMBC 2024
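Reverse Attention is simple to state in code. A minimal sketch, assuming the common formulation in which a coarse mask prediction is inverted to re-weight decoder features; the paper's transformer-stage variant may differ in detail.

```python
import torch

def reverse_attention(features, coarse_logits):
    """features: (B, C, H, W) decoder features; coarse_logits: (B, 1, H, W).

    Weights features by the *complement* of the sigmoid mask, steering the
    decoder toward regions the coarse prediction missed, which are typically
    object boundaries. This is the generic RA mechanism, not the paper's code."""
    attn = 1.0 - torch.sigmoid(coarse_logits)  # high where the mask is absent
    return features * attn                     # broadcasts over channels
```

Stacking several such stages lets each refine the residual errors of the previous prediction, which is why RA is associated with sharper edges.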
arXiv:2401.11669 [pdf]
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.NE (Neural and Evolutionary Computing)
Title: An Improved Grey Wolf Optimization Algorithm for Heart Disease Prediction
Authors: Sihan Niu, Yifan Zhou, Zhikai Li, Shuyao Huang, Yujun Zhou
Abstract: This paper presents a unique solution to challenges in medical image processing by incorporating an adaptive curve grey wolf optimization (ACGWO) algorithm into neural network backpropagation. Neural networks show potential in medical data but suffer from issues like overfitting and lack of interpretability due to imbalanced and scarce data. Traditional Grey Wolf Optimization (GWO) also has its drawbacks, such as a lack of population diversity and premature convergence. This paper addresses these problems by introducing an adaptive algorithm, enhancing the standard GWO with a sigmoid function. This algorithm was extensively compared to four leading algorithms on six well-known test functions, outperforming them effectively. Moreover, by utilizing the ACGWO, we increase the robustness and generalization of the neural network, resulting in more interpretable predictions. Applied to the publicly accessible Cleveland Heart Disease dataset, our technique surpasses ten other methods, achieving 86.8% accuracy, indicating its potential for efficient heart disease prediction in the clinical setting.
Submitted 21 January, 2024; originally announced January 2024.
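A minimal sketch of one GWO iteration with a sigmoid-shaped schedule for the exploration coefficient `a`, which standard GWO decays linearly from 2 to 0. The specific "adaptive curve" below is an assumption, not the paper's exact formula, and fitness is assumed to be minimized.

```python
import numpy as np

def gwo_step(wolves, fitness, t, T, k=10.0):
    """One iteration of grey wolf optimization.

    wolves: (n, d) candidate solutions; fitness: (n,) values to minimize;
    t, T: current and total iteration counts. The three best wolves
    (alpha, beta, delta) guide everyone else's update."""
    order = np.argsort(fitness)
    alpha, beta, delta = wolves[order[:3]]
    # Sigmoid-shaped decay from ~2 to ~0 (illustrative adaptive curve)
    a = 2.0 / (1.0 + np.exp(k * (t / T - 0.5)))
    new = np.empty_like(wolves)
    for i, x in enumerate(wolves):
        candidates = []
        for leader in (alpha, beta, delta):
            r1, r2 = np.random.rand(2)
            A, C = 2 * a * r1 - a, 2 * r2
            candidates.append(leader - A * np.abs(C * leader - x))
        new[i] = np.mean(candidates, axis=0)  # move toward the three leaders
    return new
```

Early iterations keep `a` near 2 (wide exploration); the sigmoid then switches comparatively quickly to exploitation, which is one way to counter premature convergence.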
arXiv:2311.15296 [pdf, other]
Categories: cs.CL (Computation and Language)
doi: 10.18653/v1/2024.acl-long.288
Title: UHGEval: Benchmarking the Hallucination of Chinese Large Language Models via Unconstrained Generation
Authors: Xun Liang, Shichao Song, Simin Niu, Zhiyu Li, Feiyu Xiong, Bo Tang, Yezhaohui Wang, Dawei He, Peng Cheng, Zhonghao Wang, Haiying Deng
Abstract: Large language models (LLMs) have emerged as pivotal contributors in contemporary natural language processing and are increasingly being applied across a diverse range of industries. However, these large-scale probabilistic statistical models cannot currently ensure the requisite quality in professional content generation. These models often produce hallucinated text, compromising their practical utility in professional contexts. To assess the authentic reliability of LLMs in text generation, numerous initiatives have developed benchmark evaluations for hallucination phenomena. Nevertheless, these benchmarks frequently utilize constrained generation techniques due to cost and temporal constraints. These techniques encompass the use of directed hallucination induction and strategies that deliberately alter authentic text to produce hallucinations. These approaches are not congruent with the unrestricted text generation demanded by real-world applications. Furthermore, a well-established Chinese-language dataset dedicated to the evaluation of hallucinations in text generation is presently lacking. Consequently, we have developed an Unconstrained Hallucination Generation Evaluation (UHGEval) benchmark, designed to compile outputs produced with minimal restrictions by LLMs. Concurrently, we have established a comprehensive benchmark evaluation framework to aid subsequent researchers in undertaking scalable and reproducible experiments. We have also executed extensive experiments, evaluating prominent Chinese language models and the GPT series models to derive professional performance insights regarding hallucination challenges.
Submitted 23 May, 2024; v1 submitted 26 November, 2023; originally announced November 2023.
Comments: Accepted by ACL 2024
arXiv:2311.13107 [pdf, other]
Categories: quant-ph (Quantum Physics); cs.ET (Emerging Technologies)
Title: Powerful Quantum Circuit Resizing with Resource Efficient Synthesis
Authors: Siyuan Niu, Akel Hashim, Costin Iancu, Wibe Albert de Jong, Ed Younis
Abstract: In the noisy intermediate-scale quantum era, mid-circuit measurement and reset operations facilitate novel circuit optimization strategies by reducing a circuit's qubit count in a method called resizing. This paper introduces two such algorithms. The first one leverages gate-dependency rules to reduce qubit count by 61.6%, or 45.3% when optimizing depth as well. Based on numerical instantiation and synthesis, the second algorithm finds resizing opportunities in previously unresizable circuits via dependency rules and other state-of-the-art tools. This resizing algorithm reduces qubit count by 20.7% on average for these previously impossible-to-resize circuits.
Submitted 21 November, 2023; originally announced November 2023.
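The resizing primitive the abstract relies on, mid-circuit measurement and reset, can be shown in a toy Qiskit circuit. This illustrates qubit reuse only, not the paper's dependency-rule or synthesis algorithms.

```python
from qiskit import QuantumCircuit

# Toy "three logical qubits on two physical qubits" pattern: once qubit 1's
# role in the computation ends, it is measured and reset mid-circuit, then
# reused where a third physical qubit would otherwise be allocated.
qc = QuantumCircuit(2, 3)
qc.h(0)
qc.cx(0, 1)
qc.measure(1, 0)   # qubit 1's logical role ends here
qc.reset(1)        # mid-circuit reset frees the physical qubit
qc.cx(0, 1)        # reuse qubit 1 as the "third" logical qubit
qc.measure(0, 1)
qc.measure(1, 2)
print(qc.draw())
```

Deciding *when* a qubit's role has truly ended is exactly what the paper's gate-dependency rules and synthesis-based analysis automate.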
arXiv:2311.10952 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: NAS-ASDet: An Adaptive Design Method for Surface Defect Detection Network using Neural Architecture Search
Authors: Zhenrong Wang, Bin Li, Weifeng Li, Shuanlong Niu, Wang Miao, Tongzhi Niu
Abstract: Deep convolutional neural networks (CNNs) have been widely used in surface defect detection. However, no single CNN architecture is suitable for all detection tasks, and designing an effective task-specific architecture requires considerable effort. Neural architecture search (NAS) technology makes it possible to automatically generate adaptive, data-driven networks. Here, we propose a new method called NAS-ASDet to adaptively design networks for surface defect detection. First, a refined and industry-appropriate search space that can adaptively adjust the feature distribution is designed, consisting of repeatedly stacked basic novel cells with searchable attention operations. Then, a progressive search strategy with a deep supervision mechanism is used to explore the search space faster and better. This method can design high-performance and lightweight defect detection networks under the data scarcity of industrial scenarios. The experimental results on four datasets demonstrate that the proposed method achieves superior performance and a relatively lighter model size compared to other competitive methods, including both manual and NAS-based approaches.
Submitted 17 November, 2023; originally announced November 2023.
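A "searchable attention operation" can be illustrated with a DARTS-style mixed operation, where learnable architecture weights select among candidate attention ops. This is one common NAS mechanism, assumed here purely for illustration; the paper's progressive search strategy is not reproduced.

```python
import torch
import torch.nn as nn

class SE(nn.Module):
    """Squeeze-and-excitation channel attention (one candidate op)."""
    def __init__(self, c):
        super().__init__()
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Conv2d(c, c, 1), nn.Sigmoid())

    def forward(self, x):
        return x * self.gate(x)

class SearchableAttention(nn.Module):
    """A cell edge that mixes candidate attention ops with softmax weights.
    After search, the op with the largest weight is kept (DARTS-style)."""
    def __init__(self, channels):
        super().__init__()
        self.ops = nn.ModuleList([nn.Identity(), SE(channels)])
        self.alpha = nn.Parameter(torch.zeros(len(self.ops)))  # arch weights

    def forward(self, x):
        w = torch.softmax(self.alpha, dim=0)
        return sum(wi * op(x) for wi, op in zip(w, self.ops))
```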
arXiv:2310.19193 [pdf, other]
Categories: cs.HC (Human-Computer Interaction); cs.SI (Social and Information Networks)
Title: A Survey on Watching Social Issue Videos among YouTube and TikTok Users
Authors: Shuo Niu, Dilasha Shrestha, Abhisan Ghimire, Zhicong Lu
Abstract: The openness and influence of video-sharing platforms (VSPs) such as YouTube and TikTok have attracted creators to share videos on various social issues. Although social issue videos (SIVs) affect public opinion and breed misinformation, how VSP users obtain information from and interact with SIVs is under-explored. This work surveyed 659 YouTube and 127 TikTok users to understand the motives for consuming SIVs on VSPs. We found that VSP users are primarily motivated by the information and entertainment gratifications to use the platform. VSP users use SIVs for information-seeking purposes and find YouTube and TikTok convenient to interact with SIVs. VSP users moderately watch SIVs for entertainment and inactively engage in social interactions. SIV consumption is associated with the information and socialization gratifications of the platform. VSP users appreciate the diversity of information and opinions but would also do their own research, and are concerned about misinformation and echo-chamber problems.
Submitted 29 October, 2023; originally announced October 2023.
ACM Class: J.4
arXiv:2310.19011 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Efficient Test-Time Adaptation for Super-Resolution with Second-Order Degradation and Reconstruction
Authors: Zeshuai Deng, Zhuokun Chen, Shuaicheng Niu, Thomas H. Li, Bohan Zhuang, Mingkui Tan
Abstract: Image super-resolution (SR) aims to learn a mapping from low-resolution (LR) to high-resolution (HR) using paired HR-LR training images. Conventional SR methods typically gather the paired training data by synthesizing LR images from HR images using a predetermined degradation model, e.g., Bicubic down-sampling. However, the realistic degradation type of test images may mismatch the training-time degradation type due to the dynamic changes of real-world scenarios, resulting in inferior-quality SR images. To address this, existing methods attempt to estimate the degradation model and train an image-specific model, which, however, is quite time-consuming and impracticable for rapidly changing domain shifts. Moreover, these methods largely concentrate on the estimation of one degradation type (e.g., blur degradation), overlooking other degradation types like noise and JPEG in real-world test-time scenarios, thus limiting their practicality. To tackle these problems, we present an efficient test-time adaptation framework for SR, named SRTTA, which is able to quickly adapt SR models to test domains with different/unknown degradation types. Specifically, we design a second-order degradation scheme to construct paired data based on the degradation type of the test image, which is predicted by a pre-trained degradation classifier. Then, we adapt the SR model by implementing feature-level reconstruction learning from the initial test image to its second-order degraded counterparts, which helps the SR model generate plausible HR images. Extensive experiments are conducted on newly synthesized corrupted DIV2K datasets with 8 different degradations and several real-world datasets, demonstrating that our SRTTA framework achieves an impressive improvement over existing methods with satisfying speed. The source code is available at https://github.com/DengZeshuai/SRTTA.
Submitted 29 October, 2023; originally announced October 2023.
Comments: Accepted by 37th Conference on Neural Information Processing Systems (NeurIPS 2023)
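The second-order degradation scheme admits a compact sketch: the test image, already degraded once by the real world, is degraded *again* with the classifier-predicted degradation, yielding a self-supervised training pair without any HR ground truth. The function names and the toy degradation set below are assumptions, not the released code.

```python
import numpy as np

def second_order_pair(test_lr, degrade_fns, predicted_type):
    """Build a training pair from a single test image.

    test_lr: the observed LR test image (first-order degraded by the world).
    predicted_type: output of a pre-trained degradation classifier.
    Applying the same kind of degradation once more gives a second-order
    degraded input whose reconstruction target is the original test image."""
    degrade = degrade_fns[predicted_type]
    second_order = degrade(test_lr)       # degrade the already-degraded image
    return second_order, test_lr          # (model input, reconstruction target)

# Toy degradation set (illustrative only):
degrade_fns = {
    "noise": lambda img: np.clip(img + np.random.normal(0, 5, img.shape), 0, 255),
    "blur":  lambda img: (img + np.roll(img, 1, axis=0) + np.roll(img, 1, axis=1)) / 3,
}
```

Adapting the model so that features of the second-order image match those of the test image teaches it to undo that degradation type, which transfers to the first-order case.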
arXiv:2309.09412 [pdf]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Cross-attention-based saliency inference for predicting cancer metastasis on whole slide images
Authors: Ziyu Su, Mostafa Rezapour, Usama Sajjad, Shuo Niu, Metin Nafi Gurcan, Muhammad Khalid Khan Niazi
Abstract: Although multiple instance learning (MIL) methods are widely used for automatic tumor detection on whole slide images (WSI), they suffer from the extreme class imbalance within the small tumor WSIs. This occurs when the tumor comprises only a few isolated cells. For early detection, it is of utmost importance that MIL algorithms can identify small tumors, even when they are less than 1% of the size of the WSI. Existing studies have attempted to address this issue using attention-based architectures and instance selection-based methodologies, but have not yielded significant improvements. This paper proposes cross-attention-based salient instance inference MIL (CASiiMIL), which involves a novel saliency-informed attention mechanism, to identify breast cancer lymph node micro-metastasis on WSIs without the need for any annotations. Apart from this new attention mechanism, we introduce a negative representation learning algorithm to facilitate the learning of saliency-informed attention weights for improved sensitivity on tumor WSIs. The proposed model outperforms the state-of-the-art MIL methods on two popular tumor metastasis detection datasets, and demonstrates great cross-center generalizability. In addition, it exhibits excellent accuracy in classifying WSIs with small tumor lesions. Moreover, we show that the proposed model has excellent interpretability attributed to the saliency-informed attention weights. We strongly believe that the proposed method will pave the way for training algorithms for early tumor detection on large datasets where acquiring fine-grained annotations is practically impossible.
Submitted 17 September, 2023; originally announced September 2023.
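The cross-attention pooling at the core of attention-based MIL can be sketched briefly. This shows only the generic mechanism of attending from learned query tokens to patch embeddings; CASiiMIL's saliency-informed weighting and negative representation learning are not reproduced here.

```python
import torch

def attention_pool(instance_feats, queries, scale=None):
    """Cross-attention pooling over WSI patch features.

    instance_feats: (N, D) embeddings of the N patches in one slide (the bag).
    queries: (M, D) learned query tokens.
    Returns (M, D) bag-level representations; the (M, N) attention map itself
    is what makes such models interpretable, since high-weight patches can be
    highlighted on the slide."""
    d = instance_feats.shape[-1]
    scale = scale or d ** -0.5
    attn = torch.softmax(queries @ instance_feats.T * scale, dim=-1)  # (M, N)
    return attn @ instance_feats                                      # (M, D)
```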
arXiv:2309.09180 [pdf, other]
Categories: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.SD (Sound)
Title: Neural Speaker Diarization Using Memory-Aware Multi-Speaker Embedding with Sequence-to-Sequence Architecture
Authors: Gaobin Yang, Maokui He, Shutong Niu, Ruoyu Wang, Yanyan Yue, Shuangqing Qian, Shilong Wu, Jun Du, Chin-Hui Lee
Abstract: We propose a novel neural speaker diarization system using memory-aware multi-speaker embedding with sequence-to-sequence architecture (NSD-MS2S), which integrates the strengths of memory-aware multi-speaker embedding (MA-MSE) and sequence-to-sequence (Seq2Seq) architecture, leading to improvements in both efficiency and performance. Next, we further decrease the memory occupation of decoding by incorporating input feature fusion, and then employ a multi-head attention mechanism to capture features at different levels. NSD-MS2S achieved a macro diarization error rate (DER) of 15.9% on the CHiME-7 EVAL set, which signifies a relative improvement of 49% over the official baseline system, and is the key technique for us to achieve the best performance on the main track of the CHiME-7 DASR Challenge. Additionally, we introduce a deep interactive module (DIM) in the MA-MSE module to better retrieve a cleaner and more discriminative multi-speaker embedding, enabling the current model to outperform the system we used in the CHiME-7 DASR Challenge. Our code will be available at https://github.com/liyunlongaaa/NSD-MS2S.
Submitted 26 December, 2023; v1 submitted 17 September, 2023; originally announced September 2023.
Comments: Accepted by ICASSP 2024
arXiv:2309.07383 [pdf, ps, other]
Categories: eess.SY (Systems and Control); cs.LG (Machine Learning)
Title: Rates of Convergence in Certain Native Spaces of Approximations used in Reinforcement Learning
Authors: Ali Bouland, Shengyuan Niu, Sai Tej Paruchuri, Andrew Kurdila, John Burns, Eugenio Schuster
Abstract: This paper studies convergence rates for some value function approximations that arise in a collection of reproducing kernel Hilbert spaces (RKHS) $H(\Omega)$. By casting an optimal control problem in a specific class of native spaces, strong rates of convergence are derived for the operator equation that enables offline approximations that appear in policy iteration. Explicit upper bounds on error in value function and controller approximations are derived in terms of the power function $\mathcal{P}_{H,N}$ for the space of finite dimensional approximants $H_N$ in the native space $H(\Omega)$. These bounds are geometric in nature and refine some well-known, now classical results concerning convergence of approximations of value functions.
Submitted 17 November, 2023; v1 submitted 13 September, 2023; originally announced September 2023.
Comments: 8 pages, 5 figures
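For readers unfamiliar with power functions: in kernel interpolation theory, the pointwise error of the best approximant from $H_N$ is controlled by the power function, which has a closed form. The following is a generic statement of that shape; the paper's exact norms and constants may differ.

```latex
% Generic kernel-interpolation error bound of the type the abstract invokes
% (illustrative shape only): for f in the native space H(\Omega) with
% interpolant I_N f built on centers x_1, ..., x_N,
\[
  |f(x) - (I_N f)(x)| \;\le\; \mathcal{P}_{H,N}(x)\, \|f\|_{H(\Omega)},
\]
% where the power function has the closed form
\[
  \mathcal{P}_{H,N}(x)^2 \;=\; K(x,x) - \mathbf{k}_N(x)^{\top} \mathbf{K}_N^{-1} \mathbf{k}_N(x),
\]
% with kernel K, \mathbf{k}_N(x) = (K(x, x_i))_{i=1}^{N}, and Gram matrix
% \mathbf{K}_N = (K(x_i, x_j))_{i,j=1}^{N}.
```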
arXiv:2308.14638 [pdf, other]
Categories: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: The USTC-NERCSLIP Systems for the CHiME-7 DASR Challenge
Authors: Ruoyu Wang, Maokui He, Jun Du, Hengshun Zhou, Shutong Niu, Hang Chen, Yanyan Yue, Gaobin Yang, Shilong Wu, Lei Sun, Yanhui Tu, Haitao Tang, Shuangqing Qian, Tian Gao, Mengzhi Wang, Genshun Wan, Jia Pan, Jianqing Gao, Chin-Hui Lee
Abstract: This technical report details our submission system to the CHiME-7 DASR Challenge, which focuses on speaker diarization and speech recognition under complex multi-speaker scenarios. Additionally, it also evaluates the efficiency of systems in handling diverse array devices. To address these issues, we implemented an end-to-end speaker diarization system and introduced a rectification strategy based on multi-channel spatial information. This approach significantly diminished the word error rates (WER). In terms of recognition, we utilized publicly available pre-trained models as the foundational models to train our end-to-end speech recognition models. Our system attained a macro-averaged diarization-attributed WER (DA-WER) of 21.01% on the CHiME-7 evaluation set, which signifies a relative improvement of 62.04% over the official baseline system.
Submitted 10 October, 2023; v1 submitted 28 August, 2023; originally announced August 2023.
Comments: Accepted by 2023 CHiME Workshop, Oral
arXiv:2307.13996 [pdf, ps, other]
Categories: cs.DS (Data Structures and Algorithms); cs.LG (Machine Learning)
Title: Fast algorithms for k-submodular maximization subject to a matroid constraint
Authors: Shuxian Niu, Qian Liu, Yang Zhou, Min Li
Abstract: In this paper, we apply a Threshold-Decreasing Algorithm to maximize $k$-submodular functions under a matroid constraint, which reduces the query complexity of the algorithm compared to the greedy algorithm with little loss in approximation ratio. We give a $(\frac{1}{2} - \varepsilon)$-approximation algorithm for monotone $k$-submodular function maximization, and a $(\frac{1}{3} - \varepsilon)$-approximation algorithm for the non-monotone case, with complexity $O\left(\frac{n(k \cdot EO + IO)}{\varepsilon} \log \frac{r}{\varepsilon}\right)$, where $r$ denotes the rank of the matroid, and $IO$ and $EO$ denote the number of oracle calls needed to evaluate whether a subset is an independent set and to compute the function value of $f$, respectively. Since the total size constraint can be viewed as a special matroid, called the uniform matroid, we also present fast algorithms for maximizing $k$-submodular functions subject to a total size constraint as corollaries.
Submitted 26 July, 2023; originally announced July 2023.
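The threshold-decreasing technique can be sketched for the simplest matroid, the uniform (total size) one: accept any item-type pair whose marginal gain clears a threshold, then geometrically lower the threshold. This is an illustrative rendering of the general idea, not the paper's algorithm verbatim; `f` scores an assignment of items to one of the $k$ types, given as a dict.

```python
def threshold_decreasing_max(f, ground, k, budget, eps=0.1):
    """Threshold-decreasing greedy for k-submodular maximization under a
    size constraint. Compared to plain greedy (which rescans for the single
    best gain each round), accepting anything above a geometrically
    decreasing threshold saves queries at an O(eps) loss in the ratio."""
    sol = {}
    # Initial threshold: the best single-item gain over all items and types
    d = max(f({e: t}) for e in ground for t in range(k))
    tau = d
    while tau > eps * d / len(ground) and len(sol) < budget:
        for e in ground:
            if e in sol or len(sol) >= budget:
                continue
            # Best marginal gain over the k possible types for item e
            gain, t = max((f({**sol, e: t}) - f(sol), t) for t in range(k))
            if gain >= tau:
                sol[e] = t
        tau *= 1 - eps  # geometrically decrease the acceptance threshold
    return sol
```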
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13996v1-abstract-full').style.display = 'none'; document.getElementById('2307.13996v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18998">arXiv:2305.18998</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.18998">pdf</a>, <a href="https://arxiv.org/format/2305.18998">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Blind Beamforming for Intelligent Surface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lai%2C+W">Wenhai Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Fan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shaobo Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+K">Kaiming Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18998v2-abstract-short" style="display: inline;"> Configuring intelligent surface (IS) or passive antenna array without any channel knowledge, namely blind beamforming, is a frontier research topic in the wireless communication field. Existing methods in the previous literature for blind beamforming include the RFocus and the CSM, the effectiveness of which has been demonstrated on hardware prototypes. However, this paper points out a subtle issu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18998v2-abstract-full').style.display = 'inline'; document.getElementById('2305.18998v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18998v2-abstract-full" style="display: none;"> Configuring intelligent surface (IS) or passive antenna array without any channel knowledge, namely blind beamforming, is a frontier research topic in the wireless communication field. Existing methods in the previous literature for blind beamforming include the RFocus and the CSM, the effectiveness of which has been demonstrated on hardware prototypes. However, this paper points out a subtle issue with these blind beamforming algorithms: the RFocus and the CSM may fail to work in the non-line-of-sight (NLoS) channel case. To address this issue, we suggest a grouping strategy that enables adaptive blind beamforming. Specifically, the reflective elements (REs) of the IS are divided into three groups; each group is configured randomly to obtain a dataset of random samples. We then extract the statistical feature of the wireless environment from the random samples, thereby coordinating phase shifts of the IS without channel acquisition. 
The RE grouping plays a critical role in guaranteeing a performance gain in the NLoS case; in particular, if all the REs are placed in the same group, the proposed algorithm reduces to RFocus and CSM. Aside from simulations, we validate the advantage of the proposed blind beamforming algorithm in real-world networks at 3.5 GHz.

Submitted 24 September, 2024; v1 submitted 30 May, 2023; originally announced May 2023.

Comments: 14 pages, 14 figures
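The conditional-sample-mean idea that such channel-free methods build on can be illustrated in a few lines. This toy Python sketch shows the single-group (CSM-style) baseline that the paper's three-group strategy refines; the channel model, binary phase set, and sample count are all assumptions for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy setting: n reflective elements with binary phase shifts {0, pi} and a
# hidden channel; only the received power of random configurations is seen.
n, num_samples = 64, 2000
h = rng.standard_normal(n) + 1j * rng.standard_normal(n)

def received_power(phases):
    return abs(np.sum(h * np.exp(1j * phases))) ** 2

# Random-sampling stage: configure every element uniformly at random.
configs = rng.choice([0.0, np.pi], size=(num_samples, n))
powers = np.array([received_power(c) for c in configs])

# Statistical-feature stage: per element, keep the phase whose conditional
# average received power is larger.
mean_if_zero = np.array([powers[configs[:, i] == 0.0].mean() for i in range(n)])
mean_if_pi = np.array([powers[configs[:, i] == np.pi].mean() for i in range(n)])
best = np.where(mean_if_zero >= mean_if_pi, 0.0, np.pi)

print(received_power(best) / powers.mean())  # gain over a random configuration
```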
arXiv:2305.12649 [pdf, other]  cs.CV

Imbalance-Agnostic Source-Free Domain Adaptation via Avatar Prototype Alignment

Authors: Hongbin Lin, Mingkui Tan, Yifan Zhang, Zhen Qiu, Shuaicheng Niu, Dong Liu, Qing Du, Yanxia Liu

Abstract: Source-free Unsupervised Domain Adaptation (SF-UDA) aims to adapt a well-trained source model to an unlabeled target domain without access to the source data. One key challenge is the lack of source data during domain adaptation. To handle this, we propose to mine the hidden knowledge of the source model and exploit it to generate source avatar prototypes. To this end, we propose a Contrastive Prototype Generation and Adaptation (CPGA) method, which consists of two stages: prototype generation and prototype adaptation. Extensive experiments on three UDA benchmark datasets demonstrate the superiority of CPGA. However, existing SF-UDA studies implicitly assume balanced class distributions for both the source and target domains, which hinders their real-world application. To address this issue, we study a more practical SF-UDA task, termed imbalance-agnostic SF-UDA, where the class distributions of both the unseen source domain and the unlabeled target domain are unknown and could be arbitrarily skewed. This task is much more challenging than vanilla SF-UDA due to the co-occurrence of covariate shifts and unidentified class distribution shifts between the source and target domains. To address it, we extend CPGA and propose a new Target-aware Contrastive Prototype Generation and Adaptation (T-CPGA) method. Specifically, for better prototype adaptation in the imbalance-agnostic scenario, T-CPGA applies a new pseudo-label generation strategy that identifies the unknown target class distribution and generates accurate pseudo labels by utilizing the collective intelligence of the source model and an additional contrastive language-image pre-trained model. Meanwhile, we devise a target label-distribution-aware classifier to adapt the model to the unknown target class distribution. We empirically show that T-CPGA significantly outperforms CPGA and other SF-UDA methods in imbalance-agnostic SF-UDA.

Submitted 21 May, 2023; originally announced May 2023.

Comments: arXiv admin note: text overlap with arXiv:2106.15326
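The "collective intelligence" step can be pictured as fusing two classifiers' beliefs before reading off pseudo labels. A hedged Python sketch (the convex combination and weight `alpha` are assumptions; T-CPGA's actual strategy is more involved):

```python
import numpy as np

def softmax(z, axis=1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

# Fuse the source model's logits with a zero-shot CLIP-style classifier's
# logits, then take pseudo labels for the unlabeled target samples.
def fused_pseudo_labels(source_logits, clip_logits, alpha=0.5):
    p = alpha * softmax(source_logits) + (1 - alpha) * softmax(clip_logits)
    return p.argmax(axis=1)
```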
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2106.15326</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.06151">arXiv:2303.06151</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.06151">pdf</a>, <a href="https://arxiv.org/format/2303.06151">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NoiseCAM: Explainable AI for the Boundary Between Noise and Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+W">Wenkai Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Renkhoff%2C+J">Justus Renkhoff</a>, <a href="/search/cs?searchtype=author&amp;query=Velasquez%2C+A">Alvaro Velasquez</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lusi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuteng Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yongxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+H">Houbing Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.06151v1-abstract-short" style="display: inline;"> Deep Learning (DL) and Deep Neural Networks (DNNs) are widely used in various domains. However, adversarial attacks can easily mislead a neural network and lead to wrong decisions. Defense mechanisms are highly preferred in safety-critical applications. In this paper, firstly, we use the gradient class activation map (GradCAM) to analyze the behavior deviation of the VGG-16 network when its inputs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06151v1-abstract-full').style.display = 'inline'; document.getElementById('2303.06151v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.06151v1-abstract-full" style="display: none;"> Deep Learning (DL) and Deep Neural Networks (DNNs) are widely used in various domains. However, adversarial attacks can easily mislead a neural network and lead to wrong decisions. Defense mechanisms are highly preferred in safety-critical applications. In this paper, firstly, we use the gradient class activation map (GradCAM) to analyze the behavior deviation of the VGG-16 network when its inputs are mixed with adversarial perturbation or Gaussian noise. In particular, our method can locate vulnerable layers that are sensitive to adversarial perturbation and Gaussian noise. We also show that the behavior deviation of vulnerable layers can be used to detect adversarial examples. Secondly, we propose a novel NoiseCAM algorithm that integrates information from globally and pixel-level weighted class activation maps. 
NoiseCAM responds to adversarial perturbations but does not respond to Gaussian random noise mixed into the inputs. Third, we compare detecting adversarial examples using behavior deviation and using NoiseCAM, and show that NoiseCAM outperforms behavior-deviation modeling in overall performance. Our work could provide a useful tool to defend against certain adversarial attacks on deep neural networks.

Submitted 9 March, 2023; originally announced March 2023.

Comments: Submitted to IEEE Fuzzy 2023. arXiv admin note: text overlap with arXiv:2303.06032
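The layer-wise behavior-deviation analysis is straightforward to prototype. A minimal GradCAM-style sketch in Python/PyTorch (assuming a recent torchvision; the untrained weights, the single fixed layer, and the mean-absolute deviation are all illustrative choices, not the paper's exact metric):

```python
import torch
from torchvision.models import vgg16

model = vgg16(weights=None).eval()
layer = model.features[28]  # last conv layer of VGG-16

store = {}
layer.register_forward_hook(lambda m, i, o: store.update(act=o))
layer.register_full_backward_hook(lambda m, gi, go: store.update(grad=go[0]))

def gradcam(x):
    scores = model(x)
    model.zero_grad()
    scores[0, scores.argmax(1).item()].backward()
    w = store["grad"].mean(dim=(2, 3), keepdim=True)  # pooled gradients
    cam = torch.relu((w * store["act"]).sum(dim=1))   # weighted activations
    return cam / (cam.max() + 1e-8)

# Behavior deviation: compare CAMs for a clean vs. a perturbed input.
x = torch.randn(1, 3, 224, 224)
deviation = (gradcam(x) - gradcam(x + 0.03 * torch.randn_like(x))).abs().mean()
print(float(deviation))
```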
arXiv:2303.06032 [pdf, other]  cs.LG cs.CR cs.CV eess.IV

Exploring Adversarial Attacks on Neural Networks: An Explainable Approach

Authors: Justus Renkhoff, Wenkai Tan, Alvaro Velasquez, William Yichen Wang, Yongxin Liu, Jian Wang, Shuteng Niu, Lejla Begic Fazlic, Guido Dartmann, Houbing Song

Abstract: Deep Learning (DL) is being applied in various domains, especially in safety-critical applications such as autonomous driving. Consequently, it is of great significance to ensure the robustness of these methods and thus counteract uncertain behaviors caused by adversarial attacks. In this paper, we use gradient heatmaps to analyze the response characteristics of the VGG-16 model when the input images are mixed with adversarial noise and statistically similar Gaussian random noise. In particular, we compare the network response layer by layer to determine where errors occurred. Several interesting findings are derived. First, compared to Gaussian random noise, intentionally generated adversarial noise causes severe behavior deviation by distracting the area of concentration in the networks. Second, in many cases, adversarial examples only need to compromise a few intermediate blocks to mislead the final decision. Third, our experiments revealed that specific blocks are more vulnerable and easier to exploit by adversarial examples. Finally, we demonstrate that the layers $Block4\_conv1$ and $Block5\_conv1$ of the VGG-16 model are more susceptible to adversarial attacks. Our work could provide valuable insights into developing more reliable Deep Neural Network (DNN) models.

Submitted 8 March, 2023; originally announced March 2023.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.04253">arXiv:2303.04253</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.04253">pdf</a>, <a href="https://arxiv.org/format/2303.04253">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TMHOI: Translational Model for Human-Object Interaction Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lijing Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+Q">Qizhen Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Velasquez%2C+A">Alvaro Velasquez</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+H">Houbing Song</a>, <a href="/search/cs?searchtype=author&amp;query=Kamal%2C+A">Acharya Kamal</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Q">Qing Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuteng Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.04253v3-abstract-short" style="display: inline;"> Detecting human-object interactions (HOIs) is an intricate challenge in the field of computer vision. Existing methods for HOI detection heavily rely on appearance-based features, but these may not fully capture all the essential characteristics necessary for accurate detection. To overcome these challenges, we propose an innovative graph-based approach called TMGHOI (Translational Model for Human&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.04253v3-abstract-full').style.display = 'inline'; document.getElementById('2303.04253v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.04253v3-abstract-full" style="display: none;"> Detecting human-object interactions (HOIs) is an intricate challenge in the field of computer vision. Existing methods for HOI detection heavily rely on appearance-based features, but these may not fully capture all the essential characteristics necessary for accurate detection. To overcome these challenges, we propose an innovative graph-based approach called TMGHOI (Translational Model for Human-Object Interaction Detection). Our method effectively captures the sentiment representation of HOIs by integrating both spatial and semantic knowledge. By representing HOIs as a graph, where the interaction components serve as nodes and their spatial relationships as edges. To extract crucial spatial and semantic information, TMGHOI employs separate spatial and semantic encoders. Subsequently, these encodings are combined to construct a knowledge graph that effectively captures the sentiment representation of HOIs. Additionally, the ability to incorporate prior knowledge enhances the understanding of interactions, further boosting detection accuracy. We conducted extensive evaluations on the widely-used HICO-DET datasets to demonstrate the effectiveness of TMGHOI. Our approach outperformed existing state-of-the-art graph-based methods by a significant margin, showcasing its potential as a superior solution for HOI detection. 
We are confident that TMGHOI can significantly improve the accuracy and efficiency of HOI detection. Its integration of spatial and semantic knowledge, along with its computational efficiency and practicality, makes it a valuable tool for researchers and practitioners in the computer vision community. As with any research, we acknowledge the importance of further exploration and evaluation on various datasets to establish the generalizability and robustness of the proposed method.

Submitted 1 July, 2023; v1 submitted 7 March, 2023; originally announced March 2023.

Comments: 10 pages, 3 figures, 2 tables
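For readers unfamiliar with translational models, the classic TransE-style score conveys the flavor; whether TMGHOI scores its spatial-semantic encodings exactly this way is not stated above, so treat this Python sketch as an illustrative assumption:

```python
import torch

# A (human, interaction, object) triple fits well when h + r ≈ t in the
# embedding space, i.e., its translational score is high (distance is low).
def translational_score(h, r, t):
    return -torch.norm(h + r - t, dim=-1)

h, r, t = (torch.randn(4, 64) for _ in range(3))  # a batch of 4 triples
print(translational_score(h, r, t))
```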
arXiv:2302.12400 [pdf, other]  cs.LG cs.CV

Towards Stable Test-Time Adaptation in Dynamic Wild World

Authors: Shuaicheng Niu, Jiaxiang Wu, Yifan Zhang, Zhiquan Wen, Yaofo Chen, Peilin Zhao, Mingkui Tan

Abstract: Test-time adaptation (TTA) has been shown to be effective at tackling distribution shifts between training and testing data by adapting a given model on test samples. However, the online model updating of TTA may be unstable, and this is often a key obstacle preventing existing TTA methods from being deployed in the real world. Specifically, TTA may fail to improve or may even harm model performance when test data have 1) mixed distribution shifts, 2) small batch sizes, or 3) online imbalanced label distribution shifts, which are quite common in practice. In this paper, we investigate the reasons for this instability and find that the batch norm layer is a crucial factor hindering TTA stability. Conversely, TTA can perform more stably with batch-agnostic norm layers, i.e., group or layer norm. However, we observe that TTA with group and layer norms does not always succeed and still suffers from many failure cases. By digging into these failure cases, we find that certain noisy test samples with large gradients may disturb the model adaptation and result in collapsed trivial solutions, i.e., assigning the same class label to all samples. To address this collapse issue, we propose a sharpness-aware and reliable entropy minimization method, called SAR, which further stabilizes TTA from two aspects: 1) removing partial noisy samples with large gradients, and 2) encouraging model weights to go to a flat minimum so that the model is robust to the remaining noisy samples. Promising results demonstrate that SAR performs more stably than prior methods and is computationally efficient under the above wild test scenarios.

Submitted 23 February, 2023; originally announced February 2023.

Comments: Accepted by International Conference on Learning Representations (ICLR) 2023 as Notable-Top-5%; 27 pages, 10 figures, 18 tables
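The "reliable" half of SAR, updating only on low-entropy test samples because high-entropy samples carry large, noisy gradients, fits in a few lines. A Python/PyTorch sketch (the margin follows the common 0.4·ln C rule of thumb in entropy-based TTA but is an assumption here; the sharpness-aware step is omitted):

```python
import torch

def reliable_entropy_loss(logits, e0=0.4):
    probs = logits.softmax(dim=1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=1)
    margin = e0 * torch.log(torch.tensor(float(logits.shape[1])))
    selected = entropy[entropy < margin]   # drop high-entropy (noisy) samples
    # Keep the loss connected to the graph even when nothing is selected.
    return selected.mean() if selected.numel() else logits.sum() * 0.0

logits = torch.randn(8, 10, requires_grad=True)
reliable_entropy_loss(logits).backward()   # update only on reliable samples
```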
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by International Conference on Learning Representations (ICLR) 2023 as Notable-Top-5%; 27 pages, 10 figures, 18 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.09971">arXiv:2302.09971</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.09971">pdf</a>, <a href="https://arxiv.org/format/2302.09971">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Social4Rec: Distilling User Preference from Social Graph for Video Recommendation in Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xuanji Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+H">Huaqiang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Q">Qian Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuzi Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuzhen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.09971v3-abstract-short" style="display: inline;"> Despite recommender systems play a key role in network content platforms, mining the user&#39;s interests is still a significant challenge. Existing works predict the user interest by utilizing user behaviors, i.e., clicks, views, etc., but current solutions are ineffective when users perform unsettled activities. The latter ones involve new users, which have few activities of any kind, and sparse use&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.09971v3-abstract-full').style.display = 'inline'; document.getElementById('2302.09971v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.09971v3-abstract-full" style="display: none;"> Despite recommender systems play a key role in network content platforms, mining the user&#39;s interests is still a significant challenge. Existing works predict the user interest by utilizing user behaviors, i.e., clicks, views, etc., but current solutions are ineffective when users perform unsettled activities. The latter ones involve new users, which have few activities of any kind, and sparse users who have low-frequency behaviors. We uniformly describe both these user-types as &#34;cold users&#34;, which are very common but often neglected in network content platforms. To address this issue, we enhance the representation of the user interest by combining his social interest, e.g., friendship, following bloggers, interest groups, etc., with the activity behaviors. Thus, in this work, we present a novel algorithm entitled SocialNet, which adopts a two-stage method to progressively extract the coarse-grained and fine-grained social interest. Our technique then concatenates SocialNet&#39;s output with the original user representation to get the final user representation that combines behavior interests and social interests. 
Offline experiments on Tencent Video's recommender system demonstrate its superiority over the baseline behavior-based model. An online experiment also shows a significant improvement in clicks and view time in the real-world recommendation system. The source code is available at https://github.com/Social4Rec/SocialNet.

Submitted 11 August, 2023; v1 submitted 20 February, 2023; originally announced February 2023.

arXiv:2211.13926 [pdf]  eess.IV cs.CV

Generative Modeling in Sinogram Domain for Sparse-view CT Reconstruction

Authors: Bing Guan, Cailian Yang, Liu Zhang, Shanzhou Niu, Minghui Zhang, Yuhao Wang, Weiwen Wu, Qiegen Liu

Abstract: The radiation dose in computed tomography (CT) examinations is harmful to patients but can be significantly reduced by decreasing the number of projection views. However, reducing projection views usually leads to severe aliasing artifacts in reconstructed images.
Previous deep learning (DL) techniques for sparse-view data require sparse-view/full-view CT image pairs to train the network in a supervised manner, and when the number of projection views changes, the DL network must be retrained with updated sparse-view/full-view image pairs. To relieve this limitation, we present a fully unsupervised score-based generative model in the sinogram domain for sparse-view CT reconstruction. Specifically, we first train a score-based generative model on full-view sinogram data, using a multi-channel strategy to form a high-dimensional tensor as the network input and capture the prior distribution. At the inference stage, a stochastic differential equation (SDE) solver and a data-consistency step are performed iteratively to recover the full-view projections, and the filtered back-projection (FBP) algorithm is used for the final image reconstruction. Qualitative and quantitative studies on several CT datasets show that our method achieves comparable or better performance than supervised learning counterparts.

Submitted 25 November, 2022; originally announced November 2022.

Comments: 11 pages, 12 figures
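The alternating inference loop described above, a generative-prior update followed by a data-consistency projection, can be sketched generically in Python. Here `score_step` stands in for one SDE-solver update of the trained score model and is purely an assumption, as are the initialization and iteration count:

```python
import numpy as np

def reconstruct_sinogram(sparse_sino, measured_rows, score_step, n_iters=100):
    """sparse_sino: full-view-shaped array whose measured rows are filled;
    measured_rows: indices of the acquired (sparse-view) projections."""
    full = np.random.randn(*sparse_sino.shape)            # initial estimate
    for _ in range(n_iters):
        full = score_step(full)                           # prior (SDE) step
        full[measured_rows] = sparse_sino[measured_rows]  # data consistency
    return full  # afterwards: apply FBP to the completed sinogram
```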
arXiv:2207.10856 [pdf, other]  cs.CV

Prototype-Guided Continual Adaptation for Class-Incremental Unsupervised Domain Adaptation

Authors: Hongbin Lin, Yifan Zhang, Zhen Qiu, Shuaicheng Niu, Chuang Gan, Yanxia Liu, Mingkui Tan

Abstract: This paper studies a new, practical but challenging problem, called Class-Incremental Unsupervised Domain Adaptation (CI-UDA), where the labeled source domain contains all classes but the classes in the unlabeled target domain increase sequentially. This problem is challenging for two reasons. First, the source and target label sets are inconsistent at each time step, which makes accurate domain alignment difficult. Second, previous target classes are unavailable at the current step, resulting in the forgetting of previous knowledge. To address this problem, we propose a novel Prototype-guided Continual Adaptation (ProCA) method consisting of two solution strategies. 1) Label prototype identification: we identify target label prototypes by detecting shared classes with cumulative prediction probabilities of target samples. 2) Prototype-based alignment and replay: based on the identified label prototypes, we align both domains and enforce the model to retain previous knowledge. With these two strategies, ProCA is able to adapt the source model to a class-incremental unlabeled target domain effectively. Extensive experiments demonstrate the effectiveness and superiority of ProCA in resolving CI-UDA. The source code is available at https://github.com/Hongbin98/ProCA.git

Submitted 29 July, 2022; v1 submitted 21 July, 2022; originally announced July 2022.

Comments: Accepted by ECCV 2022
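Strategy 1), detecting shared classes from cumulative prediction probabilities, has a natural sketch: accumulate softmax mass over the target samples and keep the classes that receive enough of it. The normalization and threshold below are assumptions, not ProCA's exact rule:

```python
import numpy as np

def identify_shared_classes(target_probs, threshold=0.5):
    """target_probs: (num_samples, num_source_classes) softmax outputs."""
    cumulative = target_probs.sum(axis=0)        # accumulated class mass
    cumulative /= cumulative.max()               # scale to [0, 1]
    return np.where(cumulative >= threshold)[0]  # indices of shared classes
```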
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.11673">arXiv:2204.11673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.11673">pdf</a>, <a href="https://arxiv.org/format/2204.11673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Explicit Knowledge in Pre-trained Language Models for Passage Re-ranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Q">Qian Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yiding Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+S">Suqi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuaiqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zhicong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shuzi Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Dawei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.11673v1-abstract-short" style="display: inline;"> Passage re-ranking is to obtain a permutation over the candidate passage set from retrieval stage. Re-rankers have been boomed by Pre-trained Language Models (PLMs) due to their overwhelming advantages in natural language understanding. However, existing PLM based re-rankers may easily suffer from vocabulary mismatch and lack of domain specific knowledge. To alleviate these problems, explicit know&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11673v1-abstract-full').style.display = 'inline'; document.getElementById('2204.11673v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.11673v1-abstract-full" style="display: none;"> Passage re-ranking is to obtain a permutation over the candidate passage set from retrieval stage. Re-rankers have been boomed by Pre-trained Language Models (PLMs) due to their overwhelming advantages in natural language understanding. However, existing PLM based re-rankers may easily suffer from vocabulary mismatch and lack of domain specific knowledge. To alleviate these problems, explicit knowledge contained in knowledge graph is carefully introduced in our work. Specifically, we employ the existing knowledge graph which is incomplete and noisy, and first apply it in passage re-ranking task. To leverage a reliable knowledge, we propose a novel knowledge graph distillation method and obtain a knowledge meta graph as the bridge between query and passage. To align both kinds of embedding in the latent space, we employ PLM as text encoder and graph neural network over knowledge meta graph as knowledge encoder. Besides, a novel knowledge injector is designed for the dynamic interaction between text and knowledge encoder. 
Experimental results demonstrate the effectiveness of our method, especially for queries requiring in-depth domain knowledge.

Submitted 25 April, 2022; originally announced April 2022.
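The injector's role, letting text representations interact with knowledge-graph representations, can be sketched with cross-attention. The module choice, dimensions, and residual fusion in this Python/PyTorch sketch are assumptions; the paper's knowledge injector may differ:

```python
import torch
import torch.nn as nn

class KnowledgeInjector(nn.Module):
    """Text tokens attend to knowledge-meta-graph node embeddings."""
    def __init__(self, dim=256, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, text_emb, node_emb):
        # text_emb: (B, L, dim) from the PLM; node_emb: (B, N, dim) from the GNN
        injected, _ = self.attn(query=text_emb, key=node_emb, value=node_emb)
        return self.norm(text_emb + injected)  # residual fusion

inj = KnowledgeInjector()
out = inj(torch.randn(2, 16, 256), torch.randn(2, 8, 256))
print(out.shape)  # torch.Size([2, 16, 256])
```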
arXiv:2204.11014 [pdf, other]  cs.LG

Discriminative Feature Learning Framework with Gradient Preference for Anomaly Detection

Authors: Muhao Xu, Xueying Zhou, Xizhan Gao, WeiKai He, Sijie Niu

Abstract: Unsupervised representation learning has been extensively employed in anomaly detection and achieves impressive performance. Extracting feature vectors that can remarkably improve anomaly detection performance is essential in unsupervised representation learning. To this end, we propose a novel discriminative feature learning framework with gradient preference for anomaly detection. Specifically, we first design a gradient-preference-based selector to store powerful feature points and construct a feature repository, which alleviates the interference of redundant feature vectors and improves inference efficiency. Second, to overcome the looseness of the feature vectors, we present discriminative feature learning with a center constraint that maps the feature repository to a compact subspace, so that anomalous samples are more distinguishable from normal ones. Moreover, our method can be easily extended to anomaly localization. Extensive experiments on popular industrial and medical anomaly detection datasets demonstrate that our proposed framework achieves competitive results in both anomaly detection and localization. More importantly, our method outperforms the state of the art in few-shot anomaly detection.

Submitted 23 April, 2022; originally announced April 2022.
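The center constraint amounts to a compactness objective over the stored feature repository, with anomaly scores read off as distances from the center. A hedged Python sketch (the exact loss and scoring in the paper may differ):

```python
import torch

# Pull repository features toward their mean so normal data occupies a
# compact subspace; anomalies then score as distance from the center.
def center_constraint_loss(features):
    center = features.mean(dim=0, keepdim=True)
    return ((features - center) ** 2).sum(dim=1).mean()

def anomaly_score(x, features):
    return torch.norm(x - features.mean(dim=0), dim=-1)

repo = torch.randn(256, 128)  # stored feature repository
print(center_constraint_loss(repo))
print(anomaly_score(torch.randn(5, 128), repo))
```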