Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 93 results for author: <span class="mathjax">Shu, H</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shu%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shu, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shu%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shu, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shu%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shu%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shu%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15388">arXiv:2501.15388</a> <span> [<a href="https://arxiv.org/pdf/2501.15388">pdf</a>, <a href="https://arxiv.org/format/2501.15388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Guaranteed Multidimensional Time Series Prediction via Deterministic Tensor Completion Theory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jicheng Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yu Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hailin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15388v1-abstract-short" style="display: inline;"> In recent years, the prediction of multidimensional time series data has become increasingly important due to its wide-ranging applications. Tensor-based prediction methods have gained attention for their ability to preserve the inherent structure of such data. However, existing approaches, such as tensor autoregression and tensor decomposition, often have consistently failed to provide clear asse… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15388v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15388v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15388v1-abstract-full" style="display: none;"> In recent years, the prediction of multidimensional time series data has become increasingly important due to its wide-ranging applications. Tensor-based prediction methods have gained attention for their ability to preserve the inherent structure of such data. However, existing approaches, such as tensor autoregression and tensor decomposition, often have consistently failed to provide clear assertions regarding the number of samples that can be exactly predicted. While matrix-based methods using nuclear norms address this limitation, their reliance on matrices limits accuracy and increases computational costs when handling multidimensional data. To overcome these challenges, we reformulate multidimensional time series prediction as a deterministic tensor completion problem and propose a novel theoretical framework. Specifically, we develop a deterministic tensor completion theory and introduce the Temporal Convolutional Tensor Nuclear Norm (TCTNN) model. By convolving the multidimensional time series along the temporal dimension and applying the tensor nuclear norm, our approach identifies the maximum forecast horizon for exact predictions. Additionally, TCTNN achieves superior performance in prediction accuracy and computational efficiency compared to existing methods across diverse real-world datasets, including climate temperature, network flow, and traffic ride data. Our implementation is publicly available at https://github.com/HaoShu2000/TCTNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15388v1-abstract-full').style.display = 'none'; document.getElementById('2501.15388v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13365">arXiv:2501.13365</a> <span> [<a href="https://arxiv.org/pdf/2501.13365">pdf</a>, <a href="https://arxiv.org/format/2501.13365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Extractor-Selector Framework and Symmetrization Weighted Binary Cross-Entropy for Edge Detections </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13365v1-abstract-short" style="display: inline;"> Recent advancements have demonstrated the effectiveness of the extractor-selector (E-S) framework in edge detection (ED) tasks, which achieves state-of-the-art (SOTA) performance in both quantitative metrics and perceptual quality. However, this method still falls short of fully exploiting the potential of feature extractors, as selectors only operate on highly compressed feature maps that lack di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13365v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13365v1-abstract-full" style="display: none;"> Recent advancements have demonstrated the effectiveness of the extractor-selector (E-S) framework in edge detection (ED) tasks, which achieves state-of-the-art (SOTA) performance in both quantitative metrics and perceptual quality. However, this method still falls short of fully exploiting the potential of feature extractors, as selectors only operate on highly compressed feature maps that lack diversity and suffer from substantial information loss. Additionally, while union training can improve perceptual quality, the highest evaluation scores are typically obtained without it, creating a trade-off between quantitative accuracy and perceptual fidelity. To address these limitations, we propose an enhanced E-S architecture, which utilizes richer, less-loss feature representations and incorporates auxiliary features during the selection process, thereby improving the effectiveness of the feature selection mechanism. Additionally, we introduce a novel loss function, the Symmetrization Weight Binary Cross-Entropy (SWBCE), which simultaneously emphasizes both the recall of edge pixels and the suppression of erroneous edge predictions, thereby enhancing the predictions both in the perceptual quality and the prediction accuracy. The effectiveness and superiority of our approaches over baseline models, the standard E-S framework, and the standard Weight Binary Cross-Entropy (WBCE) loss function are demonstrated by extensive experiments. For example, our enhanced E-S architecture trained with SWBCE loss function achieves average improvements of 8.25$\%$, 8.01$\%$, and 33.25$\%$ in ODS, OIS, and AP, measured on BIPED2 compared with the baseline models, significantly outperforming the standard E-S method. The results set new benchmarks for ED tasks, and highlight the potential of the methods in beyond. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13365v1-abstract-full').style.display = 'none'; document.getElementById('2501.13365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02534">arXiv:2501.02534</a> <span> [<a href="https://arxiv.org/pdf/2501.02534">pdf</a>, <a href="https://arxiv.org/format/2501.02534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pixel-Wise Feature Selection for Perceptual Edge Detection without post-processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02534v1-abstract-short" style="display: inline;"> Although deep convolutional neutral networks (CNNs) have significantly enhanced performance in image edge detection (ED), current models remain highly dependent on post-processing techniques such as non-maximum suppression (NMS), and often fail to deliver satisfactory perceptual results, while the performance will deteriorate significantly if the allowed error toleration distance decreases. These… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02534v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02534v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02534v1-abstract-full" style="display: none;"> Although deep convolutional neutral networks (CNNs) have significantly enhanced performance in image edge detection (ED), current models remain highly dependent on post-processing techniques such as non-maximum suppression (NMS), and often fail to deliver satisfactory perceptual results, while the performance will deteriorate significantly if the allowed error toleration distance decreases. These limitations arise from the uniform fusion of features across all pixels, regardless of their specific characteristics, such as the distinction between textural and edge areas. If the features extracted by the ED models are selected more meticulously and encompass greater diversity, the resulting predictions are expected to be more accurate and perceptually meaningful. Motivated by this observation, this paper proposes a novel feature selection paradigm for deep networks that facilitates the differential selection of features and can be seamlessly integrated into existing ED models. By incorporating this additional structure, the performance of conventional ED models is substantially enhanced without post-processing, while simultaneously enhancing the perceptual quality of the predictions. Extensive experimental evaluations validate the effectiveness of the proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02534v1-abstract-full').style.display = 'none'; document.getElementById('2501.02534v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18800">arXiv:2412.18800</a> <span> [<a href="https://arxiv.org/pdf/2412.18800">pdf</a>, <a href="https://arxiv.org/format/2412.18800">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving Generated and Retrieved Knowledge Combination Through Zero-shot Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xinkai Du</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Q">Quanjie Han</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+C">Chao Lv</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yalin Sun</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+H">Hongbo Shan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18800v1-abstract-short" style="display: inline;"> Open-domain Question Answering (QA) has garnered substantial interest by combining the advantages of faithfully retrieved passages and relevant passages generated through Large Language Models (LLMs). However, there is a lack of definitive labels available to pair these sources of knowledge. In order to address this issue, we propose an unsupervised and simple framework called Bi-Reranking for Mer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18800v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18800v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18800v1-abstract-full" style="display: none;"> Open-domain Question Answering (QA) has garnered substantial interest by combining the advantages of faithfully retrieved passages and relevant passages generated through Large Language Models (LLMs). However, there is a lack of definitive labels available to pair these sources of knowledge. In order to address this issue, we propose an unsupervised and simple framework called Bi-Reranking for Merging Generated and Retrieved Knowledge (BRMGR), which utilizes re-ranking methods for both retrieved passages and LLM-generated passages. We pair the two types of passages using two separate re-ranking methods and then combine them through greedy matching. We demonstrate that BRMGR is equivalent to employing a bipartite matching loss when assigning each retrieved passage with a corresponding LLM-generated passage. The application of our model yielded experimental results from three datasets, improving their performance by +1.7 and +1.6 on NQ and WebQ datasets, respectively, and obtaining comparable result on TriviaQA dataset when compared to competitive baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18800v1-abstract-full').style.display = 'none'; document.getElementById('2412.18800v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11744">arXiv:2412.11744</a> <span> [<a href="https://arxiv.org/pdf/2412.11744">pdf</a>, <a href="https://arxiv.org/format/2412.11744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conditional Diffusion Models Based Conditional Independence Testing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yanfeng Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuai Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yingjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhuoran Sun</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziqi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11744v2-abstract-short" style="display: inline;"> Conditional independence (CI) testing is a fundamental task in modern statistics and machine learning. The conditional randomization test (CRT) was recently introduced to test whether two random variables, $X$ and $Y$, are conditionally independent given a potentially high-dimensional set of random variables, $Z$. The CRT operates exceptionally well under the assumption that the conditional distri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11744v2-abstract-full').style.display = 'inline'; document.getElementById('2412.11744v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11744v2-abstract-full" style="display: none;"> Conditional independence (CI) testing is a fundamental task in modern statistics and machine learning. The conditional randomization test (CRT) was recently introduced to test whether two random variables, $X$ and $Y$, are conditionally independent given a potentially high-dimensional set of random variables, $Z$. The CRT operates exceptionally well under the assumption that the conditional distribution $X|Z$ is known. However, since this distribution is typically unknown in practice, accurately approximating it becomes crucial. In this paper, we propose using conditional diffusion models (CDMs) to learn the distribution of $X|Z$. Theoretically and empirically, it is shown that CDMs closely approximate the true conditional distribution. Furthermore, CDMs offer a more accurate approximation of $X|Z$ compared to GANs, potentially leading to a CRT that performs better than those based on GANs. To accommodate complex dependency structures, we utilize a computationally efficient classifier-based conditional mutual information (CMI) estimator as our test statistic. The proposed testing procedure performs effectively without requiring assumptions about specific distribution forms or feature dependencies, and is capable of handling mixed-type conditioning sets that include both continuous and discrete variables. Theoretical analysis shows that our proposed test achieves a valid control of the type I error. A series of experiments on synthetic data demonstrates that our new test effectively controls both type-I and type-II errors, even in high dimensional scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11744v2-abstract-full').style.display = 'none'; document.getElementById('2412.11744v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures, aaai 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04907">arXiv:2411.04907</a> <span> [<a href="https://arxiv.org/pdf/2411.04907">pdf</a>, <a href="https://arxiv.org/format/2411.04907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Missing Data Imputation through Combined Bipartite Graph and Complete Directed Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hongtu Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziqi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yingjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04907v1-abstract-short" style="display: inline;"> In this paper, we aim to address a significant challenge in the field of missing data imputation: identifying and leveraging the interdependencies among features to enhance missing data imputation for tabular data. We introduce a novel framework named the Bipartite and Complete Directed Graph Neural Network (BCGNN). Within BCGNN, observations and features are differentiated as two distinct node ty… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04907v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04907v1-abstract-full" style="display: none;"> In this paper, we aim to address a significant challenge in the field of missing data imputation: identifying and leveraging the interdependencies among features to enhance missing data imputation for tabular data. We introduce a novel framework named the Bipartite and Complete Directed Graph Neural Network (BCGNN). Within BCGNN, observations and features are differentiated as two distinct node types, and the values of observed features are converted into attributed edges linking them. The bipartite segment of our framework inductively learns embedding representations for nodes, efficiently utilizing the comprehensive information encapsulated in the attributed edges. In parallel, the complete directed graph segment adeptly outlines and communicates the complex interdependencies among features. When compared to contemporary leading imputation methodologies, BCGNN consistently outperforms them, achieving a noteworthy average reduction of 15% in mean absolute error for feature imputation tasks under different missing mechanisms. Our extensive experimental investigation confirms that an in-depth grasp of the interdependence structure substantially enhances the model's feature embedding ability. We also highlight the model's superior performance in label prediction tasks involving missing data, and its formidable ability to generalize to unseen data points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04907v1-abstract-full').style.display = 'none'; document.getElementById('2411.04907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04473">arXiv:2411.04473</a> <span> [<a href="https://arxiv.org/pdf/2411.04473">pdf</a>, <a href="https://arxiv.org/format/2411.04473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ML-Promise: A Multilingual Dataset for Corporate Promise Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seki%2C+Y">Yohei Seki</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hakusen Shu</a>, <a href="/search/cs?searchtype=author&query=Lhuissier%2C+A">Ana茂s Lhuissier</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hanwool Lee</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Juyeon Kang</a>, <a href="/search/cs?searchtype=author&query=Day%2C+M">Min-Yuh Day</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chung-Chi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04473v1-abstract-short" style="display: inline;"> Promises made by politicians, corporate leaders, and public figures have a significant impact on public perception, trust, and institutional reputation. However, the complexity and volume of such commitments, coupled with difficulties in verifying their fulfillment, necessitate innovative methods for assessing their credibility. This paper introduces the concept of Promise Verification, a systemat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04473v1-abstract-full" style="display: none;"> Promises made by politicians, corporate leaders, and public figures have a significant impact on public perception, trust, and institutional reputation. However, the complexity and volume of such commitments, coupled with difficulties in verifying their fulfillment, necessitate innovative methods for assessing their credibility. This paper introduces the concept of Promise Verification, a systematic approach involving steps such as promise identification, evidence assessment, and the evaluation of timing for verification. We propose the first multilingual dataset, ML-Promise, which includes English, French, Chinese, Japanese, and Korean, aimed at facilitating in-depth verification of promises, particularly in the context of Environmental, Social, and Governance (ESG) reports. Given the growing emphasis on corporate environmental contributions, this dataset addresses the challenge of evaluating corporate promises, especially in light of practices like greenwashing. Our findings also explore textual and image-based baselines, with promising results from retrieval-augmented generation (RAG) approaches. This work aims to foster further discourse on the accountability of public commitments across multiple languages and domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04473v1-abstract-full').style.display = 'none'; document.getElementById('2411.04473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20026">arXiv:2410.20026</a> <span> [<a href="https://arxiv.org/pdf/2410.20026">pdf</a>, <a href="https://arxiv.org/format/2410.20026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Algorithms for Surgical Phase Recognition via Digital Twin-based Scene Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+X">Xu Lian</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J+W">Ji Woong Kim</a>, <a href="/search/cs?searchtype=author&query=Krieger%2C+A">Axel Krieger</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20026v1-abstract-short" style="display: inline;"> Purpose: Surgical phase recognition (SPR) is an integral component of surgical data science, enabling high-level surgical analysis. End-to-end trained neural networks that predict surgical phase directly from videos have shown excellent performance on benchmarks. However, these models struggle with robustness due to non-causal associations in the training set, resulting in poor generalizability. O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20026v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20026v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20026v1-abstract-full" style="display: none;"> Purpose: Surgical phase recognition (SPR) is an integral component of surgical data science, enabling high-level surgical analysis. End-to-end trained neural networks that predict surgical phase directly from videos have shown excellent performance on benchmarks. However, these models struggle with robustness due to non-causal associations in the training set, resulting in poor generalizability. Our goal is to improve model robustness to variations in the surgical videos by leveraging the digital twin (DT) paradigm -- an intermediary layer to separate high-level analysis (SPR) from low-level processing (geometric understanding). This approach takes advantage of the recent vision foundation models that ensure reliable low-level scene understanding to craft DT-based scene representations that support various high-level tasks. Methods: We present a DT-based framework for SPR from videos. The framework employs vision foundation models to extract representations. We embed the representation in place of raw video inputs in the state-of-the-art Surgformer model. The framework is trained on the Cholec80 dataset and evaluated on out-of-distribution (OOD) and corrupted test samples. Results: Contrary to the vulnerability of the baseline model, our framework demonstrates strong robustness on both OOD and corrupted samples, with a video-level accuracy of 51.1 on the challenging CRCD dataset, 96.0 on an internal robotics training dataset, and 64.4 on a highly corrupted Cholec80 test set. Conclusion: Our findings lend support to the thesis that DT-based scene representations are effective in enhancing model robustness. Future work will seek to improve the feature informativeness, automate feature extraction, and incorporate interpretability for a more comprehensive framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20026v1-abstract-full').style.display = 'none'; document.getElementById('2410.20026v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00386">arXiv:2410.00386</a> <span> [<a href="https://arxiv.org/pdf/2410.00386">pdf</a>, <a href="https://arxiv.org/format/2410.00386">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Seamless Augmented Reality Integration in Arthroscopy: A Pipeline for Articular Reconstruction and Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingxu Liu</a>, <a href="/search/cs?searchtype=author&query=Seenivasan%2C+L">Lalithkumar Seenivasan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+S">Suxi Gu</a>, <a href="/search/cs?searchtype=author&query=Ku%2C+P">Ping-Cheng Ku</a>, <a href="/search/cs?searchtype=author&query=Knopf%2C+J">Jonathan Knopf</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+R">Russell Taylor</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00386v1-abstract-short" style="display: inline;"> Arthroscopy is a minimally invasive surgical procedure used to diagnose and treat joint problems. The clinical workflow of arthroscopy typically involves inserting an arthroscope into the joint through a small incision, during which surgeons navigate and operate largely by relying on their visual assessment through the arthroscope. However, the arthroscope's restricted field of view and lack of de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00386v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00386v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00386v1-abstract-full" style="display: none;"> Arthroscopy is a minimally invasive surgical procedure used to diagnose and treat joint problems. The clinical workflow of arthroscopy typically involves inserting an arthroscope into the joint through a small incision, during which surgeons navigate and operate largely by relying on their visual assessment through the arthroscope. However, the arthroscope's restricted field of view and lack of depth perception pose challenges in navigating complex articular structures and achieving surgical precision during procedures. Aiming at enhancing intraoperative awareness, we present a robust pipeline that incorporates simultaneous localization and mapping, depth estimation, and 3D Gaussian splatting to realistically reconstruct intra-articular structures solely based on monocular arthroscope video. Extending 3D reconstruction to Augmented Reality (AR) applications, our solution offers AR assistance for articular notch measurement and annotation anchoring in a human-in-the-loop manner. Compared to traditional Structure-from-Motion and Neural Radiance Field-based methods, our pipeline achieves dense 3D reconstruction and competitive rendering fidelity with explicit 3D representation in 7 minutes on average. When evaluated on four phantom datasets, our method achieves RMSE = 2.21mm reconstruction error, PSNR = 32.86 and SSIM = 0.89 on average. Because our pipeline enables AR reconstruction and guidance directly from monocular arthroscopy without any additional data and/or hardware, our solution may hold the potential for enhancing intraoperative awareness and facilitating surgical precision in arthroscopy. Our AR measurement tool achieves accuracy within 1.59 +/- 1.81mm and the AR annotation tool achieves a mIoU of 0.721. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00386v1-abstract-full').style.display = 'none'; document.getElementById('2410.00386v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, with 2 additional pages as the supplementary. Accepted by AE-CAI 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13107">arXiv:2409.13107</a> <span> [<a href="https://arxiv.org/pdf/2409.13107">pdf</a>, <a href="https://arxiv.org/format/2409.13107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Automation of Surgical Systems via Digital Twin-based Scene Representations from Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&query=Seenivasan%2C+L">Lalithkumar Seenivasan</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Byrd%2C+G">Grayson Byrd</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Pu Xiao</a>, <a href="/search/cs?searchtype=author&query=Barragan%2C+J+A">Juan Antonio Barragan</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+R+H">Russell H. Taylor</a>, <a href="/search/cs?searchtype=author&query=Kazanzides%2C+P">Peter Kazanzides</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13107v2-abstract-short" style="display: inline;"> Large language model-based (LLM) agents are emerging as a powerful enabler of robust embodied intelligence due to their capability of planning complex action sequences. Sound planning ability is necessary for robust automation in many task domains, but especially in surgical automation. These agents rely on a highly detailed natural language representation of the scene. Thus, to leverage the emerg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13107v2-abstract-full').style.display = 'inline'; document.getElementById('2409.13107v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13107v2-abstract-full" style="display: none;"> Large language model-based (LLM) agents are emerging as a powerful enabler of robust embodied intelligence due to their capability of planning complex action sequences. Sound planning ability is necessary for robust automation in many task domains, but especially in surgical automation. These agents rely on a highly detailed natural language representation of the scene. Thus, to leverage the emergent capabilities of LLM agents for surgical task planning, developing similarly powerful and robust perception algorithms is necessary to derive a detailed scene representation of the environment from visual input. Previous research has focused primarily on enabling LLM-based task planning while adopting simple yet severely limited perception solutions to meet the needs for bench-top experiments but lack the critical flexibility to scale to less constrained settings. In this work, we propose an alternate perception approach -- a digital twin-based machine perception approach that capitalizes on the convincing performance and out-of-the-box generalization of recent vision foundation models. Integrating our digital twin-based scene representation and LLM agent for planning with the dVRK platform, we develop an embodied intelligence system and evaluate its robustness in performing peg transfer and gauze retrieval tasks. Our approach shows strong task performance and generalizability to varied environment settings. Despite convincing performance, this work is merely a first step towards the integration of digital twin-based scene representations. Future studies are necessary for the realization of a comprehensive digital twin framework to improve the interpretability and generalizability of embodied intelligence in surgery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13107v2-abstract-full').style.display = 'none'; document.getElementById('2409.13107v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06690">arXiv:2409.06690</a> <span> [<a href="https://arxiv.org/pdf/2409.06690">pdf</a>, <a href="https://arxiv.org/format/2409.06690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Sub-Genre Classification For Mainstage Dance Music </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongzhi Shu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinglin Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hongyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+M">Minghao Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06690v1-abstract-short" style="display: inline;"> Music classification, with a wide range of applications, is one of the most prominent tasks in music information retrieval. To address the absence of comprehensive datasets and high-performing methods in the classification of mainstage dance music, this work introduces a novel benchmark comprising a new dataset and a baseline. Our dataset extends the number of sub-genres to cover most recent mains… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06690v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06690v1-abstract-full" style="display: none;"> Music classification, with a wide range of applications, is one of the most prominent tasks in music information retrieval. To address the absence of comprehensive datasets and high-performing methods in the classification of mainstage dance music, this work introduces a novel benchmark comprising a new dataset and a baseline. Our dataset extends the number of sub-genres to cover most recent mainstage live sets by top DJs worldwide in music festivals. A continuous soft labeling approach is employed to account for tracks that span multiple sub-genres, preserving the inherent sophistication. For the baseline, we developed deep learning models that outperform current state-of-the-art multimodel language models, which struggle to identify house music sub-genres, emphasizing the need for specialized models trained on fine-grained datasets. Our benchmark is applicable to serve for application scenarios such as music recommendation, DJ set curation, and interactive multimedia, where we also provide video demos. Our code is on \url{https://anonymous.4open.science/r/Mainstage-EDM-Benchmark/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06690v1-abstract-full').style.display = 'none'; document.getElementById('2409.06690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01622">arXiv:2409.01622</a> <span> [<a href="https://arxiv.org/pdf/2409.01622">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> T1-contrast Enhanced MRI Generation from Multi-parametric MRI for Glioma Patients with Latent Tumor Conditioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Eidex%2C+Z">Zach Eidex</a>, <a href="/search/cs?searchtype=author&query=Safari%2C+M">Mojtaba Safari</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R+L+J">Richard L. J. Qiu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D+S">David S. Yu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui-Kuo Shu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+H">Hui Mao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01622v1-abstract-short" style="display: inline;"> Objective: Gadolinium-based contrast agents (GBCAs) are commonly used in MRI scans of patients with gliomas to enhance brain tumor characterization using T1-weighted (T1W) MRI. However, there is growing concern about GBCA toxicity. This study develops a deep-learning framework to generate T1-postcontrast (T1C) from pre-contrast multiparametric MRI. Approach: We propose the tumor-aware vision trans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01622v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01622v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01622v1-abstract-full" style="display: none;"> Objective: Gadolinium-based contrast agents (GBCAs) are commonly used in MRI scans of patients with gliomas to enhance brain tumor characterization using T1-weighted (T1W) MRI. However, there is growing concern about GBCA toxicity. This study develops a deep-learning framework to generate T1-postcontrast (T1C) from pre-contrast multiparametric MRI. Approach: We propose the tumor-aware vision transformer (TA-ViT) model that predicts high-quality T1C images. The predicted tumor region is significantly improved (P < .001) by conditioning the transformer layers from predicted segmentation maps through adaptive layer norm zero mechanism. The predicted segmentation maps were generated with the multi-parametric residual (MPR) ViT model and transformed into a latent space to produce compressed, feature-rich representations. The TA-ViT model predicted T1C MRI images of 501 glioma cases. Selected patients were split into training (N=400), validation (N=50), and test (N=51) sets. Main Results: Both qualitative and quantitative results demonstrate that the TA-ViT model performs superior against the benchmark MRP-ViT model. Our method produces synthetic T1C MRI with high soft tissue contrast and more accurately reconstructs both the tumor and whole brain volumes. The synthesized T1C images achieved remarkable improvements in both tumor and healthy tissue regions compared to the MRP-ViT model. For healthy tissue and tumor regions, the results were as follows: NMSE: 8.53 +/- 4.61E-4; PSNR: 31.2 +/- 2.2; NCC: 0.908 +/- .041 and NMSE: 1.22 +/- 1.27E-4, PSNR: 41.3 +/- 4.7, and NCC: 0.879 +/- 0.042, respectively. Significance: The proposed method generates synthetic T1C images that closely resemble real T1C images. Future development and application of this approach may enable contrast-agent-free MRI for brain tumor patients, eliminating the risk of GBCA toxicity and simplifying the MRI scan protocol. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01622v1-abstract-full').style.display = 'none'; document.getElementById('2409.01622v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2407.02616</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01760">arXiv:2408.01760</a> <span> [<a href="https://arxiv.org/pdf/2408.01760">pdf</a>, <a href="https://arxiv.org/format/2408.01760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Equivalent Mutant Detection: How Far Are We? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+Z">Zhao Tian</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Honglin Shu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dong Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuejie Cao</a>, <a href="/search/cs?searchtype=author&query=Kamei%2C+Y">Yasutaka Kamei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junjie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01760v1-abstract-short" style="display: inline;"> Mutation testing is vital for ensuring software quality. However, the presence of equivalent mutants is known to introduce redundant cost and bias issues, hindering the effectiveness of mutation testing in practical use. Although numerous equivalent mutant detection (EMD) techniques have been proposed, they exhibit limitations due to the scarcity of training data and challenges in generalizing to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01760v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01760v1-abstract-full" style="display: none;"> Mutation testing is vital for ensuring software quality. However, the presence of equivalent mutants is known to introduce redundant cost and bias issues, hindering the effectiveness of mutation testing in practical use. Although numerous equivalent mutant detection (EMD) techniques have been proposed, they exhibit limitations due to the scarcity of training data and challenges in generalizing to unseen mutants. Recently, large language models (LLMs) have been extensively adopted in various code-related tasks and have shown superior performance by more accurately capturing program semantics. Yet the performance of LLMs in equivalent mutant detection remains largely unclear. In this paper, we conduct an empirical study on 3,302 method-level Java mutant pairs to comprehensively investigate the effectiveness and efficiency of LLMs for equivalent mutant detection. Specifically, we assess the performance of LLMs compared to existing EMD techniques, examine the various strategies of LLMs, evaluate the orthogonality between EMD techniques, and measure the time overhead of training and inference. Our findings demonstrate that LLM-based techniques significantly outperform existing techniques (i.e., the average improvement of 35.69% in terms of F1-score), with the fine-tuned code embedding strategy being the most effective. Moreover, LLM-based techniques offer an excellent balance between cost (relatively low training and inference time) and effectiveness. Based on our findings, we further discuss the impact of model size and embedding quality, and provide several promising directions for future research. This work is the first to examine LLMs in equivalent mutant detection, affirming their effectiveness and efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01760v1-abstract-full').style.display = 'none'; document.getElementById('2408.01760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by ISSTA'2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00273">arXiv:2408.00273</a> <span> [<a href="https://arxiv.org/pdf/2408.00273">pdf</a>, <a href="https://arxiv.org/format/2408.00273">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D U-KAN Implementation for Multi-modal MRI Brain Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+T">Tianze Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanbing Chen</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00273v1-abstract-short" style="display: inline;"> We explore the application of U-KAN, a U-Net based network enhanced with Kolmogorov-Arnold Network (KAN) layers, for 3D brain tumor segmentation using multi-modal MRI data. We adapt the original 2D U-KAN model to the 3D task, and introduce a variant called UKAN-SE, which incorporates Squeeze-and-Excitation modules for global attention. We compare the performance of U-KAN and UKAN-SE against existi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00273v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00273v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00273v1-abstract-full" style="display: none;"> We explore the application of U-KAN, a U-Net based network enhanced with Kolmogorov-Arnold Network (KAN) layers, for 3D brain tumor segmentation using multi-modal MRI data. We adapt the original 2D U-KAN model to the 3D task, and introduce a variant called UKAN-SE, which incorporates Squeeze-and-Excitation modules for global attention. We compare the performance of U-KAN and UKAN-SE against existing methods such as U-Net, Attention U-Net, and Swin UNETR, using the BraTS 2024 dataset. Our results show that U-KAN and UKAN-SE, with approximately 10.6 million parameters, achieve exceptional efficiency, requiring only about 1/4 of the training time of U-Net and Attention U-Net, and 1/6 that of Swin UNETR, while surpassing these models across most evaluation metrics. Notably, UKAN-SE slightly outperforms U-KAN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00273v1-abstract-full').style.display = 'none'; document.getElementById('2408.00273v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19992">arXiv:2407.19992</a> <span> [<a href="https://arxiv.org/pdf/2407.19992">pdf</a>, <a href="https://arxiv.org/format/2407.19992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> More precise edge detections </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19992v3-abstract-short" style="display: inline;"> Image Edge detection (ED) is a base task in computer vision. While the performance of the ED algorithm has been improved greatly by introducing CNN-based models, current models still suffer from unsatisfactory precision rates especially when only a low error toleration distance is allowed. Therefore, model architecture for more precise predictions still needs an investigation. On the other hand, t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19992v3-abstract-full').style.display = 'inline'; document.getElementById('2407.19992v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19992v3-abstract-full" style="display: none;"> Image Edge detection (ED) is a base task in computer vision. While the performance of the ED algorithm has been improved greatly by introducing CNN-based models, current models still suffer from unsatisfactory precision rates especially when only a low error toleration distance is allowed. Therefore, model architecture for more precise predictions still needs an investigation. On the other hand, the unavoidable noise training data provided by humans would lead to unsatisfactory model predictions even when inputs are edge maps themselves, which also needs a solution. In this paper, more precise ED models are presented with cascaded skipping density blocks (CSDB). Our models obtain state-of-the-art(SOTA) predictions in several datasets, especially in average precision rate (AP), over a high-standard benchmark, which is confirmed by extensive experiments. Also, a novel modification on data augmentation for training is employed, which allows noiseless data to be employed in model training for the first time, and thus further improves the model performance. The relative Python codes can be found on https://github.com/Hao-B-Shu/SDPED. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19992v3-abstract-full').style.display = 'none'; document.getElementById('2407.19992v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11906">arXiv:2407.11906</a> <span> [<a href="https://arxiv.org/pdf/2407.11906">pdf</a>, <a href="https://arxiv.org/format/2407.11906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SegSTRONG-C: Segmenting Surgical Tools Robustly On Non-adversarial Generated Corruptions -- An EndoVis'24 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+T">Tuxun Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruixing Liang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Seenivasan%2C+L">Lalithkumar Seenivasan</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yonghao Long</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Q">Qi Dou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Cong Gao</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11906v1-abstract-short" style="display: inline;"> Accurate segmentation of tools in robot-assisted surgery is critical for machine perception, as it facilitates numerous downstream tasks including augmented reality feedback. While current feed-forward neural network-based methods exhibit excellent segmentation performance under ideal conditions, these models have proven susceptible to even minor corruptions, significantly impairing the model's pe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11906v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11906v1-abstract-full" style="display: none;"> Accurate segmentation of tools in robot-assisted surgery is critical for machine perception, as it facilitates numerous downstream tasks including augmented reality feedback. While current feed-forward neural network-based methods exhibit excellent segmentation performance under ideal conditions, these models have proven susceptible to even minor corruptions, significantly impairing the model's performance. This vulnerability is especially problematic in surgical settings where predictions might be used to inform high-stakes decisions. To better understand model behavior under non-adversarial corruptions, prior work has explored introducing artificial corruptions, like Gaussian noise or contrast perturbation to test set images, to assess model robustness. However, these corruptions are either not photo-realistic or model/task agnostic. Thus, these investigations provide limited insights into model deterioration under realistic surgical corruptions. To address this limitation, we introduce the SegSTRONG-C challenge that aims to promote the development of algorithms robust to unforeseen but plausible image corruptions of surgery, like smoke, bleeding, and low brightness. We collect and release corruption-free mock endoscopic video sequences for the challenge participants to train their algorithms and benchmark them on video sequences with photo-realistic non-adversarial corruptions for a binary robot tool segmentation task. This new benchmark will allow us to carefully study neural network robustness to non-adversarial corruptions of surgery, thus constituting an important first step towards more robust models for surgical computer vision. In this paper, we describe the data collection and annotation protocol, baseline evaluations of established segmentation models, and data augmentation-based techniques to enhance model robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11906v1-abstract-full').style.display = 'none'; document.getElementById('2407.11906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02616">arXiv:2407.02616</a> <span> [<a href="https://arxiv.org/pdf/2407.02616">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning Based Apparent Diffusion Coefficient Map Generation from Multi-parametric MR Images for Patients with Diffuse Gliomas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Eidex%2C+Z">Zach Eidex</a>, <a href="/search/cs?searchtype=author&query=Safari%2C+M">Mojtaba Safari</a>, <a href="/search/cs?searchtype=author&query=Wynne%2C+J">Jacob Wynne</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R+L+J">Richard L. J. Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tonghe Wang</a>, <a href="/search/cs?searchtype=author&query=Hernandez%2C+D+V">David Viar Hernandez</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui-Kuo Shu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+H">Hui Mao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02616v2-abstract-short" style="display: inline;"> Purpose: Apparent diffusion coefficient (ADC) maps derived from diffusion weighted (DWI) MRI provides functional measurements about the water molecules in tissues. However, DWI is time consuming and very susceptible to image artifacts, leading to inaccurate ADC measurements. This study aims to develop a deep learning framework to synthesize ADC maps from multi-parametric MR images. Methods: We pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02616v2-abstract-full').style.display = 'inline'; document.getElementById('2407.02616v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02616v2-abstract-full" style="display: none;"> Purpose: Apparent diffusion coefficient (ADC) maps derived from diffusion weighted (DWI) MRI provides functional measurements about the water molecules in tissues. However, DWI is time consuming and very susceptible to image artifacts, leading to inaccurate ADC measurements. This study aims to develop a deep learning framework to synthesize ADC maps from multi-parametric MR images. Methods: We proposed the multiparametric residual vision transformer model (MPR-ViT) that leverages the long-range context of ViT layers along with the precision of convolutional operators. Residual blocks throughout the network significantly increasing the representational power of the model. The MPR-ViT model was applied to T1w and T2- fluid attenuated inversion recovery images of 501 glioma cases from a publicly available dataset including preprocessed ADC maps. Selected patients were divided into training (N=400), validation (N=50) and test (N=51) sets, respectively. Using the preprocessed ADC maps as ground truth, model performance was evaluated and compared against the Vision Convolutional Transformer (VCT) and residual vision transformer (ResViT) models. Results: The results are as follows using T1w + T2-FLAIR MRI as inputs: MPR-ViT - PSNR: 31.0 +/- 2.1, MSE: 0.009 +/- 0.0005, SSIM: 0.950 +/- 0.015. In addition, ablation studies showed the relative impact on performance of each input sequence. Both qualitative and quantitative results indicate that the proposed MR- ViT model performs favorably against the ground truth data. Conclusion: We show that high-quality ADC maps can be synthesized from structural MRI using a MPR- VCT model. Our predicted images show better conformality to the ground truth volume than ResViT and VCT predictions. These high-quality synthetic ADC maps would be particularly useful for disease diagnosis and intervention, especially when ADC maps have artifacts or are unavailable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02616v2-abstract-full').style.display = 'none'; document.getElementById('2407.02616v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2311.15044</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00730">arXiv:2407.00730</a> <span> [<a href="https://arxiv.org/pdf/2407.00730">pdf</a>, <a href="https://arxiv.org/format/2407.00730">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> D-CDLF: Decomposition of Common and Distinctive Latent Factors for Multi-view High-dimensional Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00730v2-abstract-short" style="display: inline;"> A typical approach to the joint analysis of multiple high-dimensional data views is to decompose each view's data matrix into three parts: a low-rank common-source matrix generated by common latent factors of all data views, a low-rank distinctive-source matrix generated by distinctive latent factors of the corresponding data view, and an additive noise matrix. Existing decomposition methods often… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00730v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00730v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00730v2-abstract-full" style="display: none;"> A typical approach to the joint analysis of multiple high-dimensional data views is to decompose each view's data matrix into three parts: a low-rank common-source matrix generated by common latent factors of all data views, a low-rank distinctive-source matrix generated by distinctive latent factors of the corresponding data view, and an additive noise matrix. Existing decomposition methods often focus on the uncorrelatedness between the common latent factors and distinctive latent factors, but inadequately address the equally necessary uncorrelatedness between distinctive latent factors from different data views. We propose a novel decomposition method, called Decomposition of Common and Distinctive Latent Factors (D-CDLF), to effectively achieve both types of uncorrelatedness for two-view data. We also discuss the estimation of the D-CDLF under high-dimensional settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00730v2-abstract-full').style.display = 'none'; document.getElementById('2407.00730v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This revision updates only Paragraph 1 of Section 2.1 and Remark 2 of Section 3.2 from version 1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11257">arXiv:2406.11257</a> <span> [<a href="https://arxiv.org/pdf/2406.11257">pdf</a>, <a href="https://arxiv.org/format/2406.11257">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExCP: Extreme LLM Checkpoint Compression via Weight-Momentum Joint Shrinking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenshuo Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinghao Chen</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Han Shu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yehui Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhe Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11257v1-abstract-short" style="display: inline;"> Large language models (LLM) have recently attracted significant attention in the field of artificial intelligence. However, the training process of these models poses significant challenges in terms of computational and storage capacities, thus compressing checkpoints has become an urgent problem. In this paper, we propose a novel Extreme Checkpoint Compression (ExCP) framework, which significantl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11257v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11257v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11257v1-abstract-full" style="display: none;"> Large language models (LLM) have recently attracted significant attention in the field of artificial intelligence. However, the training process of these models poses significant challenges in terms of computational and storage capacities, thus compressing checkpoints has become an urgent problem. In this paper, we propose a novel Extreme Checkpoint Compression (ExCP) framework, which significantly reduces the required storage of training checkpoints while achieving nearly lossless performance. We first calculate the residuals of adjacent checkpoints to obtain the essential but sparse information for higher compression ratio. To further excavate the redundancy parameters in checkpoints, we then propose a weight-momentum joint shrinking method to utilize another important information during the model optimization, i.e., momentum. In particular, we exploit the information of both model and optimizer to discard as many parameters as possible while preserving critical information to ensure optimal performance. Furthermore, we utilize non-uniform quantization to further compress the storage of checkpoints. We extensively evaluate our proposed ExCP framework on several models ranging from 410M to 7B parameters and demonstrate significant storage reduction while maintaining strong performance. For instance, we achieve approximately $70\times$ compression for the Pythia-410M model, with the final performance being as accurate as the original model on various downstream tasks. Codes will be available at https://github.com/Gaffey/ExCP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11257v1-abstract-full').style.display = 'none'; document.getElementById('2406.11257v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024 oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15230">arXiv:2405.15230</a> <span> [<a href="https://arxiv.org/pdf/2405.15230">pdf</a>, <a href="https://arxiv.org/format/2405.15230">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> $i$REPO: $i$mplicit Reward Pairwise Difference based Empirical Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+L+T">Long Tan Le</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Han Shu</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Tung-Anh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+N+H">Nguyen H. Tran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15230v2-abstract-short" style="display: inline;"> While astonishingly capable, large Language Models (LLM) can sometimes produce outputs that deviate from human expectations. Such deviations necessitate an alignment phase to prevent disseminating untruthful, toxic, or biased information. Traditional alignment methods based on reinforcement learning often struggle with the identified instability, whereas preference optimization methods are limited… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15230v2-abstract-full').style.display = 'inline'; document.getElementById('2405.15230v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15230v2-abstract-full" style="display: none;"> While astonishingly capable, large Language Models (LLM) can sometimes produce outputs that deviate from human expectations. Such deviations necessitate an alignment phase to prevent disseminating untruthful, toxic, or biased information. Traditional alignment methods based on reinforcement learning often struggle with the identified instability, whereas preference optimization methods are limited by their overfitting to pre-collected hard-label datasets. In this paper, we propose a novel LLM alignment framework named $i$REPO, which utilizes implicit Reward pairwise difference regression for Empirical Preference Optimization. Particularly, $i$REPO employs self-generated datasets labeled by empirical human (or AI annotator) preference to iteratively refine the aligned policy through a novel regression-based loss function. Furthermore, we introduce an innovative algorithm backed by theoretical guarantees for achieving optimal results under ideal assumptions and providing a practical performance-gap result without such assumptions. Experimental results with Phi-2 and Mistral-7B demonstrate that $i$REPO effectively achieves self-alignment using soft-label, self-generated responses and the logit of empirical AI annotators. Furthermore, our approach surpasses preference optimization baselines in evaluations using the Language Model Evaluation Harness and Multi-turn benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15230v2-abstract-full').style.display = 'none'; document.getElementById('2405.15230v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06668">arXiv:2404.06668</a> <span> [<a href="https://arxiv.org/pdf/2404.06668">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Forecasting the Future with Future Technologies: Advancements in Large Meteorological Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hailong Shu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+W">Weiwei Song</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Huichuang Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhen Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06668v1-abstract-short" style="display: inline;"> The field of meteorological forecasting has undergone a significant transformation with the integration of large models, especially those employing deep learning techniques. This paper reviews the advancements and applications of these models in weather prediction, emphasizing their role in transforming traditional forecasting methods. Models like FourCastNet, Pangu-Weather, GraphCast, ClimaX, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06668v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06668v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06668v1-abstract-full" style="display: none;"> The field of meteorological forecasting has undergone a significant transformation with the integration of large models, especially those employing deep learning techniques. This paper reviews the advancements and applications of these models in weather prediction, emphasizing their role in transforming traditional forecasting methods. Models like FourCastNet, Pangu-Weather, GraphCast, ClimaX, and FengWu have made notable contributions by providing accurate, high-resolution forecasts, surpassing the capabilities of traditional Numerical Weather Prediction (NWP) models. These models utilize advanced neural network architectures, such as Convolutional Neural Networks (CNNs), Graph Neural Networks (GNNs), and Transformers, to process diverse meteorological data, enhancing predictive accuracy across various time scales and spatial resolutions. The paper addresses challenges in this domain, including data acquisition and computational demands, and explores future opportunities for model optimization and hardware advancements. It underscores the integration of artificial intelligence with conventional meteorological techniques, promising improved weather prediction accuracy and a significant contribution to addressing climate-related challenges. This synergy positions large models as pivotal in the evolving landscape of meteorological forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06668v1-abstract-full').style.display = 'none'; document.getElementById('2404.06668v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10004">arXiv:2403.10004</a> <span> [<a href="https://arxiv.org/pdf/2403.10004">pdf</a>, <a href="https://arxiv.org/format/2403.10004">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ST-LDM: A Universal Framework for Text-Grounded Object Generation in Real Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangtian Xue</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiasong Wu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Youyong Kong</a>, <a href="/search/cs?searchtype=author&query=Senhadji%2C+L">Lotfi Senhadji</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10004v1-abstract-short" style="display: inline;"> We present a novel image editing scenario termed Text-grounded Object Generation (TOG), defined as generating a new object in the real image spatially conditioned by textual descriptions. Existing diffusion models exhibit limitations of spatial perception in complex real-world scenes, relying on additional modalities to enforce constraints, and TOG imposes heightened challenges on scene comprehens… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10004v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10004v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10004v1-abstract-full" style="display: none;"> We present a novel image editing scenario termed Text-grounded Object Generation (TOG), defined as generating a new object in the real image spatially conditioned by textual descriptions. Existing diffusion models exhibit limitations of spatial perception in complex real-world scenes, relying on additional modalities to enforce constraints, and TOG imposes heightened challenges on scene comprehension under the weak supervision of linguistic information. We propose a universal framework ST-LDM based on Swin-Transformer, which can be integrated into any latent diffusion model with training-free backward guidance. ST-LDM encompasses a global-perceptual autoencoder with adaptable compression scales and hierarchical visual features, parallel with deformable multimodal transformer to generate region-wise guidance for the subsequent denoising process. We transcend the limitation of traditional attention mechanisms that only focus on existing visual features by introducing deformable feature alignment to hierarchically refine spatial positioning fused with multi-scale visual and linguistic information. Extensive Experiments demonstrate that our model enhances the localization of attention mechanisms while preserving the generative capabilities inherent to diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10004v1-abstract-full').style.display = 'none'; document.getElementById('2403.10004v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09128">arXiv:2403.09128</a> <span> [<a href="https://arxiv.org/pdf/2403.09128">pdf</a>, <a href="https://arxiv.org/format/2403.09128">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Referring Object Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangtian Xue</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiasong Wu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Youyong Kong</a>, <a href="/search/cs?searchtype=author&query=Senhadji%2C+L">Lotfi Senhadji</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09128v1-abstract-short" style="display: inline;"> Referring object removal refers to removing the specific object in an image referred by natural language expressions and filling the missing region with reasonable semantics. To address this task, we construct the ComCOCO, a synthetic dataset consisting of 136,495 referring expressions for 34,615 objects in 23,951 image pairs. Each pair contains an image with referring expressions and the ground t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09128v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09128v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09128v1-abstract-full" style="display: none;"> Referring object removal refers to removing the specific object in an image referred by natural language expressions and filling the missing region with reasonable semantics. To address this task, we construct the ComCOCO, a synthetic dataset consisting of 136,495 referring expressions for 34,615 objects in 23,951 image pairs. Each pair contains an image with referring expressions and the ground truth after elimination. We further propose an end-to-end syntax-aware hybrid mapping network with an encoding-decoding structure. Linguistic features are hierarchically extracted at the syntactic level and fused in the downsampling process of visual features with multi-head attention. The feature-aligned pyramid network is leveraged to generate segmentation masks and replace internal pixels with region affinity learned from external semantics in high-level feature maps. Extensive experiments demonstrate that our model outperforms diffusion models and two-stage methods which process the segmentation and inpainting task separately by a significant margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09128v1-abstract-full').style.display = 'none'; document.getElementById('2403.09128v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08157">arXiv:2403.08157</a> <span> [<a href="https://arxiv.org/pdf/2403.08157">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multiscale Low-Frequency Memory Network for Improved Feature Extraction in Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fuzhi Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiasong Wu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Youyong Kong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chunfeng Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a>, <a href="/search/cs?searchtype=author&query=Carrault%2C+G">Guy Carrault</a>, <a href="/search/cs?searchtype=author&query=Senhadji%2C+L">Lotfi Senhadji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08157v1-abstract-short" style="display: inline;"> Deep learning and Convolutional Neural Networks (CNNs) have driven major transformations in diverse research areas. However, their limitations in handling low-frequency information present obstacles in certain tasks like interpreting global structures or managing smooth transition images. Despite the promising performance of transformer structures in numerous tasks, their intricate optimization co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08157v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08157v1-abstract-full" style="display: none;"> Deep learning and Convolutional Neural Networks (CNNs) have driven major transformations in diverse research areas. However, their limitations in handling low-frequency information present obstacles in certain tasks like interpreting global structures or managing smooth transition images. Despite the promising performance of transformer structures in numerous tasks, their intricate optimization complexities highlight the persistent need for refined CNN enhancements using limited resources. Responding to these complexities, we introduce a novel framework, the Multiscale Low-Frequency Memory (MLFM) Network, with the goal to harness the full potential of CNNs while keeping their complexity unchanged. The MLFM efficiently preserves low-frequency information, enhancing performance in targeted computer vision tasks. Central to our MLFM is the Low-Frequency Memory Unit (LFMU), which stores various low-frequency data and forms a parallel channel to the core network. A key advantage of MLFM is its seamless compatibility with various prevalent networks, requiring no alterations to their original core structure. Testing on ImageNet demonstrated substantial accuracy improvements in multiple 2D CNNs, including ResNet, MobileNet, EfficientNet, and ConvNeXt. Furthermore, we showcase MLFM's versatility beyond traditional image classification by successfully integrating it into image-to-image translation tasks, specifically in semantic segmentation networks like FCN and U-Net. In conclusion, our work signifies a pivotal stride in the journey of optimizing the efficacy and efficiency of CNNs with limited resources. This research builds upon the existing CNN foundations and paves the way for future advancements in computer vision. Our codes are available at https://github.com/AlphaWuSeu/ MLFM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08157v1-abstract-full').style.display = 'none'; document.getElementById('2403.08157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 10 figures,6 tables. AAAI 2024 conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03492">arXiv:2402.03492</a> <span> [<a href="https://arxiv.org/pdf/2402.03492">pdf</a>, <a href="https://arxiv.org/format/2402.03492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Strong labels: Weakly-supervised Learning Based on Gaussian Pseudo Labels for The Segmentation of Ellipse-like Vascular Structures in Non-contrast CTs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qixiang Ma</a>, <a href="/search/cs?searchtype=author&query=%C5%81ucas%2C+A">Antoine 艁ucas</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a>, <a href="/search/cs?searchtype=author&query=Kaladji%2C+A">Adrien Kaladji</a>, <a href="/search/cs?searchtype=author&query=Haigron%2C+P">Pascal Haigron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03492v3-abstract-short" style="display: inline;"> Deep-learning-based automated segmentation of vascular structures in preoperative CT scans contributes to computer-assisted diagnosis and intervention procedure in vascular diseases. While CT angiography (CTA) is the common standard, non-contrast CT imaging is significant as a contrast-risk-free alternative, avoiding complications associated with contrast agents. However, the challenges of labor-i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03492v3-abstract-full').style.display = 'inline'; document.getElementById('2402.03492v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03492v3-abstract-full" style="display: none;"> Deep-learning-based automated segmentation of vascular structures in preoperative CT scans contributes to computer-assisted diagnosis and intervention procedure in vascular diseases. While CT angiography (CTA) is the common standard, non-contrast CT imaging is significant as a contrast-risk-free alternative, avoiding complications associated with contrast agents. However, the challenges of labor-intensive labeling and high labeling variability due to the ambiguity of vascular boundaries hinder conventional strong-label-based, fully-supervised learning in non-contrast CTs. This paper introduces a weakly-supervised framework using ellipses' topology in slices, including 1) an efficient annotation process based on predefined standards, 2) ellipse-fitting processing, 3) the generation of 2D Gaussian heatmaps serving as pseudo labels, 4) a training process through a combination of voxel reconstruction loss and distribution loss with the pseudo labels. We assess the effectiveness of the proposed method on one local and two public datasets comprising non-contrast CT scans, particularly focusing on the abdominal aorta. On the local dataset, our weakly-supervised learning approach based on pseudo labels outperforms strong-label-based fully-supervised learning (1.54\% of Dice score on average), reducing labeling time by around 82.0\%. The efficiency in generating pseudo labels allows the inclusion of label-agnostic external data in the training set, leading to an additional improvement in performance (2.74\% of Dice score on average) with a reduction of 66.3\% labeling time, where the labeling time remains considerably less than that of strong labels. On the public dataset, the pseudo labels achieve an overall improvement of 1.95\% in Dice score for 2D models while a reduction of 11.65 voxel spacing in Hausdorff distance for 3D model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03492v3-abstract-full').style.display = 'none'; document.getElementById('2402.03492v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by journal of Medical Image Analysis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13789">arXiv:2312.13789</a> <span> [<a href="https://arxiv.org/pdf/2312.13789">pdf</a>, <a href="https://arxiv.org/format/2312.13789">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TinySAM: Pushing the Envelope for Efficient Segment Anything Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Han Shu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenshuo Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yehui Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiman Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yihao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhe Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinghao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13789v3-abstract-short" style="display: inline;"> Recently segment anything model (SAM) has shown powerful segmentation capability and has drawn great attention in computer vision fields. Massive following works have developed various applications based on the pre-trained SAM and achieved impressive performance on downstream vision tasks. However, SAM consists of heavy architectures and requires massive computational capacity, which hinders the f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13789v3-abstract-full').style.display = 'inline'; document.getElementById('2312.13789v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13789v3-abstract-full" style="display: none;"> Recently segment anything model (SAM) has shown powerful segmentation capability and has drawn great attention in computer vision fields. Massive following works have developed various applications based on the pre-trained SAM and achieved impressive performance on downstream vision tasks. However, SAM consists of heavy architectures and requires massive computational capacity, which hinders the further application of SAM on computation constrained edge devices. To this end, in this paper we propose a framework to obtain a tiny segment anything model (TinySAM) while maintaining the strong zero-shot performance. We first propose a full-stage knowledge distillation method with hard prompt sampling and hard mask weighting strategy to distill a lightweight student model. We also adapt the post-training quantization to the prompt-based segmentation task and further reduce the computational cost. Moreover, a hierarchical segmenting everything strategy is proposed to accelerate the everything inference by $2\times$ with almost no performance degradation. With all these proposed methods, our TinySAM leads to orders of magnitude computational reduction and pushes the envelope for efficient segment anything task. Extensive experiments on various zero-shot transfer tasks demonstrate the significantly advantageous performance of our TinySAM against counterpart methods. Codes are available at https://github.com/xinghaochen/TinySAM and https://gitee.com/mindspore/models/tree/master/research/cv/TinySAM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13789v3-abstract-full').style.display = 'none'; document.getElementById('2312.13789v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.19295">arXiv:2310.19295</a> <span> [<a href="https://arxiv.org/pdf/2310.19295">pdf</a>, <a href="https://arxiv.org/format/2310.19295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> ROAM: memory-efficient large DNN training via optimized operator ordering and memory layout </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huiyao Shu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Ziji Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.19295v1-abstract-short" style="display: inline;"> As deep learning models continue to increase in size, the memory requirements for training have surged. While high-level techniques like offloading, recomputation, and compression can alleviate memory pressure, they also introduce overheads. However, a memory-efficient execution plan that includes a reasonable operator execution order and tensor memory layout can significantly increase the models'… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19295v1-abstract-full').style.display = 'inline'; document.getElementById('2310.19295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.19295v1-abstract-full" style="display: none;"> As deep learning models continue to increase in size, the memory requirements for training have surged. While high-level techniques like offloading, recomputation, and compression can alleviate memory pressure, they also introduce overheads. However, a memory-efficient execution plan that includes a reasonable operator execution order and tensor memory layout can significantly increase the models' memory efficiency and reduce overheads from high-level techniques. In this paper, we propose ROAM which operates on computation graph level to derive memory-efficient execution plan with optimized operator order and tensor memory layout for models. We first propose sophisticated theories that carefully consider model structure and training memory load to support optimization for large complex graphs that have not been well supported in the past. An efficient tree-based algorithm is further proposed to search task divisions automatically, along with delivering high performance and effectiveness to solve the problem. Experiments show that ROAM achieves a substantial memory reduction of 35.7%, 13.3%, and 27.2% compared to Pytorch and two state-of-the-art methods and offers a remarkable 53.7x speedup. The evaluation conducted on the expansive GPT2-XL further validates ROAM's scalability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19295v1-abstract-full').style.display = 'none'; document.getElementById('2310.19295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13349">arXiv:2310.13349</a> <span> [<a href="https://arxiv.org/pdf/2310.13349">pdf</a>, <a href="https://arxiv.org/format/2310.13349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepFDR: A Deep Learning-based False Discovery Rate Control Method for Neuroimaging Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+T">Taehyo Kim</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Q">Qiran Jia</a>, <a href="/search/cs?searchtype=author&query=de+Leon%2C+M+J">Mony J. de Leon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13349v3-abstract-short" style="display: inline;"> Voxel-based multiple testing is widely used in neuroimaging data analysis. Traditional false discovery rate (FDR) control methods often ignore the spatial dependence among the voxel-based tests and thus suffer from substantial loss of testing power. While recent spatial FDR control methods have emerged, their validity and optimality remain questionable when handling the complex spatial dependencie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13349v3-abstract-full').style.display = 'inline'; document.getElementById('2310.13349v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13349v3-abstract-full" style="display: none;"> Voxel-based multiple testing is widely used in neuroimaging data analysis. Traditional false discovery rate (FDR) control methods often ignore the spatial dependence among the voxel-based tests and thus suffer from substantial loss of testing power. While recent spatial FDR control methods have emerged, their validity and optimality remain questionable when handling the complex spatial dependencies of the brain. Concurrently, deep learning methods have revolutionized image segmentation, a task closely related to voxel-based multiple testing. In this paper, we propose DeepFDR, a novel spatial FDR control method that leverages unsupervised deep learning-based image segmentation to address the voxel-based multiple testing problem. Numerical studies, including comprehensive simulations and Alzheimer's disease FDG-PET image analysis, demonstrate DeepFDR's superiority over existing methods. DeepFDR not only excels in FDR control and effectively diminishes the false nondiscovery rate, but also boasts exceptional computational efficiency highly suited for tackling large-scale neuroimaging data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13349v3-abstract-full').style.display = 'none'; document.getElementById('2310.13349v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of The 27th International Conference on Artificial Intelligence and Statistics (AISTATS 2024), PMLR 238:946-954, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.00747">arXiv:2310.00747</a> <span> [<a href="https://arxiv.org/pdf/2310.00747">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Portfolio Management">q-fin.PM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.54364/AAIML.2023.1195">10.54364/AAIML.2023.1195 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> NoxTrader: LSTM-Based Stock Return Momentum Prediction for Quantitative Trading </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hsiang-Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Han-Jay Shu</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+W">Wei-Ning Chiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.00747v2-abstract-short" style="display: inline;"> We introduce NoxTrader, a sophisticated system designed for portfolio construction and trading execution with the primary objective of achieving profitable outcomes in the stock market, specifically aiming to generate moderate to long-term profits. The underlying learning process of NoxTrader is rooted in the assimilation of valuable insights derived from historical trading data, particularly focu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00747v2-abstract-full').style.display = 'inline'; document.getElementById('2310.00747v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.00747v2-abstract-full" style="display: none;"> We introduce NoxTrader, a sophisticated system designed for portfolio construction and trading execution with the primary objective of achieving profitable outcomes in the stock market, specifically aiming to generate moderate to long-term profits. The underlying learning process of NoxTrader is rooted in the assimilation of valuable insights derived from historical trading data, particularly focusing on time-series analysis due to the nature of the dataset employed. In our approach, we utilize price and volume data of US stock market for feature engineering to generate effective features, including Return Momentum, Week Price Momentum, and Month Price Momentum. We choose the Long Short-Term Memory (LSTM)model to capture continuous price trends and implement dynamic model updates during the trading execution process, enabling the model to continuously adapt to the current market trends. Notably, we have developed a comprehensive trading backtesting system - NoxTrader, which allows us to manage portfolios based on predictive scores and utilize custom evaluation metrics to conduct a thorough assessment of our trading performance. Our rigorous feature engineering and careful selection of prediction targets enable us to generate prediction data with an impressive correlation range between 0.65 and 0.75. Finally, we monitor the dispersion of our prediction data and perform a comparative analysis against actual market data. Through the use of filtering techniques, we improved the initial -60% investment return to 325%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00747v2-abstract-full').style.display = 'none'; document.getElementById('2310.00747v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Advances in Artificial Intelligence and Machine Learning 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.00885">arXiv:2309.00885</a> <span> [<a href="https://arxiv.org/pdf/2309.00885">pdf</a>, <a href="https://arxiv.org/format/2309.00885">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.media.2023.102945">10.1016/j.media.2023.102945 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Generic Fundus Image Enhancement Network Boosted by Frequency Self-supervised Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Heng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Huazhu Fu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yanwu Xu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui Shu</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+K">Ke Niu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yan Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.00885v1-abstract-short" style="display: inline;"> Fundus photography is prone to suffer from image quality degradation that impacts clinical examination performed by ophthalmologists or intelligent systems. Though enhancement algorithms have been developed to promote fundus observation on degraded images, high data demands and limited applicability hinder their clinical deployment. To circumvent this bottleneck, a generic fundus image enhancement… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00885v1-abstract-full').style.display = 'inline'; document.getElementById('2309.00885v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.00885v1-abstract-full" style="display: none;"> Fundus photography is prone to suffer from image quality degradation that impacts clinical examination performed by ophthalmologists or intelligent systems. Though enhancement algorithms have been developed to promote fundus observation on degraded images, high data demands and limited applicability hinder their clinical deployment. To circumvent this bottleneck, a generic fundus image enhancement network (GFE-Net) is developed in this study to robustly correct unknown fundus images without supervised or extra data. Levering image frequency information, self-supervised representation learning is conducted to learn robust structure-aware representations from degraded images. Then with a seamless architecture that couples representation learning and image enhancement, GFE-Net can accurately correct fundus images and meanwhile preserve retinal structures. Comprehensive experiments are implemented to demonstrate the effectiveness and advantages of GFE-Net. Compared with state-of-the-art algorithms, GFE-Net achieves superior performance in data dependency, enhancement performance, deployment efficiency, and scale generalizability. Follow-up fundus image analysis is also facilitated by GFE-Net, whose modules are respectively verified to be effective for image enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00885v1-abstract-full').style.display = 'none'; document.getElementById('2309.00885v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Medical Image Analysis in Auguest, 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Medical Image Analysis, 2023, 90:102945 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06982">arXiv:2308.06982</a> <span> [<a href="https://arxiv.org/pdf/2308.06982">pdf</a>, <a href="https://arxiv.org/format/2308.06982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Discrete Conditional Diffusion for Reranking in Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiao Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaokai Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenyang Wang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hantao Shu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linfeng Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Biao Li</a>, <a href="/search/cs?searchtype=author&query=jiang%2C+P">Peng jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06982v1-abstract-short" style="display: inline;"> Reranking plays a crucial role in modern multi-stage recommender systems by rearranging the initial ranking list to model interplay between items. Considering the inherent challenges of reranking such as combinatorial searching space, some previous studies have adopted the evaluator-generator paradigm, with a generator producing feasible sequences and a evaluator selecting the best one based on es… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06982v1-abstract-full').style.display = 'inline'; document.getElementById('2308.06982v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06982v1-abstract-full" style="display: none;"> Reranking plays a crucial role in modern multi-stage recommender systems by rearranging the initial ranking list to model interplay between items. Considering the inherent challenges of reranking such as combinatorial searching space, some previous studies have adopted the evaluator-generator paradigm, with a generator producing feasible sequences and a evaluator selecting the best one based on estimated listwise utility. Inspired by the remarkable success of diffusion generative models, this paper explores the potential of diffusion models for generating high-quality sequences in reranking. However, we argue that it is nontrivial to take diffusion models as the generator in the context of recommendation. Firstly, diffusion models primarily operate in continuous data space, differing from the discrete data space of item permutations. Secondly, the recommendation task is different from conventional generation tasks as the purpose of recommender systems is to fulfill user interests. Lastly, real-life recommender systems require efficiency, posing challenges for the inference of diffusion models. To overcome these challenges, we propose a novel Discrete Conditional Diffusion Reranking (DCDR) framework for recommendation. DCDR extends traditional diffusion models by introducing a discrete forward process with tractable posteriors, which adds noise to item sequences through step-wise discrete operations (e.g., swapping). Additionally, DCDR incorporates a conditional reverse process that generates item sequences conditioned on expected user responses. Extensive offline experiments conducted on public datasets demonstrate that DCDR outperforms state-of-the-art reranking methods. Furthermore, DCDR has been deployed in a real-world video app with over 300 million daily active users, significantly enhancing online recommendation quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06982v1-abstract-full').style.display = 'none'; document.getElementById('2308.06982v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.00677">arXiv:2307.00677</a> <span> [<a href="https://arxiv.org/pdf/2307.00677">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.ins.2025.121916">10.1016/j.ins.2025.121916 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SDC-HSDD-NDSA: Structure Detecting Cluster by Hierarchical Secondary Directed Differential with Normalized Density and Self-Adaption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.00677v5-abstract-short" style="display: inline;"> Density-based clustering is the most popular clustering algorithm since it can identify clusters of arbitrary shape as long as they are separated by low-density regions. However, a high-density region that is not separated by low-density ones might also have different structures belonging to multiple clusters. As far as we know, all previous density-based clustering algorithms fail to detect such… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00677v5-abstract-full').style.display = 'inline'; document.getElementById('2307.00677v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.00677v5-abstract-full" style="display: none;"> Density-based clustering is the most popular clustering algorithm since it can identify clusters of arbitrary shape as long as they are separated by low-density regions. However, a high-density region that is not separated by low-density ones might also have different structures belonging to multiple clusters. As far as we know, all previous density-based clustering algorithms fail to detect such structures. In this paper, we provide a novel density-based clustering scheme to address this problem. It is the rst clustering algorithm that can detect meticulous structures in a high-density region that is not separated by low-density ones and thus extends the range of applications of clustering. The algorithm employs secondary directed differential, hierarchy, normalized density, as well as the self-adaption coefficient, called Structure Detecting Cluster by Hierarchical Secondary Directed Differential with Normalized Density and Self-Adaption, dubbed SDC-HSDD-NDSA. Experiments on synthetic and real datasets are implemented to verify the effectiveness, robustness, and granularity independence of the algorithm, and the scheme is compared to unsupervised schemes in the Python package Scikit-learn. Results demonstrate that our algorithm outperforms previous ones in many situations, especially significantly when clusters have regular internal structures. For example, averaging over the eight noiseless synthetic datasets with structures employing ARI and NMI criteria, previous algorithms obtain scores below 0.6 and 0.7, while the presented algorithm obtains scores higher than 0.9 and 0.95, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00677v5-abstract-full').style.display = 'none'; document.getElementById('2307.00677v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Information Science (2025) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.11231">arXiv:2302.11231</a> <span> [<a href="https://arxiv.org/pdf/2302.11231">pdf</a>, <a href="https://arxiv.org/format/2302.11231">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Drugs Resistance Analysis from Scarce Health Records via Multi-task Graph Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Honglin Shu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Pei Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lingwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.11231v2-abstract-short" style="display: inline;"> Clinicians prescribe antibiotics by looking at the patient's health record with an experienced eye. However, the therapy might be rendered futile if the patient has drug resistance. Determining drug resistance requires time-consuming laboratory-level testing while applying clinicians' heuristics in an automated way is difficult due to the categorical or binary medical events that constitute health… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11231v2-abstract-full').style.display = 'inline'; document.getElementById('2302.11231v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.11231v2-abstract-full" style="display: none;"> Clinicians prescribe antibiotics by looking at the patient's health record with an experienced eye. However, the therapy might be rendered futile if the patient has drug resistance. Determining drug resistance requires time-consuming laboratory-level testing while applying clinicians' heuristics in an automated way is difficult due to the categorical or binary medical events that constitute health records. In this paper, we propose a novel framework for rapid clinical intervention by viewing health records as graphs whose nodes are mapped from medical events and edges as correspondence between events in given a time window. A novel graph-based model is then proposed to extract informative features and yield automated drug resistance analysis from those high-dimensional and scarce graphs. The proposed method integrates multi-task learning into a common feature extracting graph encoder for simultaneous analyses of multiple drugs as well as stabilizing learning. On a massive dataset comprising over 110,000 patients with urinary tract infections, we verify the proposed method is capable of attaining superior performance on the drug resistance prediction problem. Furthermore, automated drug recommendations resemblant to laboratory-level testing can also be made based on the model resistance analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11231v2-abstract-full').style.display = 'none'; document.getElementById('2302.11231v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.14131">arXiv:2212.14131</a> <span> [<a href="https://arxiv.org/pdf/2212.14131">pdf</a>, <a href="https://arxiv.org/format/2212.14131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TAToo: Vision-based Joint Tracking of Anatomy and Tool for Skull-base Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruixing Liang</a>, <a href="/search/cs?searchtype=author&query=Goodridge%2C+A">Anna Goodridge</a>, <a href="/search/cs?searchtype=author&query=Sahu%2C+M">Manish Sahu</a>, <a href="/search/cs?searchtype=author&query=Creighton%2C+F+X">Francis X. Creighton</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+R+H">Russell H. Taylor</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.14131v2-abstract-short" style="display: inline;"> Purpose: Tracking the 3D motion of the surgical tool and the patient anatomy is a fundamental requirement for computer-assisted skull-base surgery. The estimated motion can be used both for intra-operative guidance and for downstream skill analysis. Recovering such motion solely from surgical videos is desirable, as it is compliant with current clinical workflows and instrumentation. Methods: We… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.14131v2-abstract-full').style.display = 'inline'; document.getElementById('2212.14131v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.14131v2-abstract-full" style="display: none;"> Purpose: Tracking the 3D motion of the surgical tool and the patient anatomy is a fundamental requirement for computer-assisted skull-base surgery. The estimated motion can be used both for intra-operative guidance and for downstream skill analysis. Recovering such motion solely from surgical videos is desirable, as it is compliant with current clinical workflows and instrumentation. Methods: We present Tracker of Anatomy and Tool (TAToo). TAToo jointly tracks the rigid 3D motion of patient skull and surgical drill from stereo microscopic videos. TAToo estimates motion via an iterative optimization process in an end-to-end differentiable form. For robust tracking performance, TAToo adopts a probabilistic formulation and enforces geometric constraints on the object level. Results: We validate TAToo on both simulation data, where ground truth motion is available, as well as on anthropomorphic phantom data, where optical tracking provides a strong baseline. We report sub-millimeter and millimeter inter-frame tracking accuracy for skull and drill, respectively, with rotation errors below 1掳. We further illustrate how TAToo may be used in a surgical navigation setting. Conclusion: We present TAToo, which simultaneously tracks the surgical tool and the patient anatomy in skull-base surgery. TAToo directly predicts the motion from surgical videos, without the need of any markers. Our results show that the performance of TAToo compares favorably to competing approaches. Future work will include fine-tuning of our depth network to reach a 1 mm clinical accuracy goal desired for surgical applications in the skull base. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.14131v2-abstract-full').style.display = 'none'; document.getElementById('2212.14131v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IPCAI/IJCARS 2023, code available at: https://github.com/mli0603/TAToo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.14434">arXiv:2211.14434</a> <span> [<a href="https://arxiv.org/pdf/2211.14434">pdf</a>, <a href="https://arxiv.org/format/2211.14434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Multi-Step Short-Term Wind Speed Prediction with Rank Pooling and Fast Fourier Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hailong Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.14434v2-abstract-short" style="display: inline;"> Short-term wind speed prediction is essential for economical wind power utilization. The real-world wind speed data is typically intermittent and fluctuating, presenting great challenges to existing shallow models. In this paper, we present a novel deep hybrid model for multi-step wind speed prediction, namely LR-FFT-RP-MLP/LSTM (Linear Fast Fourier Transformation Rank Pooling Multiple-Layer Perce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14434v2-abstract-full').style.display = 'inline'; document.getElementById('2211.14434v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.14434v2-abstract-full" style="display: none;"> Short-term wind speed prediction is essential for economical wind power utilization. The real-world wind speed data is typically intermittent and fluctuating, presenting great challenges to existing shallow models. In this paper, we present a novel deep hybrid model for multi-step wind speed prediction, namely LR-FFT-RP-MLP/LSTM (Linear Fast Fourier Transformation Rank Pooling Multiple-Layer Perception/Long Short-Term Memory). Our hybrid model processes the local and global input features simultaneously. We leverage Rank Pooling (RP) for the local feature extraction to capture the temporal structure while maintaining the temporal order. Besides, to understand the wind periodic patterns, we exploit Fast Fourier Transformation (FFT) to extract global features and relevant frequency components in the wind speed data. The resulting local and global features are respectively integrated with the original data and are fed into an MLP/LSTM layer for the initial wind speed predictions. Finally, we leverage a linear regression layer to collaborate these initial predictions to produce the final wind speed prediction. The proposed hybrid model is evaluated using real wind speed data collected from 2010 to 2020, demonstrating superior forecasting capabilities when compared to state-of-the-art single and hybrid models. Overall, this study presents a promising approach for improving the accuracy of wind speed forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14434v2-abstract-full').style.display = 'none'; document.getElementById('2211.14434v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.11863">arXiv:2211.11863</a> <span> [<a href="https://arxiv.org/pdf/2211.11863">pdf</a>, <a href="https://arxiv.org/format/2211.11863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Twin-S: A Digital Twin for Skull-base Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruixing Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Goodridge%2C+A">Anna Goodridge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&query=Nagururu%2C+N">Nimesh Nagururu</a>, <a href="/search/cs?searchtype=author&query=Sahu%2C+M">Manish Sahu</a>, <a href="/search/cs?searchtype=author&query=Creighton%2C+F+X">Francis X. Creighton</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+R+H">Russell H. Taylor</a>, <a href="/search/cs?searchtype=author&query=Munawar%2C+A">Adnan Munawar</a>, <a href="/search/cs?searchtype=author&query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.11863v2-abstract-short" style="display: inline;"> Purpose: Digital twins are virtual interactive models of the real world, exhibiting identical behavior and properties. In surgical applications, computational analysis from digital twins can be used, for example, to enhance situational awareness. Methods: We present a digital twin framework for skull-base surgeries, named Twin-S, which can be integrated within various image-guided interventions se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11863v2-abstract-full').style.display = 'inline'; document.getElementById('2211.11863v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.11863v2-abstract-full" style="display: none;"> Purpose: Digital twins are virtual interactive models of the real world, exhibiting identical behavior and properties. In surgical applications, computational analysis from digital twins can be used, for example, to enhance situational awareness. Methods: We present a digital twin framework for skull-base surgeries, named Twin-S, which can be integrated within various image-guided interventions seamlessly. Twin-S combines high-precision optical tracking and real-time simulation. We rely on rigorous calibration routines to ensure that the digital twin representation precisely mimics all real-world processes. Twin-S models and tracks the critical components of skull-base surgery, including the surgical tool, patient anatomy, and surgical camera. Significantly, Twin-S updates and reflects real-world drilling of the anatomical model in frame rate. Results: We extensively evaluate the accuracy of Twin-S, which achieves an average 1.39 mm error during the drilling process. We further illustrate how segmentation masks derived from the continuously updated digital twin can augment the surgical microscope view in a mixed reality setting, where bone requiring ablation is highlighted to provide surgeons additional situational awareness. Conclusion: We present Twin-S, a digital twin environment for skull-base surgery. Twin-S tracks and updates the virtual model in real-time given measurements from modern tracking technologies. Future research on complementing optical tracking with higher-precision vision-based approaches may further increase the accuracy of Twin-S. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11863v2-abstract-full').style.display = 'none'; document.getElementById('2211.11863v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.01144">arXiv:2211.01144</a> <span> [<a href="https://arxiv.org/pdf/2211.01144">pdf</a>, <a href="https://arxiv.org/format/2211.01144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.neucom.2025.129646">10.1016/j.neucom.2025.129646 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> UniASM: Binary Code Similarity Detection without Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yeming Gu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui Shu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+F">Fei Kang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+F">Fan Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.01144v4-abstract-short" style="display: inline;"> Binary code similarity detection (BCSD) is widely used in various binary analysis tasks such as vulnerability search, malware detection, clone detection, and patch analysis. Recent studies have shown that the learning-based binary code embedding models perform better than the traditional feature-based approaches. However, previous studies have not delved deeply into the key factors that affect mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01144v4-abstract-full').style.display = 'inline'; document.getElementById('2211.01144v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.01144v4-abstract-full" style="display: none;"> Binary code similarity detection (BCSD) is widely used in various binary analysis tasks such as vulnerability search, malware detection, clone detection, and patch analysis. Recent studies have shown that the learning-based binary code embedding models perform better than the traditional feature-based approaches. However, previous studies have not delved deeply into the key factors that affect model performance. In this paper, we design extensive ablation studies to explore these influencing factors. The experimental results have provided us with many new insights. We have made innovations in both code representation and model selection: we propose a novel rich-semantic function representation technique to ensure the model captures the intricate nuances of binary code, and we introduce the first UniLM-based binary code embedding model, named UniASM, which includes two newly designed training tasks to learn representations of binary functions. The experimental results show that UniASM outperforms the state-of-the-art (SOTA) approaches on the evaluation datasets. The average scores of Recall@1 on cross-compilers, cross-optimization-levels, and cross-obfuscations have improved by 12.7%, 8.5%, and 22.3%, respectively, compared to the best of the baseline methods. Besides, in the real-world task of known vulnerability search, UniASM outperforms all the current baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01144v4-abstract-full').style.display = 'none'; document.getElementById('2211.01144v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.15457">arXiv:2210.15457</a> <span> [<a href="https://arxiv.org/pdf/2210.15457">pdf</a>, <a href="https://arxiv.org/format/2210.15457">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2023.3292929">10.1109/TGRS.2023.3292929 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> One-Class Risk Estimation for One-Class Hyperspectral Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hengwei Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yanfei Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hong Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.15457v2-abstract-short" style="display: inline;"> Hyperspectral imagery (HSI) one-class classification is aimed at identifying a single target class from the HSI by using only knowing positive data, which can significantly reduce the requirements for annotation. However, when one-class classification meets HSI, it is difficult for classifiers to find a balance between the overfitting and underfitting of positive data due to the problems of distri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15457v2-abstract-full').style.display = 'inline'; document.getElementById('2210.15457v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.15457v2-abstract-full" style="display: none;"> Hyperspectral imagery (HSI) one-class classification is aimed at identifying a single target class from the HSI by using only knowing positive data, which can significantly reduce the requirements for annotation. However, when one-class classification meets HSI, it is difficult for classifiers to find a balance between the overfitting and underfitting of positive data due to the problems of distribution overlap and distribution imbalance. Although deep learning-based methods are currently the mainstream to overcome distribution overlap in HSI multiclassification, few studies focus on deep learning-based HSI one-class classification. In this article, a weakly supervised deep HSI one-class classifier, namely, HOneCls, is proposed, where a risk estimator,the one-class risk estimator, is particularly introduced to make the fully convolutional neural network (FCN) with the ability of one class classification in the case of distribution imbalance. Extensive experiments (20 tasks in total) were conducted to demonstrate the superiority of the proposed classifier. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15457v2-abstract-full').style.display = 'none'; document.getElementById('2210.15457v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TGRS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03221">arXiv:2210.03221</a> <span> [<a href="https://arxiv.org/pdf/2210.03221">pdf</a>, <a href="https://arxiv.org/format/2210.03221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> PQLM -- Multilingual Decentralized Portable Quantum Language Model for Privacy Protection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S+S">Shuyue Stella Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shu Zhou</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongchao Shu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruixing Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hexin Liu</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+L+P">Leibny Paola Garcia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03221v5-abstract-short" style="display: inline;"> With careful manipulation, malicious agents can reverse engineer private information encoded in pre-trained language models. Security concerns motivate the development of quantum pre-training. In this work, we propose a highly Portable Quantum Language Model (PQLM) that can easily transmit information to downstream tasks on classical machines. The framework consists of a cloud PQLM built with rand… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03221v5-abstract-full').style.display = 'inline'; document.getElementById('2210.03221v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03221v5-abstract-full" style="display: none;"> With careful manipulation, malicious agents can reverse engineer private information encoded in pre-trained language models. Security concerns motivate the development of quantum pre-training. In this work, we propose a highly Portable Quantum Language Model (PQLM) that can easily transmit information to downstream tasks on classical machines. The framework consists of a cloud PQLM built with random Variational Quantum Classifiers (VQC) and local models for downstream applications. We demonstrate the ad hoc portability of the quantum model by extracting only the word embeddings and effectively applying them to downstream tasks on classical machines. Our PQLM exhibits comparable performance to its classical counterpart on both intrinsic evaluation (loss, perplexity) and extrinsic evaluation (multilingual sentiment analysis accuracy) metrics. We also perform ablation studies on the factors affecting PQLM performance to analyze model stability. Our work establishes a theoretical foundation for a portable quantum pre-trained language model that could be trained on private data and made available for public use with privacy protection guarantees. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03221v5-abstract-full').style.display = 'none'; document.getElementById('2210.03221v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.05913">arXiv:2209.05913</a> <span> [<a href="https://arxiv.org/pdf/2209.05913">pdf</a>, <a href="https://arxiv.org/format/2209.05913">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIP.2022.3207571">10.1109/TIP.2022.3207571 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Dual-Scale Single Image Dehazing Via Neural Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengguo Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chaobing Zheng</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Haiyan Shu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shiqian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.05913v1-abstract-short" style="display: inline;"> Model-based single image dehazing algorithms restore haze-free images with sharp edges and rich details for real-world hazy images at the expense of low PSNR and SSIM values for synthetic hazy images. Data-driven ones restore haze-free images with high PSNR and SSIM values for synthetic hazy images but with low contrast, and even some remaining haze for real world hazy images. In this paper, a nov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05913v1-abstract-full').style.display = 'inline'; document.getElementById('2209.05913v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.05913v1-abstract-full" style="display: none;"> Model-based single image dehazing algorithms restore haze-free images with sharp edges and rich details for real-world hazy images at the expense of low PSNR and SSIM values for synthetic hazy images. Data-driven ones restore haze-free images with high PSNR and SSIM values for synthetic hazy images but with low contrast, and even some remaining haze for real world hazy images. In this paper, a novel single image dehazing algorithm is introduced by combining model-based and data-driven approaches. Both transmission map and atmospheric light are first estimated by the model-based methods, and then refined by dual-scale generative adversarial networks (GANs) based approaches. The resultant algorithm forms a neural augmentation which converges very fast while the corresponding data-driven approach might not converge. Haze-free images are restored by using the estimated transmission map and atmospheric light as well as the Koschmiederlaw. Experimental results indicate that the proposed algorithm can remove haze well from real-world and synthetic hazy images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05913v1-abstract-full').style.display = 'none'; document.getElementById('2209.05913v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Single image dehazing, dual-scale, neural augmentation, haze line averaging, generative adversarial network. arXiv admin note: substantial text overlap with arXiv:2111.10943</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.07349">arXiv:2206.07349</a> <span> [<a href="https://arxiv.org/pdf/2206.07349">pdf</a>, <a href="https://arxiv.org/format/2206.07349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> XMorpher: Full Transformer for Deformable Medical Image Registration via Cross Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiacheng Shi</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuting He</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Youyong Kong</a>, <a href="/search/cs?searchtype=author&query=Coatrieux%2C+J">Jean-Louis Coatrieux</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.07349v1-abstract-short" style="display: inline;"> An effective backbone network is important to deep learning-based Deformable Medical Image Registration (DMIR), because it extracts and matches the features between two images to discover the mutual correspondence for fine registration. However, the existing deep networks focus on single image situation and are limited in registration task which is performed on paired images. Therefore, we advance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07349v1-abstract-full').style.display = 'inline'; document.getElementById('2206.07349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.07349v1-abstract-full" style="display: none;"> An effective backbone network is important to deep learning-based Deformable Medical Image Registration (DMIR), because it extracts and matches the features between two images to discover the mutual correspondence for fine registration. However, the existing deep networks focus on single image situation and are limited in registration task which is performed on paired images. Therefore, we advance a novel backbone network, XMorpher, for the effective corresponding feature representation in DMIR. 1) It proposes a novel full transformer architecture including dual parallel feature extraction networks which exchange information through cross attention, thus discovering multi-level semantic correspondence while extracting respective features gradually for final effective registration. 2) It advances the Cross Attention Transformer (CAT) blocks to establish the attention mechanism between images which is able to find the correspondence automatically and prompts the features to fuse efficiently in the network. 3) It constrains the attention computation between base windows and searching windows with different sizes, and thus focuses on the local transformation of deformable registration and enhances the computing efficiency at the same time. Without any bells and whistles, our XMorpher gives Voxelmorph 2.8% improvement on DSC , demonstrating its effective representation of the features from the paired images in DMIR. We believe that our XMorpher has great application potential in more paired medical images. Our XMorpher is open on https://github.com/Solemoon/XMorpher <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07349v1-abstract-full').style.display = 'none'; document.getElementById('2206.07349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by MICCAI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04684">arXiv:2206.04684</a> <span> [<a href="https://arxiv.org/pdf/2206.04684">pdf</a>, <a href="https://arxiv.org/format/2206.04684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Structure-consistent Restoration Network for Cataract Fundus Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Heng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Huazhu Fu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yitian Zhao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiaoling Luo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yan Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04684v1-abstract-short" style="display: inline;"> Fundus photography is a routine examination in clinics to diagnose and monitor ocular diseases. However, for cataract patients, the fundus image always suffers quality degradation caused by the clouding lens. The degradation prevents reliable diagnosis by ophthalmologists or computer-aided systems. To improve the certainty in clinical diagnosis, restoration algorithms have been proposed to enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04684v1-abstract-full').style.display = 'inline'; document.getElementById('2206.04684v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04684v1-abstract-full" style="display: none;"> Fundus photography is a routine examination in clinics to diagnose and monitor ocular diseases. However, for cataract patients, the fundus image always suffers quality degradation caused by the clouding lens. The degradation prevents reliable diagnosis by ophthalmologists or computer-aided systems. To improve the certainty in clinical diagnosis, restoration algorithms have been proposed to enhance the quality of fundus images. Unfortunately, challenges remain in the deployment of these algorithms, such as collecting sufficient training data and preserving retinal structures. In this paper, to circumvent the strict deployment requirement, a structure-consistent restoration network (SCR-Net) for cataract fundus images is developed from synthesized data that shares an identical structure. A cataract simulation model is firstly designed to collect synthesized cataract sets (SCS) formed by cataract fundus images sharing identical structures. Then high-frequency components (HFCs) are extracted from the SCS to constrain structure consistency such that the structure preservation in SCR-Net is enforced. The experiments demonstrate the effectiveness of SCR-Net in the comparison with state-of-the-art methods and the follow-up clinical applications. The code is available at https://github.com/liamheng/ArcNet-Medical-Image-Enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04684v1-abstract-full').style.display = 'none'; document.getElementById('2206.04684v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.14833">arXiv:2205.14833</a> <span> [<a href="https://arxiv.org/pdf/2205.14833">pdf</a>, <a href="https://arxiv.org/format/2205.14833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Walle: An End-to-End, General-Purpose, and Large-Scale Production System for Device-Cloud Collaborative Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+C">Chengfei Lv</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+C">Chaoyue Niu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+R">Renjie Gu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaotang Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaode Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bin Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Ziqi Wu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Q">Qiulin Yao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Congyu Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Panos Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tao Huang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui Shu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jinde Song</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+B">Bin Zou</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+P">Peng Lan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guohuan Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shaojie Tang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guihai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.14833v1-abstract-short" style="display: inline;"> To break the bottlenecks of mainstream cloud-based machine learning (ML) paradigm, we adopt device-cloud collaborative ML and build the first end-to-end and general-purpose system, called Walle, as the foundation. Walle consists of a deployment platform, distributing ML tasks to billion-scale devices in time; a data pipeline, efficiently preparing task input; and a compute container, providing a c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14833v1-abstract-full').style.display = 'inline'; document.getElementById('2205.14833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.14833v1-abstract-full" style="display: none;"> To break the bottlenecks of mainstream cloud-based machine learning (ML) paradigm, we adopt device-cloud collaborative ML and build the first end-to-end and general-purpose system, called Walle, as the foundation. Walle consists of a deployment platform, distributing ML tasks to billion-scale devices in time; a data pipeline, efficiently preparing task input; and a compute container, providing a cross-platform and high-performance execution environment, while facilitating daily task iteration. Specifically, the compute container is based on Mobile Neural Network (MNN), a tensor compute engine along with the data processing and model execution libraries, which are exposed through a refined Python thread-level virtual machine (VM) to support diverse ML tasks and concurrent task execution. The core of MNN is the novel mechanisms of operator decomposition and semi-auto search, sharply reducing the workload in manually optimizing hundreds of operators for tens of hardware backends and further quickly identifying the best backend with runtime optimization for a computation graph. The data pipeline introduces an on-device stream processing framework to enable processing user behavior data at source. The deployment platform releases ML tasks with an efficient push-then-pull method and supports multi-granularity deployment policies. We evaluate Walle in practical e-commerce application scenarios to demonstrate its effectiveness, efficiency, and scalability. Extensive micro-benchmarks also highlight the superior performance of MNN and the Python thread-level VM. Walle has been in large-scale production use in Alibaba, while MNN has been open source with a broad impact in the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14833v1-abstract-full').style.display = 'none'; document.getElementById('2205.14833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by OSDI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.09944">arXiv:2205.09944</a> <span>  </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> 6G Network AI Architecture for Everyone-Centric Customized Services </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Mulei Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hequan Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Quan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Ping Zhang</a>, <a href="/search/cs?searchtype=author&query=You%2C+X">Xiaohu You</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianjun Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chenghui Peng</a>, <a href="/search/cs?searchtype=author&query=Yum%2C+T+P">Tak-Shing Peter Yum</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Sherman Shen</a>, <a href="/search/cs?searchtype=author&query=Aghvami%2C+H">Hamid Aghvami</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Y Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guangyi Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiongyan Tang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chang Cao</a>, <a href="/search/cs?searchtype=author&query=Thompson%2C+J">John Thompson</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K">Kat-Kit Wong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shanzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Debbah%2C+M">Merouane Debbah</a>, <a href="/search/cs?searchtype=author&query=Dustdar%2C+S">Schahram Dustdar</a>, <a href="/search/cs?searchtype=author&query=Eliassen%2C+F">Frank Eliassen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+X">Xiangyang Duan</a> , et al. (29 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.09944v5-abstract-short" style="display: inline;"> Mobile communication standards were developed for enhancing transmission and network performance by using more radio resources and improving spectrum and energy efficiency. How to effectively address diverse user requirements and guarantee everyone's Quality of Experience (QoE) remains an open problem. The Sixth Generation (6G) mobile systems will solve this problem by utilizing heterogenous netwo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.09944v5-abstract-full').style.display = 'inline'; document.getElementById('2205.09944v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.09944v5-abstract-full" style="display: none;"> Mobile communication standards were developed for enhancing transmission and network performance by using more radio resources and improving spectrum and energy efficiency. How to effectively address diverse user requirements and guarantee everyone's Quality of Experience (QoE) remains an open problem. The Sixth Generation (6G) mobile systems will solve this problem by utilizing heterogenous network resources and pervasive intelligence to support everyone-centric customized services anywhere and anytime. In this article, we first coin the concept of Service Requirement Zone (SRZ) on the user side to characterize and visualize the integrated service requirements and preferences of specific tasks of individual users. On the system side, we further introduce the concept of User Satisfaction Ratio (USR) to evaluate the system's overall service ability of satisfying a variety of tasks with different SRZs. Then, we propose a network Artificial Intelligence (AI) architecture with integrated network resources and pervasive AI capabilities for supporting customized services with guaranteed QoEs. Finally, extensive simulations show that the proposed network AI architecture can consistently offer a higher USR performance than the cloud AI and edge AI architectures with respect to different task scheduling algorithms, random service requirements, and dynamic network conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.09944v5-abstract-full').style.display = 'none'; document.getElementById('2205.09944v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The current version has partial Insufficient completion, so we would like to withdraw it. We hope you agree, thank you</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.04846">arXiv:2205.04846</a> <span> [<a href="https://arxiv.org/pdf/2205.04846">pdf</a>, <a href="https://arxiv.org/format/2205.04846">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MNet: Rethinking 2D/3D Networks for Anisotropic Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhangfu Dong</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuting He</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiaoming Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Huazhong Shu</a>, <a href="/search/cs?searchtype=author&query=Coatrieux%2C+J">Jean-Louis Coatrieux</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.04846v1-abstract-short" style="display: inline;"> The nature of thick-slice scanning causes severe inter-slice discontinuities of 3D medical images, and the vanilla 2D/3D convolutional neural networks (CNNs) fail to represent sparse inter-slice information and dense intra-slice information in a balanced way, leading to severe underfitting to inter-slice features (for vanilla 2D CNNs) and overfitting to noise from long-range slices (for vanilla 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.04846v1-abstract-full').style.display = 'inline'; document.getElementById('2205.04846v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.04846v1-abstract-full" style="display: none;"> The nature of thick-slice scanning causes severe inter-slice discontinuities of 3D medical images, and the vanilla 2D/3D convolutional neural networks (CNNs) fail to represent sparse inter-slice information and dense intra-slice information in a balanced way, leading to severe underfitting to inter-slice features (for vanilla 2D CNNs) and overfitting to noise from long-range slices (for vanilla 3D CNNs). In this work, a novel mesh network (MNet) is proposed to balance the spatial representation inter axes via learning. 1) Our MNet latently fuses plenty of representation processes by embedding multi-dimensional convolutions deeply into basic modules, making the selections of representation processes flexible, thus balancing representation for sparse inter-slice information and dense intra-slice information adaptively. 2) Our MNet latently fuses multi-dimensional features inside each basic module, simultaneously taking the advantages of 2D (high segmentation accuracy of the easily recognized regions in 2D view) and 3D (high smoothness of 3D organ contour) representations, thus obtaining more accurate modeling for target regions. Comprehensive experiments are performed on four public datasets (CT\&MR), the results consistently demonstrate the proposed MNet outperforms the other methods. The code and datasets are available at: https://github.com/zfdong-code/MNet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.04846v1-abstract-full').style.display = 'none'; document.getElementById('2205.04846v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IJCAI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.05757">arXiv:2203.05757</a> <span> [<a href="https://arxiv.org/pdf/2203.05757">pdf</a>, <a href="https://arxiv.org/format/2203.05757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Solar and Stellar Astrophysics">astro-ph.SR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1080/08839514.2022.2074129">10.1080/08839514.2022.2074129 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A comparative study of non-deep learning, deep learning, and ensemble learning methods for sunspot number prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yuchen Dang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziqi Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Heng Li</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.05757v2-abstract-short" style="display: inline;"> Solar activity has significant impacts on human activities and health. One most commonly used measure of solar activity is the sunspot number. This paper compares three important non-deep learning models, four popular deep learning models, and their five ensemble models in forecasting sunspot numbers. In particular, we propose an ensemble model called XGBoost-DL, which uses XGBoost as a two-level… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05757v2-abstract-full').style.display = 'inline'; document.getElementById('2203.05757v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.05757v2-abstract-full" style="display: none;"> Solar activity has significant impacts on human activities and health. One most commonly used measure of solar activity is the sunspot number. This paper compares three important non-deep learning models, four popular deep learning models, and their five ensemble models in forecasting sunspot numbers. In particular, we propose an ensemble model called XGBoost-DL, which uses XGBoost as a two-level nonlinear ensemble method to combine the deep learning models. Our XGBoost-DL achieves the best forecasting performance (RMSE = 25.70 and MAE = 19.82) in the comparison, outperforming the best non-deep learning model SARIMA (RMSE = 54.11 and MAE = 45.51), the best deep learning model Informer (RMSE = 29.90 and MAE = 22.35) and the NASA's forecast (RMSE = 48.38 and MAE = 38.45). Our XGBoost-DL forecasts a peak sunspot number of 133.47 in May 2025 for Solar Cycle 25 and 164.62 in November 2035 for Solar Cycle 26, similar to but later than the NASA's at 137.7 in October 2024 and 161.2 in December 2034. An open-source Python package of our XGBoost-DL for the sunspot number prediction is available at https://github.com/yd1008/ts_ensemble_sunspot. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05757v2-abstract-full').style.display = 'none'; document.getElementById('2203.05757v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Applied Artificial Intelligence, 2022, 36(1) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11582">arXiv:2201.11582</a> <span> [<a href="https://arxiv.org/pdf/2201.11582">pdf</a>, <a href="https://arxiv.org/format/2201.11582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> GUDN: A novel guide network with label reinforcement strategy for extreme multi-label text classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jia Zhu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hongji Shu</a>, <a href="/search/cs?searchtype=author&query=Asamoah%2C+K+O">Kwame Omono Asamoah</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jianyang Shi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Cong Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11582v2-abstract-short" style="display: inline;"> In natural language processing, extreme multi-label text classification is an emerging but essential task. The problem of extreme multi-label text classification (XMTC) is to recall some of the most relevant labels for a text from an extremely large label set. Large-scale pre-trained models have brought a new trend to this problem. Though the large-scale pre-trained models have made significant ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11582v2-abstract-full').style.display = 'inline'; document.getElementById('2201.11582v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11582v2-abstract-full" style="display: none;"> In natural language processing, extreme multi-label text classification is an emerging but essential task. The problem of extreme multi-label text classification (XMTC) is to recall some of the most relevant labels for a text from an extremely large label set. Large-scale pre-trained models have brought a new trend to this problem. Though the large-scale pre-trained models have made significant achievements on this problem, the valuable fine-tuned methods have yet to be studied. Though label semantics have been introduced in XMTC, the vast semantic gap between texts and labels has yet to gain enough attention. This paper builds a new guide network (GUDN) to help fine-tune the pre-trained model to instruct classification later. Furthermore, GUDN uses raw label semantics combined with a helpful label reinforcement strategy to effectively explore the latent space between texts and labels, narrowing the semantic gap, which can further improve predicted accuracy. Experimental results demonstrate that GUDN outperforms state-of-the-art methods on Eurlex-4k and has competitive results on other popular datasets. In an additional experiment, we investigated the input lengths' influence on the Transformer-based model's accuracy. Our source code is released at https://t.hk.uy/aFSH. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11582v2-abstract-full').style.display = 'none'; document.getElementById('2201.11582v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.07938">arXiv:2201.07938</a> <span> [<a href="https://arxiv.org/pdf/2201.07938">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> spotFuzzer: Static Instrument and Fuzzing Windows COTs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yeming Gu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Hui Shu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+R">Rongkuan Ma</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Lin Yan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lei Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.07938v1-abstract-short" style="display: inline;"> The security research on Windows has received little attention in the academic circle. Most of the new methods are usually designed for Linux system, and are difficult to transplant to Windows. Fuzzing for Windows programs always suffering from its closed source. Therefore, we need to find an appropriate way to achieve feedback from Windows programs. To our knowledge, there are no stable and scala… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.07938v1-abstract-full').style.display = 'inline'; document.getElementById('2201.07938v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.07938v1-abstract-full" style="display: none;"> The security research on Windows has received little attention in the academic circle. Most of the new methods are usually designed for Linux system, and are difficult to transplant to Windows. Fuzzing for Windows programs always suffering from its closed source. Therefore, we need to find an appropriate way to achieve feedback from Windows programs. To our knowledge, there are no stable and scalable static instrumentation tools for Windows yet, and dynamic tools, such as DynamoRIO, have been criticized for their performance. To make matters worse, dynamic instrumentation tools have very limited usage scenarios and are impotent for many system services or large commercial software. In this paper, we proposed spotInstr, a novel static tool for instrumenting Windows binaries. It is lightweight and can instrument most Windows PE programs in a very short time. At the same time, spotInstr provides a set of filters, which can be used to select instrumentation points or restrict the target regions. Based on these filters, we propose a novel memory-sensitive instrumentation method which can speed up both instrumentation and fuzzing. After that, we design a system called spotFuzzer, which leverage the ability of spotInstr and can fuzz most Windows binaries. We tested spotInstr and spotFuzzer in multiple dimensions to show their superior performance and stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.07938v1-abstract-full').style.display = 'none'; document.getElementById('2201.07938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.10943">arXiv:2111.10943</a> <span> [<a href="https://arxiv.org/pdf/2111.10943">pdf</a>, <a href="https://arxiv.org/format/2111.10943">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Model-Based Single Image Deep Dehazing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengguo Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chaobing Zheng</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Haiyan Shu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shiqian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.10943v3-abstract-short" style="display: inline;"> Model-based single image dehazing algorithms restore images with sharp edges and rich details at the expense of low PSNR values. Data-driven ones restore images with high PSNR values but with low contrast, and even some remaining haze. In this paper, a novel single image dehazing algorithm is introduced by fusing model-based and data-driven approaches. Both transmission map and atmospheric light a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10943v3-abstract-full').style.display = 'inline'; document.getElementById('2111.10943v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.10943v3-abstract-full" style="display: none;"> Model-based single image dehazing algorithms restore images with sharp edges and rich details at the expense of low PSNR values. Data-driven ones restore images with high PSNR values but with low contrast, and even some remaining haze. In this paper, a novel single image dehazing algorithm is introduced by fusing model-based and data-driven approaches. Both transmission map and atmospheric light are initialized by the model-based methods, and refined by deep learning approaches which form a neural augmentation. Haze-free images are restored by using the transmission map and atmospheric light. Experimental results indicate that the proposed algorithm can remove haze well from real-world and synthetic hazy images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10943v3-abstract-full').style.display = 'none'; document.getElementById('2111.10943v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2022 IEEE International Conference on Image Processing </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.05700">arXiv:2111.05700</a> <span> [<a href="https://arxiv.org/pdf/2111.05700">pdf</a>, <a href="https://arxiv.org/format/2111.05700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIP.2021.3123551">10.1109/TIP.2021.3123551 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-Scale Single Image Dehazing Using Laplacian and Gaussian Pyramids </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengguo Li</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+H">Haiyan Shu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chaobing Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.05700v2-abstract-short" style="display: inline;"> Model driven single image dehazing was widely studied on top of different priors due to its extensive applications. Ambiguity between object radiance and haze and noise amplification in sky regions are two inherent problems of model driven single image dehazing. In this paper, a dark direct attenuation prior (DDAP) is proposed to address the former problem. A novel haze line averaging is proposed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.05700v2-abstract-full').style.display = 'inline'; document.getElementById('2111.05700v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.05700v2-abstract-full" style="display: none;"> Model driven single image dehazing was widely studied on top of different priors due to its extensive applications. Ambiguity between object radiance and haze and noise amplification in sky regions are two inherent problems of model driven single image dehazing. In this paper, a dark direct attenuation prior (DDAP) is proposed to address the former problem. A novel haze line averaging is proposed to reduce the morphological artifacts caused by the DDAP which enables a weighted guided image filter with a smaller radius to further reduce the morphological artifacts while preserve the fine structure in the image. A multi-scale dehazing algorithm is then proposed to address the latter problem by adopting Laplacian and Guassian pyramids to decompose the hazy image into different levels and applying different haze removal and noise reduction approaches to restore the scene radiance at different levels of the pyramid. The resultant pyramid is collapsed to restore a haze-free image. Experiment results demonstrate that the proposed algorithm outperforms state of the art dehazing algorithms and the noise is indeed prevented from being amplified in the sky region. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.05700v2-abstract-full').style.display = 'none'; document.getElementById('2111.05700v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shu%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shu%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shu%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository