CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;43 of 43 results for author: <span class="mathjax">Tong, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Tong%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Tong, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tong%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tong, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11244">arXiv:2411.11244</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11244">pdf</a>, <a href="https://arxiv.org/format/2411.11244">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Geometry">cs.CG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> gDist: Efficient Distance Computation between 3D Meshes on GPU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fang%2C+P">Peng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hailong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11244v1-abstract-short" style="display: inline;"> Computing maximum/minimum distances between 3D meshes is crucial for various applications, i.e., robotics, CAD, VR/AR, etc. In this work, we introduce a highly parallel algorithm (gDist) optimized for Graphics Processing Units (GPUs), which is capable of computing the distance between two meshes with over 15 million triangles in less than 0.4 milliseconds (Fig. 1). By testing on benchmarks with va&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11244v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11244v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11244v1-abstract-full" style="display: none;"> Computing maximum/minimum distances between 3D meshes is crucial for various applications, i.e., robotics, CAD, VR/AR, etc. In this work, we introduce a highly parallel algorithm (gDist) optimized for Graphics Processing Units (GPUs), which is capable of computing the distance between two meshes with over 15 million triangles in less than 0.4 milliseconds (Fig. 1). By testing on benchmarks with varying characteristics, the algorithm achieves remarkable speedups over prior CPU-based and GPU-based algorithms on a commodity GPU (NVIDIA GeForce RTX 4090). Notably, the algorithm consistently maintains high-speed performance, even in challenging scenarios that pose difficulties for prior algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11244v1-abstract-full').style.display = 'none'; document.getElementById('2411.11244v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06691">arXiv:2411.06691</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06691">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Droplet Microfluidic Design Framework with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+D">Dinh-Nguyen Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+K">Raymond Kai-Yu Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Dinh%2C+N">Ngoc-Duy Dinh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06691v1-abstract-short" style="display: inline;"> Droplet-based microfluidic devices have substantial promise as cost-effective alternatives to current assessment tools in biological research. Moreover, machine learning models that leverage tabular data, including input design parameters and their corresponding efficiency outputs, are increasingly utilised to automate the design process of these devices and to predict their performance. However,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06691v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06691v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06691v1-abstract-full" style="display: none;"> Droplet-based microfluidic devices have substantial promise as cost-effective alternatives to current assessment tools in biological research. Moreover, machine learning models that leverage tabular data, including input design parameters and their corresponding efficiency outputs, are increasingly utilised to automate the design process of these devices and to predict their performance. However, these models fail to fully leverage the data presented in the tables, neglecting crucial contextual information, including column headings and their associated descriptions. This study presents MicroFluidic-LLMs, a framework designed for processing and feature extraction, which effectively captures contextual information from tabular data formats. MicroFluidic-LLMs overcomes processing challenges by transforming the content into a linguistic format and leveraging pre-trained large language models (LLMs) for analysis. We evaluate our MicroFluidic-LLMs framework on 11 prediction tasks, covering aspects such as geometry, flow conditions, regimes, and performance, utilising a publicly available dataset on flow-focusing droplet microfluidics. We demonstrate that our MicroFluidic-LLMs framework can empower deep neural network models to be highly effective and straightforward while minimising the need for extensive data preprocessing. Moreover, the exceptional performance of deep neural network models, particularly when combined with advanced natural language processing models such as DistilBERT and GPT-2, reduces the mean absolute error in the droplet diameter and generation rate by nearly 5- and 7-fold, respectively, and enhances the regime classification accuracy by over 4%, compared with the performance reported in a previous study. This study lays the foundation for the huge potential applications of LLMs and machine learning in a wider spectrum of microfluidic applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06691v1-abstract-full').style.display = 'none'; document.getElementById('2411.06691v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15959">arXiv:2410.15959</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15959">pdf</a>, <a href="https://arxiv.org/format/2410.15959">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Transformer Policy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Z">Zhi Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yuwen Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+H">Hengjun Pu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chengyang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ronglei Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuntao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15959v1-abstract-short" style="display: inline;"> Recent large visual-language action models pretrained on diverse robot datasets have demonstrated the potential for generalizing to new environments with a few in-domain data. However, those approaches usually predict discretized or continuous actions by a small action head, which limits the ability in handling diverse action spaces. In contrast, we model the continuous action with a large multi-m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15959v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15959v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15959v1-abstract-full" style="display: none;"> Recent large visual-language action models pretrained on diverse robot datasets have demonstrated the potential for generalizing to new environments with a few in-domain data. However, those approaches usually predict discretized or continuous actions by a small action head, which limits the ability in handling diverse action spaces. In contrast, we model the continuous action with a large multi-modal diffusion transformer, dubbed as Diffusion Transformer Policy, in which we directly denoise action chunks by a large transformer model rather than a small action head. By leveraging the scaling capability of transformers, the proposed approach can effectively model continuous end-effector actions across large diverse robot datasets, and achieve better generalization performance. Extensive experiments demonstrate Diffusion Transformer Policy pretrained on diverse robot data can generalize to different embodiments, including simulation environments like Maniskill2 and Calvin, as well as the real-world Franka arm. Specifically, without bells and whistles, the proposed approach achieves state-of-the-art performance with only a single third-view camera stream in the Calvin novel task setting (ABC-&gt;D), improving the average number of tasks completed in a row of 5 to 3.6, and the pretraining stage significantly facilitates the success sequence length on the Calvin by over 1.2. The code will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15959v1-abstract-full').style.display = 'none'; document.getElementById('2410.15959v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19407">arXiv:2409.19407</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19407">pdf</a>, <a href="https://arxiv.org/format/2409.19407">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Brain-JEPA: Brain Dynamics Foundation Model with Gradient Positioning and Spatiotemporal Masking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zijian Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Ruilin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yilei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T+T">Thuan Tinh Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Chong%2C+J+S+X">Joanna Su Xian Chong</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+F">Fang Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+N+R+J">Nathanael Ren Jie Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C+L+H">Christopher Li Hsian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J+H">Juan Helen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19407v1-abstract-short" style="display: inline;"> We introduce Brain-JEPA, a brain dynamics foundation model with the Joint-Embedding Predictive Architecture (JEPA). This pioneering model achieves state-of-the-art performance in demographic prediction, disease diagnosis/prognosis, and trait prediction through fine-tuning. Furthermore, it excels in off-the-shelf evaluations (e.g., linear probing) and demonstrates superior generalizability across d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19407v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19407v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19407v1-abstract-full" style="display: none;"> We introduce Brain-JEPA, a brain dynamics foundation model with the Joint-Embedding Predictive Architecture (JEPA). This pioneering model achieves state-of-the-art performance in demographic prediction, disease diagnosis/prognosis, and trait prediction through fine-tuning. Furthermore, it excels in off-the-shelf evaluations (e.g., linear probing) and demonstrates superior generalizability across different ethnic groups, surpassing the previous large model for brain activity significantly. Brain-JEPA incorporates two innovative techniques: Brain Gradient Positioning and Spatiotemporal Masking. Brain Gradient Positioning introduces a functional coordinate system for brain functional parcellation, enhancing the positional encoding of different Regions of Interest (ROIs). Spatiotemporal Masking, tailored to the unique characteristics of fMRI data, addresses the challenge of heterogeneous time-series patches. These methodologies enhance model performance and advance our understanding of the neural circuits underlying cognition. Overall, Brain-JEPA is paving the way to address pivotal questions of building brain functional coordinate system and masking brain activity at the AI-neuroscience interface, and setting a potentially new paradigm in brain activity analysis through downstream adaptation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19407v1-abstract-full').style.display = 'none'; document.getElementById('2409.19407v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally. NeurIPS 2024 Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05024">arXiv:2409.05024</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05024">pdf</a>, <a href="https://arxiv.org/format/2409.05024">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jiahua Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qiuli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Ying%2C+S">Shihong Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+S">Shaolin Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuanpu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05024v2-abstract-short" style="display: inline;"> Medical image segmentation is crucial in the field of medical imaging, aiding in disease diagnosis and surgical planning. Most established segmentation methods rely on supervised deep learning, in which clean and precise labels are essential for supervision and significantly impact the performance of models. However, manually delineated labels often contain noise, such as missing labels and inaccu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05024v2-abstract-full').style.display = 'inline'; document.getElementById('2409.05024v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05024v2-abstract-full" style="display: none;"> Medical image segmentation is crucial in the field of medical imaging, aiding in disease diagnosis and surgical planning. Most established segmentation methods rely on supervised deep learning, in which clean and precise labels are essential for supervision and significantly impact the performance of models. However, manually delineated labels often contain noise, such as missing labels and inaccurate boundary delineation, which can hinder networks from correctly modeling target characteristics. In this paper, we propose a deep self-cleansing segmentation framework that can preserve clean labels while cleansing noisy ones in the training phase. To achieve this, we devise a gaussian mixture model-based label filtering module that distinguishes noisy labels from clean labels. Additionally, we develop a label cleansing module to generate pseudo low-noise labels for identified noisy samples. The preserved clean labels and pseudo-labels are then used jointly to supervise the network. Validated on a clinical liver tumor dataset and a public cardiac diagnosis dataset, our method can effectively suppress the interference from noisy labels and achieve prominent segmentation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05024v2-abstract-full').style.display = 'none'; document.getElementById('2409.05024v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21045">arXiv:2407.21045</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21045">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unlocking the Potential: Benchmarking Large Language Models in Water Engineering and Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Boyan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Liang Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zihao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuxing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guanlan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+X">Xiongpeng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Q">Qingxian Su</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xueqing Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Rui Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Ng%2C+H+Y">How Yong Ng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21045v1-abstract-short" style="display: inline;"> Recent advancements in Large Language Models (LLMs) have sparked interest in their potential applications across various fields. This paper embarked on a pivotal inquiry: Can existing LLMs effectively serve as &#34;water expert models&#34; for water engineering and research tasks? This study was the first to evaluate LLMs&#39; contributions across various water engineering and research tasks by establishing a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21045v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21045v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21045v1-abstract-full" style="display: none;"> Recent advancements in Large Language Models (LLMs) have sparked interest in their potential applications across various fields. This paper embarked on a pivotal inquiry: Can existing LLMs effectively serve as &#34;water expert models&#34; for water engineering and research tasks? This study was the first to evaluate LLMs&#39; contributions across various water engineering and research tasks by establishing a domain-specific benchmark suite, namely, WaterER. Herein, we prepared 983 tasks related to water engineering and research, categorized into &#34;wastewater treatment&#34;, &#34;environmental restoration&#34;, &#34;drinking water treatment and distribution&#34;, &#34;sanitation&#34;, &#34;anaerobic digestion&#34; and &#34;contaminants assessment&#34;. We evaluated the performance of seven LLMs (i.e., GPT-4, GPT-3.5, Gemini, GLM-4, ERNIE, QWEN and Llama3) on these tasks. We highlighted the strengths of GPT-4 in handling diverse and complex tasks of water engineering and water research, the specialized capabilities of Gemini in academic contexts, Llama3&#39;s strongest capacity to answer Chinese water engineering questions and the competitive performance of Chinese-oriented models like GLM-4, ERNIE and QWEN in some water engineering tasks. More specifically, current LLMs excelled particularly in generating precise research gaps for papers on &#34;contaminants and related water quality monitoring and assessment&#34;. Additionally, they were more adept at creating appropriate titles for research papers on &#34;treatment processes for wastewaters&#34;, &#34;environmental restoration&#34;, and &#34;drinking water treatment&#34;. Overall, this study pioneered evaluating LLMs in water engineering and research by introducing the WaterER benchmark to assess the trustworthiness of their predictions. This standardized evaluation framework would also drive future advancements in LLM technology by using targeting datasets, propelling these models towards becoming true &#34;water expert&#34;. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21045v1-abstract-full').style.display = 'none'; document.getElementById('2407.21045v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16244">arXiv:2407.16244</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16244">pdf</a>, <a href="https://arxiv.org/format/2407.16244">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3612159">10.1145/3581783.3612159 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> HSVLT: Hierarchical Scale-Aware Vision-Language Transformer for Multi-Label Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+S">Shuyi Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+Z">Ziwei Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Z">Zhenjia Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Shiao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yingying Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16244v1-abstract-short" style="display: inline;"> The task of multi-label image classification involves recognizing multiple objects within a single image. Considering both valuable semantic information contained in the labels and essential visual features presented in the image, tight visual-linguistic interactions play a vital role in improving classification performance. Moreover, given the potential variance in object size and appearance with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16244v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16244v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16244v1-abstract-full" style="display: none;"> The task of multi-label image classification involves recognizing multiple objects within a single image. Considering both valuable semantic information contained in the labels and essential visual features presented in the image, tight visual-linguistic interactions play a vital role in improving classification performance. Moreover, given the potential variance in object size and appearance within a single image, attention to features of different scales can help to discover possible objects in the image. Recently, Transformer-based methods have achieved great success in multi-label image classification by leveraging the advantage of modeling long-range dependencies, but they have several limitations. Firstly, existing methods treat visual feature extraction and cross-modal fusion as separate steps, resulting in insufficient visual-linguistic alignment in the joint semantic space. Additionally, they only extract visual features and perform cross-modal fusion at a single scale, neglecting objects with different characteristics. To address these issues, we propose a Hierarchical Scale-Aware Vision-Language Transformer (HSVLT) with two appealing designs: (1)~A hierarchical multi-scale architecture that involves a Cross-Scale Aggregation module, which leverages joint multi-modal features extracted from multiple scales to recognize objects of varying sizes and appearances in images. (2)~Interactive Visual-Linguistic Attention, a novel attention mechanism module that tightly integrates cross-modal interaction, enabling the joint updating of visual, linguistic and multi-modal features. We have evaluated our method on three benchmark datasets. The experimental results demonstrate that HSVLT surpasses state-of-the-art methods with lower computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16244v1-abstract-full').style.display = 'none'; document.getElementById('2407.16244v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 31st ACM International Conference on Multimedia. 2023: 4768-4777 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10959">arXiv:2405.10959</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.10959">pdf</a>, <a href="https://arxiv.org/format/2405.10959">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Foundation Models for Education: Promises and Prospects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tianlong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Richard Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jing Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xing Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10959v1-abstract-short" style="display: inline;"> With the advent of foundation models like ChatGPT, educators are excited about the transformative role that AI might play in propelling the next education revolution. The developing speed and the profound impact of foundation models in various industries force us to think deeply about the changes they will make to education, a domain that is critically important for the future of humans. In this p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10959v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10959v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10959v1-abstract-full" style="display: none;"> With the advent of foundation models like ChatGPT, educators are excited about the transformative role that AI might play in propelling the next education revolution. The developing speed and the profound impact of foundation models in various industries force us to think deeply about the changes they will make to education, a domain that is critically important for the future of humans. In this paper, we discuss the strengths of foundation models, such as personalized learning, education inequality, and reasoning capabilities, as well as the development of agent architecture tailored for education, which integrates AI agents with pedagogical frameworks to create adaptive learning environments. Furthermore, we highlight the risks and opportunities of AI overreliance and creativity. Lastly, we envision a future where foundation models in education harmonize human and AI capabilities, fostering a dynamic, inclusive, and adaptive educational ecosystem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10959v1-abstract-full').style.display = 'none'; document.getElementById('2405.10959v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Intelligent Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14689">arXiv:2403.14689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.14689">pdf</a>, <a href="https://arxiv.org/format/2403.14689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Developing and Deploying Industry Standards for Artificial Intelligence in Education (AIED): Challenges, Strategies, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Richard Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Joleen Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14689v2-abstract-short" style="display: inline;"> The adoption of Artificial Intelligence in Education (AIED) holds the promise of revolutionizing educational practices by offering personalized learning experiences, automating administrative and pedagogical tasks, and reducing the cost of content creation. However, the lack of standardized practices in the development and deployment of AIED solutions has led to fragmented ecosystems, which presen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14689v2-abstract-full').style.display = 'inline'; document.getElementById('2403.14689v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14689v2-abstract-full" style="display: none;"> The adoption of Artificial Intelligence in Education (AIED) holds the promise of revolutionizing educational practices by offering personalized learning experiences, automating administrative and pedagogical tasks, and reducing the cost of content creation. However, the lack of standardized practices in the development and deployment of AIED solutions has led to fragmented ecosystems, which presents challenges in interoperability, scalability, and ethical governance. This article aims to address the critical need to develop and implement industry standards in AIED, offering a comprehensive analysis of the current landscape, challenges, and strategic approaches to overcome these obstacles. We begin by examining the various applications of AIED in various educational settings and identify key areas lacking in standardization, including system interoperability, ontology mapping, data integration, evaluation, and ethical governance. Then, we propose a multi-tiered framework for establishing robust industry standards for AIED. In addition, we discuss methodologies for the iterative development and deployment of standards, incorporating feedback loops from real-world applications to refine and adapt standards over time. The paper also highlights the role of emerging technologies and pedagogical theories in shaping future standards for AIED. Finally, we outline a strategic roadmap for stakeholders to implement these standards, fostering a cohesive and ethical AIED ecosystem. By establishing comprehensive industry standards, such as those by IEEE Artificial Intelligence Standards Committee (AISC) and International Organization for Standardization (ISO), we can accelerate and scale AIED solutions to improve educational outcomes, ensuring that technological advances align with the principles of inclusivity, fairness, and educational excellence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14689v2-abstract-full').style.display = 'none'; document.getElementById('2403.14689v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.13598">arXiv:2401.13598</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.13598">pdf</a>, <a href="https://arxiv.org/format/2401.13598">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Consistency Guided Knowledge Retrieval and Denoising in LLMs for Zero-shot Document-level Relation Triplet Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+K">Kun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaocui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Rong Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Poria%2C+S">Soujanya Poria</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.13598v1-abstract-short" style="display: inline;"> Document-level Relation Triplet Extraction (DocRTE) is a fundamental task in information systems that aims to simultaneously extract entities with semantic relations from a document. Existing methods heavily rely on a substantial amount of fully labeled data. However, collecting and annotating data for newly emerging relations is time-consuming and labor-intensive. Recent advanced Large Language M&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13598v1-abstract-full').style.display = 'inline'; document.getElementById('2401.13598v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.13598v1-abstract-full" style="display: none;"> Document-level Relation Triplet Extraction (DocRTE) is a fundamental task in information systems that aims to simultaneously extract entities with semantic relations from a document. Existing methods heavily rely on a substantial amount of fully labeled data. However, collecting and annotating data for newly emerging relations is time-consuming and labor-intensive. Recent advanced Large Language Models (LLMs), such as ChatGPT and LLaMA, exhibit impressive long-text generation capabilities, inspiring us to explore an alternative approach for obtaining auto-labeled documents with new relations. In this paper, we propose a Zero-shot Document-level Relation Triplet Extraction (ZeroDocRTE) framework, which generates labeled data by retrieval and denoising knowledge from LLMs, called GenRDK. Specifically, we propose a chain-of-retrieval prompt to guide ChatGPT to generate labeled long-text data step by step. To improve the quality of synthetic data, we propose a denoising strategy based on the consistency of cross-document knowledge. Leveraging our denoised synthetic data, we proceed to fine-tune the LLaMA2-13B-Chat for extracting document-level relation triplets. We perform experiments for both zero-shot document-level relation and triplet extraction on two public datasets. The experimental results illustrate that our GenRDK framework outperforms strong baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13598v1-abstract-full').style.display = 'none'; document.getElementById('2401.13598v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08631">arXiv:2312.08631</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.08631">pdf</a>, <a href="https://arxiv.org/format/2312.08631">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-supervised Semantic Segmentation Meets Masked Modeling:Fine-grained Locality Learning Matters in Consistency Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+W">Wentao Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jiangpeng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zihan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+K">Raymond Kai-yu Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+J">Jianhua Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08631v1-abstract-short" style="display: inline;"> Semi-supervised semantic segmentation aims to utilize limited labeled images and abundant unlabeled images to achieve label-efficient learning, wherein the weak-to-strong consistency regularization framework, popularized by FixMatch, is widely used as a benchmark scheme. Despite its effectiveness, we observe that such scheme struggles with satisfactory segmentation for the local regions. This can&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08631v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08631v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08631v1-abstract-full" style="display: none;"> Semi-supervised semantic segmentation aims to utilize limited labeled images and abundant unlabeled images to achieve label-efficient learning, wherein the weak-to-strong consistency regularization framework, popularized by FixMatch, is widely used as a benchmark scheme. Despite its effectiveness, we observe that such scheme struggles with satisfactory segmentation for the local regions. This can be because it originally stems from the image classification task and lacks specialized mechanisms to capture fine-grained local semantics that prioritizes in dense prediction. To address this issue, we propose a novel framework called \texttt{MaskMatch}, which enables fine-grained locality learning to achieve better dense segmentation. On top of the original teacher-student framework, we design a masked modeling proxy task that encourages the student model to predict the segmentation given the unmasked image patches (even with 30\% only) and enforces the predictions to be consistent with pseudo-labels generated by the teacher model using the complete image. Such design is motivated by the intuition that if the predictions are more consistent given insufficient neighboring information, stronger fine-grained locality perception is achieved. Besides, recognizing the importance of reliable pseudo-labels in the above locality learning and the original consistency learning scheme, we design a multi-scale ensembling strategy that considers context at different levels of abstraction for pseudo-label generation. Extensive experiments on benchmark datasets demonstrate the superiority of our method against previous approaches and its plug-and-play flexibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08631v1-abstract-full').style.display = 'none'; document.getElementById('2312.08631v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.01099">arXiv:2312.01099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.01099">pdf</a>, <a href="https://arxiv.org/format/2312.01099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Multiple Instance Learning for Whole Slide Image Classification: A Bag-Level Classifier is a Good Instance-Level Teacher </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.01099v1-abstract-short" style="display: inline;"> Multiple Instance Learning (MIL) has demonstrated promise in Whole Slide Image (WSI) classification. However, a major challenge persists due to the high computational cost associated with processing these gigapixel images. Existing methods generally adopt a two-stage approach, comprising a non-learnable feature embedding stage and a classifier training stage. Though it can greatly reduce the memor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.01099v1-abstract-full').style.display = 'inline'; document.getElementById('2312.01099v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.01099v1-abstract-full" style="display: none;"> Multiple Instance Learning (MIL) has demonstrated promise in Whole Slide Image (WSI) classification. However, a major challenge persists due to the high computational cost associated with processing these gigapixel images. Existing methods generally adopt a two-stage approach, comprising a non-learnable feature embedding stage and a classifier training stage. Though it can greatly reduce the memory consumption by using a fixed feature embedder pre-trained on other domains, such scheme also results in a disparity between the two stages, leading to suboptimal classification accuracy. To address this issue, we propose that a bag-level classifier can be a good instance-level teacher. Based on this idea, we design Iteratively Coupled Multiple Instance Learning (ICMIL) to couple the embedder and the bag classifier at a low cost. ICMIL initially fix the patch embedder to train the bag classifier, followed by fixing the bag classifier to fine-tune the patch embedder. The refined embedder can then generate better representations in return, leading to a more accurate classifier for the next iteration. To realize more flexible and more effective embedder fine-tuning, we also introduce a teacher-student framework to efficiently distill the category knowledge in the bag classifier to help the instance-level embedder fine-tuning. Thorough experiments were conducted on four distinct datasets to validate the effectiveness of ICMIL. The experimental results consistently demonstrate that our method significantly improves the performance of existing MIL backbones, achieving state-of-the-art results. The code is available at: https://github.com/Dootmaan/ICMIL/tree/confidence_based <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.01099v1-abstract-full').style.display = 'none'; document.getElementById('2312.01099v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.04811">arXiv:2311.04811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.04811">pdf</a>, <a href="https://arxiv.org/format/2311.04811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image-Based Virtual Try-On: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+D">Dan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuanpu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Juan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+W">Weizhi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Kankanhalli%2C+M">Mohan Kankanhalli</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">An-An Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.04811v4-abstract-short" style="display: inline;"> Image-based virtual try-on aims to synthesize a naturally dressed person image with a clothing image, which revolutionizes online shopping and inspires related topics within image generation, showing both research significance and commercial potential. However, there is a gap between current research progress and commercial applications and an absence of comprehensive overview of this field to acc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04811v4-abstract-full').style.display = 'inline'; document.getElementById('2311.04811v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.04811v4-abstract-full" style="display: none;"> Image-based virtual try-on aims to synthesize a naturally dressed person image with a clothing image, which revolutionizes online shopping and inspires related topics within image generation, showing both research significance and commercial potential. However, there is a gap between current research progress and commercial applications and an absence of comprehensive overview of this field to accelerate the development.In this survey, we provide a comprehensive analysis of the state-of-the-art techniques and methodologies in aspects of pipeline architecture, person representation and key modules such as try-on indication, clothing warping and try-on stage. We additionally apply CLIP to assess the semantic alignment of try-on results, and evaluate representative methods with uniformly implemented evaluation metrics on the same dataset.In addition to quantitative and qualitative evaluation of current open-source methods, unresolved issues are highlighted and future research directions are prospected to identify key trends and inspire further exploration. The uniformly implemented evaluation metrics, dataset and collected methods will be made public available at https://github.com/little-misfit/Survey-Of-Virtual-Try-On. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04811v4-abstract-full').style.display = 'none'; document.getElementById('2311.04811v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 20 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03990">arXiv:2308.03990</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.03990">pdf</a>, <a href="https://arxiv.org/ps/2308.03990">ps</a>, <a href="https://arxiv.org/format/2308.03990">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> NEOLAF, an LLM-powered neural-symbolic cognitive architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+J">Richard Jiarui Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+C+C">Cassie Chen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+T+X">Timothy Xueqian Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+G">Guodong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+R">Ray Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Feiyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiangen Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Schmucker%2C+R">Robin Schmucker</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jinsheng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Quevedo%2C+J">Julian Quevedo</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03990v1-abstract-short" style="display: inline;"> This paper presents the Never Ending Open Learning Adaptive Framework (NEOLAF), an integrated neural-symbolic cognitive architecture that models and constructs intelligent agents. The NEOLAF framework is a superior approach to constructing intelligent agents than both the pure connectionist and pure symbolic approaches due to its explainability, incremental learning, efficiency, collaborative and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03990v1-abstract-full').style.display = 'inline'; document.getElementById('2308.03990v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03990v1-abstract-full" style="display: none;"> This paper presents the Never Ending Open Learning Adaptive Framework (NEOLAF), an integrated neural-symbolic cognitive architecture that models and constructs intelligent agents. The NEOLAF framework is a superior approach to constructing intelligent agents than both the pure connectionist and pure symbolic approaches due to its explainability, incremental learning, efficiency, collaborative and distributed learning, human-in-the-loop enablement, and self-improvement. The paper further presents a compelling experiment where a NEOLAF agent, built as a problem-solving agent, is fed with complex math problems from the open-source MATH dataset. The results demonstrate NEOLAF&#39;s superior learning capability and its potential to revolutionize the field of cognitive architectures and self-improving adaptive instructional systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03990v1-abstract-full').style.display = 'none'; document.getElementById('2308.03990v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18808">arXiv:2305.18808</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.18808">pdf</a>, <a href="https://arxiv.org/format/2305.18808">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CTSN: Predicting Cloth Deformation for Skeleton-based Characters with a Two-stream Skinning Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yudi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuangcai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yao Li</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+B">Bailin An</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+Q">Qilong Kou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18808v1-abstract-short" style="display: inline;"> We present a novel learning method to predict the cloth deformation for skeleton-based characters with a two-stream network. The characters processed in our approach are not limited to humans, and can be other skeletal-based representations of non-human targets such as fish or pets. We use a novel network architecture which consists of skeleton-based and mesh-based residual networks to learn the c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18808v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18808v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18808v1-abstract-full" style="display: none;"> We present a novel learning method to predict the cloth deformation for skeleton-based characters with a two-stream network. The characters processed in our approach are not limited to humans, and can be other skeletal-based representations of non-human targets such as fish or pets. We use a novel network architecture which consists of skeleton-based and mesh-based residual networks to learn the coarse and wrinkle features as the overall residual from the template cloth mesh. Our network is used to predict the deformation for loose or tight-fitting clothing or dresses. We ensure that the memory footprint of our network is low, and thereby result in reduced storage and computational requirements. In practice, our prediction for a single cloth mesh for the skeleton-based character takes about 7 milliseconds on an NVIDIA GeForce RTX 3090 GPU. Compared with prior methods, our network can generate fine deformation results with details and wrinkles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18808v1-abstract-full').style.display = 'none'; document.getElementById('2305.18808v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.07123">arXiv:2304.07123</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.07123">pdf</a>, <a href="https://arxiv.org/format/2304.07123">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tailored Multi-Organ Segmentation with Model Adaptation and Ensemble </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jiahua Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+G">Guohua Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chengtao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.07123v1-abstract-short" style="display: inline;"> Multi-organ segmentation, which identifies and separates different organs in medical images, is a fundamental task in medical image analysis. Recently, the immense success of deep learning motivated its wide adoption in multi-organ segmentation tasks. However, due to expensive labor costs and expertise, the availability of multi-organ annotations is usually limited and hence poses a challenge in o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.07123v1-abstract-full').style.display = 'inline'; document.getElementById('2304.07123v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.07123v1-abstract-full" style="display: none;"> Multi-organ segmentation, which identifies and separates different organs in medical images, is a fundamental task in medical image analysis. Recently, the immense success of deep learning motivated its wide adoption in multi-organ segmentation tasks. However, due to expensive labor costs and expertise, the availability of multi-organ annotations is usually limited and hence poses a challenge in obtaining sufficient training data for deep learning-based methods. In this paper, we aim to address this issue by combining off-the-shelf single-organ segmentation models to develop a multi-organ segmentation model on the target dataset, which helps get rid of the dependence on annotated data for multi-organ segmentation. To this end, we propose a novel dual-stage method that consists of a Model Adaptation stage and a Model Ensemble stage. The first stage enhances the generalization of each off-the-shelf segmentation model on the target domain, while the second stage distills and integrates knowledge from multiple adapted single-organ segmentation models. Extensive experiments on four abdomen datasets demonstrate that our proposed method can effectively leverage off-the-shelf single-organ segmentation models to obtain a tailored model for multi-organ segmentation with high accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.07123v1-abstract-full').style.display = 'none'; document.getElementById('2304.07123v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.15749">arXiv:2303.15749</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.15749">pdf</a>, <a href="https://arxiv.org/format/2303.15749">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Iteratively Coupled Multiple Instance Learning from Instance to Bag Classifier for Whole Slide Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.15749v2-abstract-short" style="display: inline;"> Whole Slide Image (WSI) classification remains a challenge due to their extremely high resolution and the absence of fine-grained labels. Presently, WSI classification is usually regarded as a Multiple Instance Learning (MIL) problem when only slide-level labels are available. MIL methods involve a patch embedding module and a bag-level classification module, but they are prohibitively expensive t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15749v2-abstract-full').style.display = 'inline'; document.getElementById('2303.15749v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.15749v2-abstract-full" style="display: none;"> Whole Slide Image (WSI) classification remains a challenge due to their extremely high resolution and the absence of fine-grained labels. Presently, WSI classification is usually regarded as a Multiple Instance Learning (MIL) problem when only slide-level labels are available. MIL methods involve a patch embedding module and a bag-level classification module, but they are prohibitively expensive to be trained in an end-to-end manner. Therefore, existing methods usually train them separately, or directly skip the training of the embedder. Such schemes hinder the patch embedder&#39;s access to slide-level semantic labels, resulting in inconsistency within the entire MIL pipeline. To overcome this issue, we propose a novel framework called Iteratively Coupled MIL (ICMIL), which bridges the loss back-propagation process from the bag-level classifier to the patch embedder. In ICMIL, we use category information in the bag-level classifier to guide the patch-level fine-tuning of the patch feature extractor. The refined embedder then generates better instance representations for achieving a more accurate bag-level classifier. By coupling the patch embedder and bag classifier at a low cost, our proposed framework enables information exchange between the two modules, benefiting the entire MIL classification model. We tested our framework on two datasets using three different backbones, and our experimental results demonstrate consistent performance improvements over state-of-the-art MIL methods. The code is available at: https://github.com/Dootmaan/ICMIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15749v2-abstract-full').style.display = 'none'; document.getElementById('2303.15749v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.14645">arXiv:2210.14645</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.14645">pdf</a>, <a href="https://arxiv.org/format/2210.14645">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Super-Resolution Based Patch-Free 3D Image Segmentation with High-Frequency Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qingqing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xian-Hua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.14645v2-abstract-short" style="display: inline;"> High resolution (HR) 3D images are widely used nowadays, such as medical images like Magnetic Resonance Imaging (MRI) and Computed Tomography (CT). However, segmentation of these 3D images remains a challenge due to their high spatial resolution and dimensionality in contrast to currently limited GPU memory. Therefore, most existing 3D image segmentation methods use patch-based models, which have&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14645v2-abstract-full').style.display = 'inline'; document.getElementById('2210.14645v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.14645v2-abstract-full" style="display: none;"> High resolution (HR) 3D images are widely used nowadays, such as medical images like Magnetic Resonance Imaging (MRI) and Computed Tomography (CT). However, segmentation of these 3D images remains a challenge due to their high spatial resolution and dimensionality in contrast to currently limited GPU memory. Therefore, most existing 3D image segmentation methods use patch-based models, which have low inference efficiency and ignore global contextual information. To address these problems, we propose a super-resolution (SR) based patch-free 3D image segmentation framework that can realize HR segmentation from a global-wise low-resolution (LR) input. The framework contains two sub-tasks, of which semantic segmentation is the main task and super resolution is an auxiliary task aiding in rebuilding the high frequency information from the LR input. To furthermore balance the information loss with the LR input, we propose a High-Frequency Guidance Module (HGM), and design an efficient selective cropping algorithm to crop an HR patch from the original image as restoration guidance for it. In addition, we also propose a Task-Fusion Module (TFM) to exploit the inter connections between segmentation and SR task, realizing joint optimization of the two tasks. When predicting, only the main segmentation task is needed, while other modules can be removed for acceleration. The experimental results on two different datasets show that our framework has a four times higher inference speed compared to traditional patch-based methods, while its performance also surpasses other patch-based and patch-free models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14645v2-abstract-full').style.display = 'none'; document.getElementById('2210.14645v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Version #2 uploaded in Jul 10, 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03603">arXiv:2210.03603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.03603">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2014-353">10.21437/Interspeech.2014-353 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Pronunciation Modeling of Foreign Words for Mandarin ASR by Considering the Effect of Language Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Rong Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03603v1-abstract-short" style="display: inline;"> One of the challenges in automatic speech recognition is foreign words recognition. It is observed that a speaker&#39;s pronunciation of a foreign word is influenced by his native language knowledge, and such phenomenon is known as the effect of language transfer. This paper focuses on examining the phonetic effect of language transfer in automatic speech recognition. A set of lexical rules is propose&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03603v1-abstract-full').style.display = 'inline'; document.getElementById('2210.03603v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03603v1-abstract-full" style="display: none;"> One of the challenges in automatic speech recognition is foreign words recognition. It is observed that a speaker&#39;s pronunciation of a foreign word is influenced by his native language knowledge, and such phenomenon is known as the effect of language transfer. This paper focuses on examining the phonetic effect of language transfer in automatic speech recognition. A set of lexical rules is proposed to convert an English word into Mandarin phonetic representation. In this way, a Mandarin lexicon can be augmented by including English words. Hence, the Mandarin ASR system becomes capable to recognize English words without retraining or re-estimation of the acoustic model parameters. Using the lexicon that derived from the proposed rules, the ASR performance of Mandarin English mixed speech is improved without harming the accuracy of Mandarin only speech. The proposed lexical rules are generalized and they can be directly applied to unseen English words. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03603v1-abstract-full').style.display = 'none'; document.getElementById('2210.03603v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published by INTERSPEECH 2014</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03580">arXiv:2210.03580</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.03580">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICOT.2017.8336109">10.1109/ICOT.2017.8336109 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Cloud-based Automatic Speech Recognition Systems for Southeast Asian Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Rong Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Leung%2C+C+C">Cheung Chi Leung</a>, <a href="/search/cs?searchtype=author&amp;query=Sivadas%2C+S">Sunil Sivadas</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+B">Bin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03580v1-abstract-short" style="display: inline;"> This paper provides an overall introduction of our Automatic Speech Recognition (ASR) systems for Southeast Asian languages. As not much existing work has been carried out on such regional languages, a few difficulties should be addressed before building the systems: limitation on speech and text resources, lack of linguistic knowledge, etc. This work takes Bahasa Indonesia and Thai as examples to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03580v1-abstract-full').style.display = 'inline'; document.getElementById('2210.03580v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03580v1-abstract-full" style="display: none;"> This paper provides an overall introduction of our Automatic Speech Recognition (ASR) systems for Southeast Asian languages. As not much existing work has been carried out on such regional languages, a few difficulties should be addressed before building the systems: limitation on speech and text resources, lack of linguistic knowledge, etc. This work takes Bahasa Indonesia and Thai as examples to illustrate the strategies of collecting various resources required for building ASR systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03580v1-abstract-full').style.display = 'none'; document.getElementById('2210.03580v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published by the 2017 IEEE International Conference on Orange Technologies (ICOT 2017)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.14552">arXiv:2207.14552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.14552">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ScaleFormer: Revisiting the Transformer-based Backbones from a Scale-wise Perspective for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie1%2C+S">Shiao Xie1</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianhua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.14552v1-abstract-short" style="display: inline;"> Recently, a variety of vision transformers have been developed as their capability of modeling long-range dependency. In current transformer-based backbones for medical image segmentation, convolutional layers were replaced with pure transformers, or transformers were added to the deepest encoder to learn global context. However, there are mainly two challenges in a scale-wise perspective: (1) int&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14552v1-abstract-full').style.display = 'inline'; document.getElementById('2207.14552v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.14552v1-abstract-full" style="display: none;"> Recently, a variety of vision transformers have been developed as their capability of modeling long-range dependency. In current transformer-based backbones for medical image segmentation, convolutional layers were replaced with pure transformers, or transformers were added to the deepest encoder to learn global context. However, there are mainly two challenges in a scale-wise perspective: (1) intra-scale problem: the existing methods lacked in extracting local-global cues in each scale, which may impact the signal propagation of small objects; (2) inter-scale problem: the existing methods failed to explore distinctive information from multiple scales, which may hinder the representation learning from objects with widely variable size, shape and location. To address these limitations, we propose a novel backbone, namely ScaleFormer, with two appealing designs: (1) A scale-wise intra-scale transformer is designed to couple the CNN-based local features with the transformer-based global cues in each scale, where the row-wise and column-wise global dependencies can be extracted by a lightweight Dual-Axis MSA. (2) A simple and effective spatial-aware inter-scale transformer is designed to interact among consensual regions in multiple scales, which can highlight the cross-scale dependency and resolve the complex scale variations. Experimental results on different benchmarks demonstrate that our Scale-Former outperforms the current state-of-the-art methods. The code is publicly available at: https://github.com/ZJUGiveLab/ScaleFormer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14552v1-abstract-full').style.display = 'none'; document.getElementById('2207.14552v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IJCAI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.03951">arXiv:2203.03951</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.03951">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Accurate Hyperspectral Pansharpening Using 3D VolumeNet and 2.5D Texture Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Nakamura%2C+R">Ryousuke Nakamura</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.03951v1-abstract-short" style="display: inline;"> Recently, convolutional neural networks (CNN) have obtained promising results in single-image SR for hyperspectral pansharpening. However, enhancing CNNs&#39; representation ability with fewer parameters and a shorter prediction time is a challenging and critical task. In this paper, we propose a novel multi-spectral image fusion method using a combination of the previously proposed 3D CNN model Volum&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03951v1-abstract-full').style.display = 'inline'; document.getElementById('2203.03951v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.03951v1-abstract-full" style="display: none;"> Recently, convolutional neural networks (CNN) have obtained promising results in single-image SR for hyperspectral pansharpening. However, enhancing CNNs&#39; representation ability with fewer parameters and a shorter prediction time is a challenging and critical task. In this paper, we propose a novel multi-spectral image fusion method using a combination of the previously proposed 3D CNN model VolumeNet and 2.5D texture transfer method using other modality high resolution (HR) images. Since a multi-spectral (MS) image consists of several bands and each band is a 2D image slice, MS images can be seen as 3D data. Thus, we use the previously proposed VolumeNet to fuse HR panchromatic (PAN) images and bicubic interpolated MS images. Because the proposed 3D VolumeNet can effectively improve the accuracy by expanding the receptive field of the model, and due to its lightweight structure, we can achieve better performance against the existing method without purchasing a large number of remote sensing images for training. In addition, VolumeNet can restore the high-frequency information lost in the HR MR image as much as possible, reducing the difficulty of feature extraction in the following step: 2.5D texture transfer. As one of the latest technologies, deep learning-based texture transfer has been demonstrated to effectively and efficiently improve the visual performance and quality evaluation indicators of image reconstruction. Different from the texture transfer processing of RGB image, we use HR PAN images as the reference images and perform texture transfer for each frequency band of MS images, which is named 2.5D texture transfer. The experimental results show that the proposed method outperforms the existing methods in terms of objective accuracy assessment, method efficiency, and visual subjective evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03951v1-abstract-full').style.display = 'none'; document.getElementById('2203.03951v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.13310">arXiv:2202.13310</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.13310">pdf</a>, <a href="https://arxiv.org/format/2202.13310">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Attention-based Cross-Layer Domain Alignment for Unsupervised Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Junkun Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.13310v1-abstract-short" style="display: inline;"> Unsupervised domain adaptation (UDA) aims to learn transferable knowledge from a labeled source domain and adapts a trained model to an unlabeled target domain. To bridge the gap between source and target domains, one prevailing strategy is to minimize the distribution discrepancy by aligning their semantic features extracted by deep models. The existing alignment-based methods mainly focus on red&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13310v1-abstract-full').style.display = 'inline'; document.getElementById('2202.13310v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.13310v1-abstract-full" style="display: none;"> Unsupervised domain adaptation (UDA) aims to learn transferable knowledge from a labeled source domain and adapts a trained model to an unlabeled target domain. To bridge the gap between source and target domains, one prevailing strategy is to minimize the distribution discrepancy by aligning their semantic features extracted by deep models. The existing alignment-based methods mainly focus on reducing domain divergence in the same model layer. However, the same level of semantic information could distribute across model layers due to the domain shifts. To further boost model adaptation performance, we propose a novel method called Attention-based Cross-layer Domain Alignment (ACDA), which captures the semantic relationship between the source and target domains across model layers and calibrates each level of semantic information automatically through a dynamic attention mechanism. An elaborate attention mechanism is designed to reweight each cross-layer pair based on their semantic similarity for precise domain alignment, effectively matching each level of semantic information during model adaptation. Extensive experiments on multiple benchmark datasets consistently show that the proposed method ACDA yields state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13310v1-abstract-full').style.display = 'none'; document.getElementById('2202.13310v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Neurocomputing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.06500">arXiv:2201.06500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.06500">pdf</a>, <a href="https://arxiv.org/format/2201.06500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Growing Neural Network with Shared Parameter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruilin Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.06500v1-abstract-short" style="display: inline;"> We propose a general method for growing neural network with shared parameter by matching trained network to new input. By leveraging Hoeffding&#39;s inequality, we provide a theoretical base for improving performance by adding subnetwork to existing network. With the theoretical base of adding new subnetwork, we implement a matching method to apply trained subnetwork of existing network to new input.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06500v1-abstract-full').style.display = 'inline'; document.getElementById('2201.06500v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.06500v1-abstract-full" style="display: none;"> We propose a general method for growing neural network with shared parameter by matching trained network to new input. By leveraging Hoeffding&#39;s inequality, we provide a theoretical base for improving performance by adding subnetwork to existing network. With the theoretical base of adding new subnetwork, we implement a matching method to apply trained subnetwork of existing network to new input. Our method has shown the ability to improve performance with higher parameter efficiency. It can also be applied to trans-task case and realize transfer learning by changing the combination of subnetworks without training on new task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06500v1-abstract-full').style.display = 'none'; document.getElementById('2201.06500v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.06397">arXiv:2112.06397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.06397">pdf</a>, <a href="https://arxiv.org/format/2112.06397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> N-Cloth: Predicting 3D Cloth Deformation with Mesh-Based Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yudi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuangcai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Manocha%2C+D">Dinesh Manocha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.06397v3-abstract-short" style="display: inline;"> We present a novel mesh-based learning approach (N-Cloth) for plausible 3D cloth deformation prediction. Our approach is general and can handle cloth or obstacles represented by triangle meshes with arbitrary topologies. We use graph convolution to transform the cloth and object meshes into a latent space to reduce the non-linearity in the mesh space. Our network can predict the target 3D cloth me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06397v3-abstract-full').style.display = 'inline'; document.getElementById('2112.06397v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.06397v3-abstract-full" style="display: none;"> We present a novel mesh-based learning approach (N-Cloth) for plausible 3D cloth deformation prediction. Our approach is general and can handle cloth or obstacles represented by triangle meshes with arbitrary topologies. We use graph convolution to transform the cloth and object meshes into a latent space to reduce the non-linearity in the mesh space. Our network can predict the target 3D cloth mesh deformation based on the initial state of the cloth mesh template and the target obstacle mesh. Our approach can handle complex cloth meshes with up to 100K triangles and scenes with various objects corresponding to SMPL humans, non-SMPL humans or rigid bodies. In practice, our approach can be used to generate plausible cloth simulation at 30-45 fps on an NVIDIA GeForce RTX 3090 GPU. We highlight its benefits over prior learning-based methods and physically-based cloth simulators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06397v3-abstract-full').style.display = 'none'; document.getElementById('2112.06397v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.02238">arXiv:2112.02238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.02238">pdf</a>, <a href="https://arxiv.org/format/2112.02238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sphere Face Model:A 3D Morphable Model with Hypersphere Manifold Latent Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Diqiong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yiwei Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fanglue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhe Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.02238v1-abstract-short" style="display: inline;"> 3D Morphable Models (3DMMs) are generative models for face shape and appearance. However, the shape parameters of traditional 3DMMs satisfy the multivariate Gaussian distribution while the identity embeddings satisfy the hypersphere distribution, and this conflict makes it challenging for face reconstruction models to preserve the faithfulness and the shape consistency simultaneously. To address t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02238v1-abstract-full').style.display = 'inline'; document.getElementById('2112.02238v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.02238v1-abstract-full" style="display: none;"> 3D Morphable Models (3DMMs) are generative models for face shape and appearance. However, the shape parameters of traditional 3DMMs satisfy the multivariate Gaussian distribution while the identity embeddings satisfy the hypersphere distribution, and this conflict makes it challenging for face reconstruction models to preserve the faithfulness and the shape consistency simultaneously. To address this issue, we propose the Sphere Face Model(SFM), a novel 3DMM for monocular face reconstruction, which can preserve both shape fidelity and identity consistency. The core of our SFM is the basis matrix which can be used to reconstruct 3D face shapes, and the basic matrix is learned by adopting a two-stage training approach where 3D and 2D training data are used in the first and second stages, respectively. To resolve the distribution mismatch, we design a novel loss to make the shape parameters have a hyperspherical latent space. Extensive experiments show that SFM has high representation ability and shape parameter space&#39;s clustering performance. Moreover, it produces fidelity face shapes, and the shapes are consistent in challenging conditions in monocular face reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02238v1-abstract-full').style.display = 'none'; document.getElementById('2112.02238v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.04734">arXiv:2111.04734</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.04734">pdf</a>, <a href="https://arxiv.org/format/2111.04734">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mixed Transformer U-Net For Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Shiao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xian-Hua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.04734v2-abstract-short" style="display: inline;"> Though U-Net has achieved tremendous success in medical image segmentation tasks, it lacks the ability to explicitly model long-range dependencies. Therefore, Vision Transformers have emerged as alternative segmentation structures recently, for their innate ability of capturing long-range correlations through Self-Attention (SA). However, Transformers usually rely on large-scale pre-training and h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04734v2-abstract-full').style.display = 'inline'; document.getElementById('2111.04734v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.04734v2-abstract-full" style="display: none;"> Though U-Net has achieved tremendous success in medical image segmentation tasks, it lacks the ability to explicitly model long-range dependencies. Therefore, Vision Transformers have emerged as alternative segmentation structures recently, for their innate ability of capturing long-range correlations through Self-Attention (SA). However, Transformers usually rely on large-scale pre-training and have high computational complexity. Furthermore, SA can only model self-affinities within a single sample, ignoring the potential correlations of the overall dataset. To address these problems, we propose a novel Transformer module named Mixed Transformer Module (MTM) for simultaneous inter- and intra- affinities learning. MTM first calculates self-affinities efficiently through our well-designed Local-Global Gaussian-Weighted Self-Attention (LGG-SA). Then, it mines inter-connections between data samples through External Attention (EA). By using MTM, we construct a U-shaped model named Mixed Transformer U-Net (MT-UNet) for accurate medical image segmentation. We test our method on two different public datasets, and the experimental results show that the proposed method achieves better performance over other state-of-the-art methods. The code is available at: https://github.com/Dootmaan/MT-UNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04734v2-abstract-full').style.display = 'none'; document.getElementById('2111.04734v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.13930">arXiv:2109.13930</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.13930">pdf</a>, <a href="https://arxiv.org/format/2109.13930">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> All-Around Real Label Supervision: Cyclic Prototype Consistency Learning for Semi-supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+D">Donghuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lequan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jiangpeng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jie Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+K">Kai Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yefeng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+K">Raymond Kai-yu Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.13930v2-abstract-short" style="display: inline;"> Semi-supervised learning has substantially advanced medical image segmentation since it alleviates the heavy burden of acquiring the costly expert-examined annotations. Especially, the consistency-based approaches have attracted more attention for their superior performance, wherein the real labels are only utilized to supervise their paired images via supervised loss while the unlabeled images ar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.13930v2-abstract-full').style.display = 'inline'; document.getElementById('2109.13930v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.13930v2-abstract-full" style="display: none;"> Semi-supervised learning has substantially advanced medical image segmentation since it alleviates the heavy burden of acquiring the costly expert-examined annotations. Especially, the consistency-based approaches have attracted more attention for their superior performance, wherein the real labels are only utilized to supervise their paired images via supervised loss while the unlabeled images are exploited by enforcing the perturbation-based \textit{&#34;unsupervised&#34;} consistency without explicit guidance from those real labels. However, intuitively, the expert-examined real labels contain more reliable supervision signals. Observing this, we ask an unexplored but interesting question: can we exploit the unlabeled data via explicit real label supervision for semi-supervised training? To this end, we discard the previous perturbation-based consistency but absorb the essence of non-parametric prototype learning. Based on the prototypical network, we then propose a novel cyclic prototype consistency learning (CPCL) framework, which is constructed by a labeled-to-unlabeled (L2U) prototypical forward process and an unlabeled-to-labeled (U2L) backward process. Such two processes synergistically enhance the segmentation network by encouraging more discriminative and compact features. In this way, our framework turns previous \textit{&#34;unsupervised&#34;} consistency into new \textit{&#34;supervised&#34;} consistency, obtaining the \textit{&#34;all-around real label supervision&#34;} property of our method. Extensive experiments on brain tumor segmentation from MRI and kidney segmentation from CT images show that our CPCL can effectively exploit the unlabeled data and outperform other state-of-the-art semi-supervised medical image segmentation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.13930v2-abstract-full').style.display = 'none'; document.getElementById('2109.13930v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.00911">arXiv:2108.00911</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.00911">pdf</a>, <a href="https://arxiv.org/ps/2108.00911">ps</a>, <a href="https://arxiv.org/format/2108.00911">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-phase Liver Tumor Segmentation with Spatial Aggregation and Uncertain Region Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chengtao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+L">Liying Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingsong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qingqing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyi Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.00911v2-abstract-short" style="display: inline;"> Multi-phase computed tomography (CT) images provide crucial complementary information for accurate liver tumor segmentation (LiTS). State-of-the-art multi-phase LiTS methods usually fused cross-phase features through phase-weighted summation or channel-attention based concatenation. However, these methods ignored the spatial (pixel-wise) relationships between different phases, hence leading to ins&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00911v2-abstract-full').style.display = 'inline'; document.getElementById('2108.00911v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.00911v2-abstract-full" style="display: none;"> Multi-phase computed tomography (CT) images provide crucial complementary information for accurate liver tumor segmentation (LiTS). State-of-the-art multi-phase LiTS methods usually fused cross-phase features through phase-weighted summation or channel-attention based concatenation. However, these methods ignored the spatial (pixel-wise) relationships between different phases, hence leading to insufficient feature integration. In addition, the performance of existing methods remains subject to the uncertainty in segmentation, which is particularly acute in tumor boundary regions. In this work, we propose a novel LiTS method to adequately aggregate multi-phase information and refine uncertain region segmentation. To this end, we introduce a spatial aggregation module (SAM), which encourages per-pixel interactions between different phases, to make full use of cross-phase information. Moreover, we devise an uncertain region inpainting module (URIM) to refine uncertain pixels using neighboring discriminative features. Experiments on an in-house multi-phase CT dataset of focal liver lesions (MPCT-FLLs) demonstrate that our method achieves promising liver tumor segmentation and outperforms state-of-the-arts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00911v2-abstract-full').style.display = 'none'; document.getElementById('2108.00911v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in MICCAI 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.02433">arXiv:2107.02433</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.02433">pdf</a>, <a href="https://arxiv.org/format/2107.02433">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Double-Uncertainty Guided Spatial and Temporal Consistency Regularization Weighting for Learning-based Abdominal Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jie Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+D">Donghuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jiangpeng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Frisken%2C+S">Sarah Frisken</a>, <a href="/search/cs?searchtype=author&amp;query=Jagadeesan%2C+J">Jayender Jagadeesan</a>, <a href="/search/cs?searchtype=author&amp;query=Wells%2C+W">William Wells III</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yefeng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Raymond Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.02433v3-abstract-short" style="display: inline;"> In order to tackle the difficulty associated with the ill-posed nature of the image registration problem, regularization is often used to constrain the solution space. For most learning-based registration approaches, the regularization usually has a fixed weight and only constrains the spatial transformation. Such convention has two limitations: (i) Besides the laborious grid search for the optima&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02433v3-abstract-full').style.display = 'inline'; document.getElementById('2107.02433v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.02433v3-abstract-full" style="display: none;"> In order to tackle the difficulty associated with the ill-posed nature of the image registration problem, regularization is often used to constrain the solution space. For most learning-based registration approaches, the regularization usually has a fixed weight and only constrains the spatial transformation. Such convention has two limitations: (i) Besides the laborious grid search for the optimal fixed weight, the regularization strength of a specific image pair should be associated with the content of the images, thus the &#34;one value fits all&#34; training scheme is not ideal; (ii) Only spatially regularizing the transformation may neglect some informative clues related to the ill-posedness. In this study, we propose a mean-teacher based registration framework, which incorporates an additional temporal consistency regularization term by encouraging the teacher model&#39;s prediction to be consistent with that of the student model. More importantly, instead of searching for a fixed weight, the teacher enables automatically adjusting the weights of the spatial regularization and the temporal consistency regularization by taking advantage of the transformation uncertainty and appearance uncertainty. Extensive experiments on the challenging abdominal CT-MRI registration show that our training strategy can promisingly advance the original learning-based method in terms of efficient hyperparameter tuning and a better tradeoff between accuracy and smoothness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02433v3-abstract-full').style.display = 'none'; document.getElementById('2107.02433v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.03515">arXiv:2104.03515</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.03515">pdf</a>, <a href="https://arxiv.org/format/2104.03515">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Reconstructing Recognizable 3D Face Shapes based on 3D Morphable Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Diqiong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yiwei Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fanglue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yai%2C+Y">Yukun Yai</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+R">Risheng Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.03515v2-abstract-short" style="display: inline;"> Many recent works have reconstructed distinctive 3D face shapes by aggregating shape parameters of the same identity and separating those of different people based on parametric models (e.g., 3D morphable models (3DMMs)). However, despite the high accuracy in the face recognition task using these shape parameters, the visual discrimination of face shapes reconstructed from those parameters is unsa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03515v2-abstract-full').style.display = 'inline'; document.getElementById('2104.03515v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.03515v2-abstract-full" style="display: none;"> Many recent works have reconstructed distinctive 3D face shapes by aggregating shape parameters of the same identity and separating those of different people based on parametric models (e.g., 3D morphable models (3DMMs)). However, despite the high accuracy in the face recognition task using these shape parameters, the visual discrimination of face shapes reconstructed from those parameters is unsatisfactory. The following research question has not been answered in previous works: Do discriminative shape parameters guarantee visual discrimination in represented 3D face shapes? This paper analyzes the relationship between shape parameters and reconstructed shape geometry and proposes a novel shape identity-aware regularization(SIR) loss for shape parameters, aiming at increasing discriminability in both the shape parameter and shape geometry domains. Moreover, to cope with the lack of training data containing both landmark and identity annotations, we propose a network structure and an associated training strategy to leverage mixed data containing either identity or landmark labels. We compare our method with existing methods in terms of the reconstruction error, visual distinguishability, and face recognition accuracy of the shape parameters. Experimental results show that our method outperforms the state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03515v2-abstract-full').style.display = 'none'; document.getElementById('2104.03515v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.04235">arXiv:2103.04235</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.04235">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Graph-based Pyramid Global Context Reasoning with a Saliency-aware Projection for COVID-19 Lung Infections Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+M">Ming Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+J">Jing Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiongwei Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xiaohan Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jianying Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xian-Hua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.04235v1-abstract-short" style="display: inline;"> Coronavirus Disease 2019 (COVID-19) has rapidly spread in 2020, emerging a mass of studies for lung infection segmentation from CT images. Though many methods have been proposed for this issue, it is a challenging task because of infections of various size appearing in different lobe zones. To tackle these issues, we propose a Graph-based Pyramid Global Context Reasoning (Graph-PGCR) module, which&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.04235v1-abstract-full').style.display = 'inline'; document.getElementById('2103.04235v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.04235v1-abstract-full" style="display: none;"> Coronavirus Disease 2019 (COVID-19) has rapidly spread in 2020, emerging a mass of studies for lung infection segmentation from CT images. Though many methods have been proposed for this issue, it is a challenging task because of infections of various size appearing in different lobe zones. To tackle these issues, we propose a Graph-based Pyramid Global Context Reasoning (Graph-PGCR) module, which is capable of modeling long-range dependencies among disjoint infections as well as adapt size variation. We first incorporate graph convolution to exploit long-term contextual information from multiple lobe zones. Different from previous average pooling or maximum object probability, we propose a saliency-aware projection mechanism to pick up infection-related pixels as a set of graph nodes. After graph reasoning, the relation-aware features are reversed back to the original coordinate space for the down-stream tasks. We further construct multiple graphs with different sampling rates to handle the size variation problem. To this end, distinct multi-scale long-range contextual patterns can be captured. Our Graph-PGCR module is plug-and-play, which can be integrated into any architecture to improve its performance. Experiments demonstrated that the proposed method consistently boost the performance of state-of-the-art backbone architectures on both of public and our private COVID-19 datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.04235v1-abstract-full').style.display = 'none'; document.getElementById('2103.04235v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.00274">arXiv:2103.00274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.00274">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1002/mp.14922">10.1002/mp.14922 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> PA-ResSeg: A Phase Attention Residual Network for Liver Tumor Segmentation from Multi-phase CT Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yingying Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+M">Ming Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiaowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qingqing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiongwei Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xian-Hua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.00274v1-abstract-short" style="display: inline;"> In this paper, we propose a phase attention residual network (PA-ResSeg) to model multi-phase features for accurate liver tumor segmentation, in which a phase attention (PA) is newly proposed to additionally exploit the images of arterial (ART) phase to facilitate the segmentation of portal venous (PV) phase. The PA block consists of an intra-phase attention (Intra-PA) module and an inter-phase at&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.00274v1-abstract-full').style.display = 'inline'; document.getElementById('2103.00274v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.00274v1-abstract-full" style="display: none;"> In this paper, we propose a phase attention residual network (PA-ResSeg) to model multi-phase features for accurate liver tumor segmentation, in which a phase attention (PA) is newly proposed to additionally exploit the images of arterial (ART) phase to facilitate the segmentation of portal venous (PV) phase. The PA block consists of an intra-phase attention (Intra-PA) module and an inter-phase attention (Inter-PA) module to capture channel-wise self-dependencies and cross-phase interdependencies, respectively. Thus it enables the network to learn more representative multi-phase features by refining the PV features according to the channel dependencies and recalibrating the ART features based on the learned interdependencies between phases. We propose a PA-based multi-scale fusion (MSF) architecture to embed the PA blocks in the network at multiple levels along the encoding path to fuse multi-scale features from multi-phase images. Moreover, a 3D boundary-enhanced loss (BE-loss) is proposed for training to make the network more sensitive to boundaries. To evaluate the performance of our proposed PA-ResSeg, we conducted experiments on a multi-phase CT dataset of focal liver lesions (MPCT-FLLs). Experimental results show the effectiveness of the proposed method by achieving a dice per case (DPC) of 0.77.87, a dice global (DG) of 0.8682, a volumetric overlap error (VOE) of 0.3328 and a relative volume difference (RVD) of 0.0443 on the MPCT-FLLs. Furthermore, to validate the effectiveness and robustness of PA-ResSeg, we conducted extra experiments on another multi-phase liver tumor dataset and obtained a DPC of 0.8290, a DG of 0.9132, a VOE of 0.2637 and a RVD of 0.0163. The proposed method shows its robustness and generalization capability in different datasets and different backbones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.00274v1-abstract-full').style.display = 'none'; document.getElementById('2103.00274v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A self-archive version to be published in Medical Physics, awaiting minor revision</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.11657">arXiv:2010.11657</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.11657">pdf</a>, <a href="https://arxiv.org/format/2010.11657">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The HUAWEI Speaker Diarisation System for the VoxCeleb Speaker Diarisation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Renyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruilin Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Yeung%2C+Y+T">Yu Ting Yeung</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.11657v2-abstract-short" style="display: inline;"> This paper describes system setup of our submission to speaker diarisation track (Track 4) of VoxCeleb Speaker Recognition Challenge 2020. Our diarisation system consists of a well-trained neural network based speech enhancement model as pre-processing front-end of input speech signals. We replace conventional energy-based voice activity detection (VAD) with a neural network based VAD. The neural&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11657v2-abstract-full').style.display = 'inline'; document.getElementById('2010.11657v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.11657v2-abstract-full" style="display: none;"> This paper describes system setup of our submission to speaker diarisation track (Track 4) of VoxCeleb Speaker Recognition Challenge 2020. Our diarisation system consists of a well-trained neural network based speech enhancement model as pre-processing front-end of input speech signals. We replace conventional energy-based voice activity detection (VAD) with a neural network based VAD. The neural network based VAD provides more accurate annotation of speech segments containing only background music, noise, and other interference, which is crucial to diarisation performance. We apply agglomerative hierarchical clustering (AHC) of x-vectors and variational Bayesian hidden Markov model (VB-HMM) based iterative clustering for speaker clustering. Experimental results demonstrate that our proposed system achieves substantial improvements over the baseline system, yielding diarisation error rate (DER) of 10.45%, and Jacard error rate (JER) of 22.46% on the evaluation set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11657v2-abstract-full').style.display = 'none'; document.getElementById('2010.11657v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, A report about our diarisation system for VoxCeleb Challenge, Interspeech conference workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.00409">arXiv:2008.00409</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.00409">pdf</a>, <a href="https://arxiv.org/format/2008.00409">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> P-Cloth: Interactive Complex Cloth Simulation on Multi-GPU Systems using Dynamic Matrix Assembly and Pipelined Implicit Integrators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+M">Ming Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jieyi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Manocha%2C+D">Dinesh Manocha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.00409v2-abstract-short" style="display: inline;"> We present a novel parallel algorithm for cloth simulation that exploits multiple GPUs for fast computation and the handling of very high resolution meshes. To accelerate implicit integration, we describe new parallel algorithms for sparse matrix-vector multiplication (SpMV) and for dynamic matrix assembly on a multi-GPU workstation. Our algorithms use a novel work queue generation scheme for a fa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00409v2-abstract-full').style.display = 'inline'; document.getElementById('2008.00409v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.00409v2-abstract-full" style="display: none;"> We present a novel parallel algorithm for cloth simulation that exploits multiple GPUs for fast computation and the handling of very high resolution meshes. To accelerate implicit integration, we describe new parallel algorithms for sparse matrix-vector multiplication (SpMV) and for dynamic matrix assembly on a multi-GPU workstation. Our algorithms use a novel work queue generation scheme for a fat-tree GPU interconnect topology. Furthermore, we present a novel collision handling scheme that uses spatial hashing for discrete and continuous collision detection along with a non-linear impact zone solver. Our parallel schemes can distribute the computation and storage overhead among multiple GPUs and enable us to perform almost interactive simulation on complex cloth meshes, which can hardly be handled on a single GPU due to memory limitations. We have evaluated the performance with two multi-GPU workstations (with 4 and 8 GPUs, respectively) on cloth meshes with 0.5-1.65M triangles. Our approach can reliably handle the collisions and generate vivid wrinkles and folds at 2-5 fps, which is significantly faster than prior cloth simulation systems. We observe almost linear speedups with respect to the number of GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00409v2-abstract-full').style.display = 'none'; document.getElementById('2008.00409v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.15320">arXiv:2006.15320</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.15320">pdf</a>, <a href="https://arxiv.org/format/2006.15320">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Deep Refinement Network for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kitrungrotsakul%2C+T">Titinunt Kitrungrotsakul</a>, <a href="/search/cs?searchtype=author&amp;query=Yutaro%2C+I">Iwamoto Yutaro</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingsong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.15320v1-abstract-short" style="display: inline;"> Deep learning techniques have successfully been employed in numerous computer vision tasks including image segmentation. The techniques have also been applied to medical image segmentation, one of the most critical tasks in computer-aided diagnosis. Compared with natural images, the medical image is a gray-scale image with low-contrast (even with some invisible parts). Because some organs have sim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.15320v1-abstract-full').style.display = 'inline'; document.getElementById('2006.15320v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.15320v1-abstract-full" style="display: none;"> Deep learning techniques have successfully been employed in numerous computer vision tasks including image segmentation. The techniques have also been applied to medical image segmentation, one of the most critical tasks in computer-aided diagnosis. Compared with natural images, the medical image is a gray-scale image with low-contrast (even with some invisible parts). Because some organs have similar intensity and texture with neighboring organs, there is usually a need to refine automatic segmentation results. In this paper, we propose an interactive deep refinement framework to improve the traditional semantic segmentation networks such as U-Net and fully convolutional network. In the proposed framework, we added a refinement network to traditional segmentation network to refine the segmentation results.Experimental results with public dataset revealed that the proposed method could achieve higher accuracy than other state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.15320v1-abstract-full').style.display = 'none'; document.getElementById('2006.15320v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.08790">arXiv:2004.08790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2004.08790">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hongjie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiaowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Iwamoto%2C+Y">Yutaro Iwamoto</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianhua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yen-Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.08790v1-abstract-short" style="display: inline;"> Recently, a growing interest has been seen in deep learning-based semantic segmentation. UNet, which is one of deep learning networks with an encoder-decoder architecture, is widely used in medical image segmentation. Combining multi-scale features is one of important factors for accurate segmentation. UNet++ was developed as a modified Unet by designing an architecture with nested and dense skip&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.08790v1-abstract-full').style.display = 'inline'; document.getElementById('2004.08790v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.08790v1-abstract-full" style="display: none;"> Recently, a growing interest has been seen in deep learning-based semantic segmentation. UNet, which is one of deep learning networks with an encoder-decoder architecture, is widely used in medical image segmentation. Combining multi-scale features is one of important factors for accurate segmentation. UNet++ was developed as a modified Unet by designing an architecture with nested and dense skip connections. However, it does not explore sufficient information from full scales and there is still a large room for improvement. In this paper, we propose a novel UNet 3+, which takes advantage of full-scale skip connections and deep supervisions. The full-scale skip connections incorporate low-level details with high-level semantics from feature maps in different scales; while the deep supervision learns hierarchical representations from the full-scale aggregated feature maps. The proposed method is especially benefiting for organs that appear at varying scales. In addition to accuracy improvements, the proposed UNet 3+ can reduce the network parameters to improve the computation efficiency. We further propose a hybrid loss function and devise a classification-guided module to enhance the organ boundary and reduce the over-segmentation in a non-organ image, yielding more accurate segmentation results. The effectiveness of the proposed method is demonstrated on two datasets. The code is available at: github.com/ZJUGiveLab/UNet-Version <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.08790v1-abstract-full').style.display = 'none'; document.getElementById('2004.08790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.06078">arXiv:1910.06078</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.06078">pdf</a>, <a href="https://arxiv.org/format/1910.06078">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> MUTLA: A Large-Scale Dataset for Multimodal Teaching and Learning Analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Fangli Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lingfei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Thai%2C+K">KP Thai</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+C">Carol Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Richard Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.06078v2-abstract-short" style="display: inline;"> Automatic analysis of teacher and student interactions could be very important to improve the quality of teaching and student engagement. However, despite some recent progress in utilizing multimodal data for teaching and learning analytics, a thorough analysis of a rich multimodal dataset coming for a complex real learning environment has yet to be done. To bridge this gap, we present a large-sca&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.06078v2-abstract-full').style.display = 'inline'; document.getElementById('1910.06078v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.06078v2-abstract-full" style="display: none;"> Automatic analysis of teacher and student interactions could be very important to improve the quality of teaching and student engagement. However, despite some recent progress in utilizing multimodal data for teaching and learning analytics, a thorough analysis of a rich multimodal dataset coming for a complex real learning environment has yet to be done. To bridge this gap, we present a large-scale MUlti-modal Teaching and Learning Analytics (MUTLA) dataset. This dataset includes time-synchronized multimodal data records of students (learning logs, videos, EEG brainwaves) as they work in various subjects from Squirrel AI Learning System (SAIL) to solve problems of varying difficulty levels. The dataset resources include user records from the learner records store of SAIL, brainwave data collected by EEG headset devices, and video data captured by web cameras while students worked in the SAIL products. Our hope is that by analyzing real-world student learning activities, facial expressions, and brainwave patterns, researchers can better predict engagement, which can then be used to improve adaptive learning selection and student learning outcomes. An additional goal is to provide a dataset gathered from real-world educational activities versus those from controlled lab environments to benefit the educational learning community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.06078v2-abstract-full').style.display = 'none'; document.getElementById('1910.06078v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3 pages, 1 figure, 2 tables workshop paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.04818">arXiv:1808.04818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1808.04818">pdf</a>, <a href="https://arxiv.org/format/1808.04818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multispectral Pedestrian Detection via Simultaneous Detection and Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+D">Dan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.04818v1-abstract-short" style="display: inline;"> Multispectral pedestrian detection has attracted increasing attention from the research community due to its crucial competence for many around-the-clock applications (e.g., video surveillance and autonomous driving), especially under insufficient illumination conditions. We create a human baseline over the KAIST dataset and reveal that there is still a large gap between current top detectors and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.04818v1-abstract-full').style.display = 'inline'; document.getElementById('1808.04818v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.04818v1-abstract-full" style="display: none;"> Multispectral pedestrian detection has attracted increasing attention from the research community due to its crucial competence for many around-the-clock applications (e.g., video surveillance and autonomous driving), especially under insufficient illumination conditions. We create a human baseline over the KAIST dataset and reveal that there is still a large gap between current top detectors and human performance. To narrow this gap, we propose a network fusion architecture, which consists of a multispectral proposal network to generate pedestrian proposals, and a subsequent multispectral classification network to distinguish pedestrian instances from hard negatives. The unified network is learned by jointly optimizing pedestrian detection and semantic segmentation tasks. The final detections are obtained by integrating the outputs from different modalities as well as the two stages. The approach significantly outperforms state-of-the-art methods on the KAIST dataset while remain fast. Additionally, we contribute a sanitized version of training annotations for the KAIST dataset, and examine the effects caused by different kinds of annotation errors. Future research of this problem will benefit from the sanitized version which eliminates the interference of annotation errors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.04818v1-abstract-full').style.display = 'none'; document.getElementById('1808.04818v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">British Machine Vision Conference (BMVC) 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1803.05347">arXiv:1803.05347</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1803.05347">pdf</a>, <a href="https://arxiv.org/format/1803.05347">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Illumination-aware Faster R-CNN for Robust Multispectral Pedestrian Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+D">Dan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R">Ruofeng Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Min Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1803.05347v2-abstract-short" style="display: inline;"> Multispectral images of color-thermal pairs have shown more effective than a single color channel for pedestrian detection, especially under challenging illumination conditions. However, there is still a lack of studies on how to fuse the two modalities effectively. In this paper, we deeply compare six different convolutional network fusion architectures and analyse their adaptations, enabling a v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1803.05347v2-abstract-full').style.display = 'inline'; document.getElementById('1803.05347v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1803.05347v2-abstract-full" style="display: none;"> Multispectral images of color-thermal pairs have shown more effective than a single color channel for pedestrian detection, especially under challenging illumination conditions. However, there is still a lack of studies on how to fuse the two modalities effectively. In this paper, we deeply compare six different convolutional network fusion architectures and analyse their adaptations, enabling a vanilla architecture to obtain detection performances comparable to the state-of-the-art results. Further, we discover that pedestrian detection confidences from color or thermal images are correlated with illumination conditions. With this in mind, we propose an Illumination-aware Faster R-CNN (IAF RCNN). Specifically, an Illumination-aware Network is introduced to give an illumination measure of the input image. Then we adaptively merge color and thermal sub-networks via a gate function defined over the illumination value. The experimental results on KAIST Multispectral Pedestrian Benchmark validate the effectiveness of the proposed IAF R-CNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1803.05347v2-abstract-full').style.display = 'none'; document.getElementById('1803.05347v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for Publication in Pattern Recognition</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1304.3113">arXiv:1304.3113</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1304.3113">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A General Purpose Inference Engine for Evidential Reasoning Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+M">Richard M. Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Appelbaum%2C+L+A">Lee A. Appelbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Shapiro%2C+D+G">D. G. Shapiro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1304.3113v1-abstract-short" style="display: inline;"> The purpose of this paper is to report on the most recent developments in our ongoing investigation of the representation and manipulation of uncertainty in automated reasoning systems. In our earlier studies (Tong and Shapiro, 1985) we described a series of experiments with RUBRIC (Tong et al., 1985), a system for full-text document retrieval, that generated some interesting insights into the eff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.3113v1-abstract-full').style.display = 'inline'; document.getElementById('1304.3113v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1304.3113v1-abstract-full" style="display: none;"> The purpose of this paper is to report on the most recent developments in our ongoing investigation of the representation and manipulation of uncertainty in automated reasoning systems. In our earlier studies (Tong and Shapiro, 1985) we described a series of experiments with RUBRIC (Tong et al., 1985), a system for full-text document retrieval, that generated some interesting insights into the effects of choosing among a class of scalar valued uncertainty calculi. [n order to extend these results we have begun a new series of experiments with a larger class of representations and calculi, and to help perform these experiments we have developed a general purpose inference engine. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.3113v1-abstract-full').style.display = 'none'; document.getElementById('1304.3113v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2013. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appears in Proceedings of the Second Conference on Uncertainty in Artificial Intelligence (UAI1986)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> UAI-P-1986-PG-297-302 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1304.2746">arXiv:1304.2746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1304.2746">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Problem Structure and Evidential Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+M">Richard M. Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Appelbaum%2C+L+A">Lee A. Appelbaum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1304.2746v1-abstract-short" style="display: inline;"> In our previous series of studies to investigate the role of evidential reasoning in the RUBRIC system for full-text document retrieval (Tong et al., 1985; Tong and Shapiro, 1985; Tong and Appelbaum, 1987), we identified the important role that problem structure plays in the overall performance of the system. In this paper, we focus on these structural elements (which we now call &#34;semantic structu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.2746v1-abstract-full').style.display = 'inline'; document.getElementById('1304.2746v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1304.2746v1-abstract-full" style="display: none;"> In our previous series of studies to investigate the role of evidential reasoning in the RUBRIC system for full-text document retrieval (Tong et al., 1985; Tong and Shapiro, 1985; Tong and Appelbaum, 1987), we identified the important role that problem structure plays in the overall performance of the system. In this paper, we focus on these structural elements (which we now call &#34;semantic structure&#34;) and show how explicit consideration of their properties reduces what previously were seen as difficult evidential reasoning problems to more tractable questions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.2746v1-abstract-full').style.display = 'none'; document.getElementById('1304.2746v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2013. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appears in Proceedings of the Third Conference on Uncertainty in Artificial Intelligence (UAI1987)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> UAI-P-1987-PG-313-320 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1304.1128">arXiv:1304.1128</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1304.1128">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An Architecture for Probabilistic Concept-Based Information Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fung%2C+R">Robert Fung</a>, <a href="/search/cs?searchtype=author&amp;query=Crawford%2C+S+L">S. L. Crawford</a>, <a href="/search/cs?searchtype=author&amp;query=Appelbaum%2C+L+A">Lee A. Appelbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+R+M">Richard M. Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1304.1128v1-abstract-short" style="display: inline;"> While concept-based methods for information retrieval can provide improved performance over more conventional techniques, they require large amounts of effort to acquire the concepts and their qualitative and quantitative relationships. This paper discusses an architecture for probabilistic concept-based information retrieval which addresses the knowledge acquisition problem. The architecture make&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.1128v1-abstract-full').style.display = 'inline'; document.getElementById('1304.1128v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1304.1128v1-abstract-full" style="display: none;"> While concept-based methods for information retrieval can provide improved performance over more conventional techniques, they require large amounts of effort to acquire the concepts and their qualitative and quantitative relationships. This paper discusses an architecture for probabilistic concept-based information retrieval which addresses the knowledge acquisition problem. The architecture makes use of the probabilistic networks technology for representing and reasoning about concepts and includes a knowledge acquisition component which partially automates the construction of concept knowledge bases from data. We describe two experiments that apply the architecture to the task of retrieving documents about terrorism from a set of documents from the Reuters news service. The experiments provide positive evidence that the architecture design is feasible and that there are advantages to concept-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1304.1128v1-abstract-full').style.display = 'none'; document.getElementById('1304.1128v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2013. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appears in Proceedings of the Sixth Conference on Uncertainty in Artificial Intelligence (UAI1990)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> UAI-P-1990-PG-392-404 </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10