CINXE.COM
Image and Video Processing
<!DOCTYPE html> <html lang="en"> <head> <title>Image and Video Processing </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/eess.IV/recent">eess.IV</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Image and Video Processing</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item12">Cross-lists</a></li> <li><a href="#item14">Replacements</a></li> </ul> <p>See <a id="recent-eess.IV" aria-labelledby="recent-eess.IV" href="/list/eess.IV/recent">recent</a> articles</p> <h3>Showing new listings for Friday, 21 March 2025</h3> <div class='paging'>Total of 20 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/eess.IV/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 11 of 11 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.15555" title="Abstract" id="2503.15555"> arXiv:2503.15555 </a> [<a href="/pdf/2503.15555" title="Download PDF" id="pdf-2503.15555" aria-labelledby="pdf-2503.15555">pdf</a>, <a href="https://arxiv.org/html/2503.15555v1" title="View HTML" id="html-2503.15555" aria-labelledby="html-2503.15555" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15555" title="Other formats" id="oth-2503.15555" aria-labelledby="oth-2503.15555">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Whole-Body Image-to-Image Translation for a Virtual Scanner in a Healthcare Digital Twin </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Guarrasi,+V">Valerio Guarrasi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Di+Feola,+F">Francesco Di Feola</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Restivo,+R">Rebecca Restivo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tronchin,+L">Lorenzo Tronchin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Soda,+P">Paolo Soda</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Generating positron emission tomography (PET) images from computed tomography (CT) scans via deep learning offers a promising pathway to reduce radiation exposure and costs associated with PET imaging, improving patient care and accessibility to functional imaging. Whole-body image translation presents challenges due to anatomical heterogeneity, often limiting generalized models. We propose a framework that segments whole-body CT images into four regions-head, trunk, arms, and legs-and uses district-specific Generative Adversarial Networks (GANs) for tailored CT-to-PET translation. Synthetic PET images from each region are stitched together to reconstruct the whole-body scan. Comparisons with a baseline non-segmented GAN and experiments with Pix2Pix and CycleGAN architectures tested paired and unpaired scenarios. Quantitative evaluations at district, whole-body, and lesion levels demonstrated significant improvements with our district-specific GANs. Pix2Pix yielded superior metrics, ensuring precise, high-quality image synthesis. By addressing anatomical heterogeneity, this approach achieves state-of-the-art results in whole-body CT-to-PET translation. This methodology supports healthcare Digital Twins by enabling accurate virtual PET scans from CT data, creating virtual imaging representations to monitor, predict, and optimize health outcomes. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.15861" title="Abstract" id="2503.15861"> arXiv:2503.15861 </a> [<a href="/pdf/2503.15861" title="Download PDF" id="pdf-2503.15861" aria-labelledby="pdf-2503.15861">pdf</a>, <a href="https://arxiv.org/html/2503.15861v1" title="View HTML" id="html-2503.15861" aria-labelledby="html-2503.15861" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15861" title="Other formats" id="oth-2503.15861" aria-labelledby="oth-2503.15861">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sequential Spatial-Temporal Network for Interpretable Automatic Ultrasonic Assessment of Fetal Head during labor </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Gan,+J">Jie Gan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liang,+Z">Zhuonan Liang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Fan,+J">Jianan Fan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Mcguire,+L">Lisa Mcguire</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Watson,+C">Caterina Watson</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Spurway,+J">Jacqueline Spurway</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Clarke,+J">Jillian Clarke</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cai,+W">Weidong Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This work has been accepted to 2025 IEEE 22nd International Symposium on Biomedical Imaging (ISBI) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The intrapartum ultrasound guideline established by ISUOG highlights the Angle of Progression (AoP) and Head Symphysis Distance (HSD) as pivotal metrics for assessing fetal head descent and predicting delivery outcomes. Accurate measurement of the AoP and HSD requires a structured process. This begins with identifying standardized ultrasound planes, followed by the detection of specific anatomical landmarks within the regions of the pubic symphysis and fetal head that correlate with the delivery parameters AoP and HSD. Finally, these measurements are derived based on the identified anatomical landmarks. Addressing the clinical demands and standard operation process outlined in the ISUOG guideline, we introduce the Sequential Spatial-Temporal Network (SSTN), the first interpretable model specifically designed for the video of intrapartum ultrasound analysis. The SSTN operates by first identifying ultrasound planes, then segmenting anatomical structures such as the pubic symphysis and fetal head, and finally detecting key landmarks for precise measurement of HSD and AoP. Furthermore, the cohesive framework leverages task-related information to improve accuracy and reliability. Experimental evaluations on clinical datasets demonstrate that SSTN significantly surpasses existing models, reducing the mean absolute error by 18% for AoP and 22% for HSD. </p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2503.16010" title="Abstract" id="2503.16010"> arXiv:2503.16010 </a> [<a href="/pdf/2503.16010" title="Download PDF" id="pdf-2503.16010" aria-labelledby="pdf-2503.16010">pdf</a>, <a href="https://arxiv.org/html/2503.16010v1" title="View HTML" id="html-2503.16010" aria-labelledby="html-2503.16010" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16010" title="Other formats" id="oth-2503.16010" aria-labelledby="oth-2503.16010">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Patch-based learning of adaptive Total Variation parameter maps for blind image denoising </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fantasia,+C">Claudio Fantasia</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Calatroni,+L">Luca Calatroni</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Descombes,+X">Xavier Descombes</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Rekik,+R">Rim Rekik</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Machine Learning (cs.LG); Numerical Analysis (math.NA) </div> <p class='mathjax'> We consider a patch-based learning approach defined in terms of neural networks to estimate spatially adaptive regularisation parameter maps for image denoising with weighted Total Variation and test it to situations when the noise distribution is unknown. As an example, we consider situations where noise could be either Gaussian or Poisson and perform preliminary model selection by a standard binary classification network. Then, we define a patch-based approach where at each image pixel an optimal weighting between TV regularisation and the corresponding data fidelity is learned in a supervised way using reference natural image patches upon optimisation of SSIM and in a sliding window fashion. Extensive numerical results are reported for both noise models, showing significant improvement w.r.t. results obtained by means of optimal scalar regularisation. </p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2503.16055" title="Abstract" id="2503.16055"> arXiv:2503.16055 </a> [<a href="/pdf/2503.16055" title="Download PDF" id="pdf-2503.16055" aria-labelledby="pdf-2503.16055">pdf</a>, <a href="https://arxiv.org/html/2503.16055v1" title="View HTML" id="html-2503.16055" aria-labelledby="html-2503.16055" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16055" title="Other formats" id="oth-2503.16055" aria-labelledby="oth-2503.16055">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SALT: Singular Value Adaptation with Low-Rank Transformation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Elsayed,+A">Abdelrahman Elsayed</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Hashmi,+S">Sarim Hashmi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Elseiagy,+M">Mohammed Elseiagy</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+H">Hu Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yaqub,+M">Mohammad Yaqub</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Almakky,+I">Ibrahim Almakky</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The complex nature of medical image segmentation calls for models that are specifically designed to capture detailed, domain-specific features. Large foundation models offer considerable flexibility, yet the cost of fine-tuning these models remains a significant barrier. Parameter-Efficient Fine-Tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), efficiently update model weights with low-rank matrices but may suffer from underfitting when the chosen rank is insufficient to capture domain-specific nuances. Conversely, full-rank Singular Value Decomposition (SVD) based methods provide comprehensive updates by modifying all singular values, yet they often lack flexibility and exhibit variable performance across datasets. We propose SALT (Singular Value Adaptation with Low-Rank Transformation), a method that selectively adapts the most influential singular values using trainable scale and shift parameters while complementing this with a low-rank update for the remaining subspace. This hybrid approach harnesses the advantages of both LoRA and SVD, enabling effective adaptation without relying on increasing model size or depth. Evaluated on 5 challenging medical datasets, ranging from as few as 20 samples to 1000, SALT outperforms state-of-the-art PEFT (LoRA and SVD) by 2% to 5% in Dice with only 3.9% trainable parameters, demonstrating robust adaptation even in low-resource settings. The code for SALT is available at: <a href="https://github.com/BioMedIA-MBZUAI/SALT" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2503.16075" title="Abstract" id="2503.16075"> arXiv:2503.16075 </a> [<a href="/pdf/2503.16075" title="Download PDF" id="pdf-2503.16075" aria-labelledby="pdf-2503.16075">pdf</a>, <a href="https://arxiv.org/html/2503.16075v1" title="View HTML" id="html-2503.16075" aria-labelledby="html-2503.16075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16075" title="Other formats" id="oth-2503.16075" aria-labelledby="oth-2503.16075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 3-D Image-to-Image Fusion in Lightsheet Microscopy by Two-Step Adversarial Network: Contribution to the FuseMyCells Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Wodzinski,+M">Marek Wodzinski</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=M%C3%BCller,+H">Henning M眉ller</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Lightsheet microscopy is a powerful 3-D imaging technique that addresses limitations of traditional optical and confocal microscopy but suffers from a low penetration depth and reduced image quality at greater depths. Multiview lightsheet microscopy improves 3-D resolution by combining multiple views but simultaneously increasing the complexity and the photon budget, leading to potential photobleaching and phototoxicity. The FuseMyCells challenge, organized in conjunction with the IEEE ISBI 2025 conference, aims to benchmark deep learning-based solutions for fusing high-quality 3-D volumes from single 3-D views, potentially simplifying procedures and conserving the photon budget. In this work, we propose a contribution to the FuseMyCells challenge based on a two-step procedure. The first step processes a downsampled version of the image to capture the entire region of interest, while the second step uses a patch-based approach for high-resolution inference, incorporating adversarial loss to enhance visual outcomes. This method addresses challenges related to high data resolution, the necessity of global context, and the preservation of high-frequency details. Experimental results demonstrate the effectiveness of our approach, highlighting its potential to improve 3-D image fusion quality and extend the capabilities of lightsheet microscopy. The average SSIM for the nucleus and membranes is greater than 0.85 and 0.91, respectively. </p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2503.16149" title="Abstract" id="2503.16149"> arXiv:2503.16149 </a> [<a href="/pdf/2503.16149" title="Download PDF" id="pdf-2503.16149" aria-labelledby="pdf-2503.16149">pdf</a>, <a href="https://arxiv.org/html/2503.16149v1" title="View HTML" id="html-2503.16149" aria-labelledby="html-2503.16149" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16149" title="Other formats" id="oth-2503.16149" aria-labelledby="oth-2503.16149">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Selective Complementary Feature Fusion and Modal Feature Compression Interaction for Brain Tumor Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+D">Dong Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhao,+B">Boyue Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+Y">Yi Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhao,+M">Meng Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Efficient modal feature fusion strategy is the key to achieve accurate segmentation of brain glioma. However, due to the specificity of different MRI modes, it is difficult to carry out cross-modal fusion with large differences in modal features, resulting in the model ignoring rich feature information. On the other hand, the problem of multi-modal feature redundancy interaction occurs in parallel networks due to the proliferation of feature dimensions, further increase the difficulty of multi-modal feature fusion at the bottom end. In order to solve the above problems, we propose a noval complementary feature compression interaction network (CFCI-Net), which realizes the complementary fusion and compression interaction of multi-modal feature information with an efficient mode fusion strategy. Firstly, we propose a selective complementary feature fusion (SCFF) module, which adaptively fuses rich cross-modal feature information by complementary soft selection weights. Secondly, a modal feature compression interaction (MFCI) transformer is proposed to deal with the multi-mode fusion redundancy problem when the feature dimension surges. The MFCI transformer is composed of modal feature compression (MFC) and modal feature interaction (MFI) to realize redundancy feature compression and multi-mode feature interactive learning. %In MFI, we propose a hierarchical interactive attention mechanism based on multi-head attention. Evaluations on the BraTS2019 and BraTS2020 datasets demonstrate that CFCI-Net achieves superior results compared to state-of-the-art models. Code: <a href="https://github.com/CDmm0/CFCI-Net" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2503.16264" title="Abstract" id="2503.16264"> arXiv:2503.16264 </a> [<a href="/pdf/2503.16264" title="Download PDF" id="pdf-2503.16264" aria-labelledby="pdf-2503.16264">pdf</a>, <a href="https://arxiv.org/html/2503.16264v1" title="View HTML" id="html-2503.16264" aria-labelledby="html-2503.16264" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16264" title="Other formats" id="oth-2503.16264" aria-labelledby="oth-2503.16264">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do image and video quality metrics model low-level human vision? </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Hammou,+D">Dounia Hammou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cai,+Y">Yancheng Cai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Madhusudanarao,+P">Pavan Madhusudanarao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bampis,+C+G">Christos G. Bampis</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Mantiuk,+R+K">Rafa艂 K. Mantiuk</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM) </div> <p class='mathjax'> Image and video quality metrics, such as SSIM, LPIPS, and VMAF, are aimed to predict the perceived quality of the evaluated content and are often claimed to be "perceptual". Yet, few metrics directly model human visual perception, and most rely on hand-crafted formulas or training datasets to achieve alignment with perceptual data. In this paper, we propose a set of tests for full-reference quality metrics that examine their ability to model several aspects of low-level human vision: contrast sensitivity, contrast masking, and contrast matching. The tests are meant to provide additional scrutiny for newly proposed metrics. We use our tests to analyze 33 existing image and video quality metrics and find their strengths and weaknesses, such as the ability of LPIPS and MS-SSIM to predict contrast masking and poor performance of VMAF in this task. We further find that the popular SSIM metric overemphasizes differences in high spatial frequencies, but its multi-scale counterpart, MS-SSIM, addresses this shortcoming. Such findings cannot be easily made using existing evaluation protocols. </p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2503.16288" title="Abstract" id="2503.16288"> arXiv:2503.16288 </a> [<a href="/pdf/2503.16288" title="Download PDF" id="pdf-2503.16288" aria-labelledby="pdf-2503.16288">pdf</a>, <a href="https://arxiv.org/html/2503.16288v1" title="View HTML" id="html-2503.16288" aria-labelledby="html-2503.16288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16288" title="Other formats" id="oth-2503.16288" aria-labelledby="oth-2503.16288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Overview of Variable Rate Coding in JPEG AI </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Jia,+P">Panqi Jia</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Brand,+F">Fabian Brand</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yu,+D">Dequan Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Karabutov,+A">Alexander Karabutov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Alshina,+E">Elena Alshina</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kaup,+A">Andre Kaup</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span> </div> <p class='mathjax'> Empirical evidence has demonstrated that learning-based image compression can outperform classical compression frameworks. This has led to the ongoing standardization of learned-based image codecs, namely Joint Photographic Experts Group (JPEG) AI. The objective of JPEG AI is to enhance compression efficiency and provide a software and hardwarefriendly solution. Based on our research, JPEG AI represents the first standardization that can facilitate the implementation of a learned image codec on a mobile device. This article presents an overview of the variable rate coding functionality in JPEG AI, which includes three variable rate adaptations: a threedimensional quality map, a fast bit rate matching algorithm, and a training strategy. The variable rate adaptations offer a continuous rate function up to 2.0 bpp, exhibiting a high level of performance, a flexible bit allocation between different color components, and a region of interest function for the specified use case. The evaluation of performance encompasses both objective and subjective results. With regard to the objective bit rate matching, the main profile with low complexity yielded a 13.1% BD-rate gain over VVC intra, while the high profile with high complexity achieved a 19.2% BD-rate gain over VVC intra. The BD-rate result is calculated as the mean of the seven perceptual metrics defined in the JPEG AI common test conditions. With respect to subjective results, the example of improving the quality of the region of interest is illustrated. </p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2503.16298" title="Abstract" id="2503.16298"> arXiv:2503.16298 </a> [<a href="/pdf/2503.16298" title="Download PDF" id="pdf-2503.16298" aria-labelledby="pdf-2503.16298">pdf</a>, <a href="https://arxiv.org/html/2503.16298v1" title="View HTML" id="html-2503.16298" aria-labelledby="html-2503.16298" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16298" title="Other formats" id="oth-2503.16298" aria-labelledby="oth-2503.16298">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hyperspectral Unmixing using Iterative, Sparse and Ensambling Approaches for Large Spectral Libraries Applied to Soils and Minerals </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Preston,+J">Jade Preston</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Basener,+W">William Basener</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Data Analysis, Statistics and Probability (physics.data-an) </div> <p class='mathjax'> Unmixing is a fundamental process in hyperspectral image processing in which the materials present in a mixed pixel are determined based on the spectra of candidate materials and the pixel spectrum. Practical and general utility requires a large spectral library with sample measurements covering the full variation in each candidate material as well as a sufficiently varied collection of potential materials. However, any spectral library with more spectra than bands will lead to an ill-posed inversion problem when using classical least-squares regression-based unmixing methods. Moreover, for numerical and dimensionality reasons, libraries with over 10 or 20 spectra behave computationally as though they are ill-posed. In current practice, unmixing is often applied to imagery using manually-selected materials or image endmembers. General unmixing of a spectrum from an unknown material with a large spectral library requires some form of sparse regression; regression where only a small number of coefficients are nonzero. This requires a trade-off between goodness-of-fit and model size. In this study we compare variations of two sparse regression techniques, focusing on the relationship between structure and chemistry of materials and the accuracy of the various models for identifying the correct mixture of materials present. Specifically, we examine LASSO regression and ElasticNet in contrast with variations of iterative feature selection, Bayesian Model Averaging (BMA), and quadratic BMA (BMA-Q) -- incorporating LASSO regression and ElasticNet as their base model. To evaluate the the effectiveness of these methods, we consider the molecular composition similarities and differences of substances selected in the models compared to the ground truth. </p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2503.16309" title="Abstract" id="2503.16309"> arXiv:2503.16309 </a> [<a href="/pdf/2503.16309" title="Download PDF" id="pdf-2503.16309" aria-labelledby="pdf-2503.16309">pdf</a>, <a href="https://arxiv.org/html/2503.16309v1" title="View HTML" id="html-2503.16309" aria-labelledby="html-2503.16309" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16309" title="Other formats" id="oth-2503.16309" aria-labelledby="oth-2503.16309">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rapid patient-specific neural networks for intraoperative X-ray to volume registration </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Gopalakrishnan,+V">Vivek Gopalakrishnan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dey,+N">Neel Dey</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chlorogiannis,+D">David-Dimitris Chlorogiannis</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Abumoussa,+A">Andrew Abumoussa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Larson,+A+M">Anna M. Larson</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Orbach,+D+B">Darren B. Orbach</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Frisken,+S">Sarah Frisken</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Golland,+P">Polina Golland</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Medical Physics (physics.med-ph) </div> <p class='mathjax'> The integration of artificial intelligence in image-guided interventions holds transformative potential, promising to extract 3D geometric and quantitative information from conventional 2D imaging modalities during complex procedures. Achieving this requires the rapid and precise alignment of 2D intraoperative images (e.g., X-ray) with 3D preoperative volumes (e.g., CT, MRI). However, current 2D/3D registration methods fail across the broad spectrum of procedures dependent on X-ray guidance: traditional optimization techniques require custom parameter tuning for each subject, whereas neural networks trained on small datasets do not generalize to new patients or require labor-intensive manual annotations, increasing clinical burden and precluding application to new anatomical targets. To address these challenges, we present xvr, a fully automated framework for training patient-specific neural networks for 2D/3D registration. xvr uses physics-based simulation to generate abundant high-quality training data from a patient's own preoperative volumetric imaging, thereby overcoming the inherently limited ability of supervised models to generalize to new patients and procedures. Furthermore, xvr requires only 5 minutes of training per patient, making it suitable for emergency interventions as well as planned procedures. We perform the largest evaluation of a 2D/3D registration algorithm on real X-ray data to date and find that xvr robustly generalizes across a diverse dataset comprising multiple anatomical structures, imaging modalities, and hospitals. Across surgical tasks, xvr achieves submillimeter-accurate registration at intraoperative speeds, improving upon existing methods by an order of magnitude. xvr is released as open-source software freely available at <a href="https://github.com/eigenvivek/xvr" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2503.16389" title="Abstract" id="2503.16389"> arXiv:2503.16389 </a> [<a href="/pdf/2503.16389" title="Download PDF" id="pdf-2503.16389" aria-labelledby="pdf-2503.16389">pdf</a>, <a href="https://arxiv.org/html/2503.16389v1" title="View HTML" id="html-2503.16389" aria-labelledby="html-2503.16389" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16389" title="Other formats" id="oth-2503.16389" aria-labelledby="oth-2503.16389">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Attentional Triple-Encoder Network in Spatiospectral Domains for Medical Image Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Qi,+K">Kristin Qi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Di,+X">Xinhan Di</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> IEEE Conference on Artificial Intelligence (IEEE CAI) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Retinal Optical Coherence Tomography (OCT) segmentation is essential for diagnosing pathology. Traditional methods focus on either spatial or spectral domains, overlooking their combined dependencies. We propose a triple-encoder network that integrates CNNs for spatial features, Fast Fourier Convolution (FFC) for spectral features, and attention mechanisms to capture global relationships across both domains. Attention fusion modules integrate convolution and cross-attention to further enhance features. Our method achieves an average Dice score improvement from 0.855 to 0.864, outperforming prior work. </p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 2 of 2 entries)</h3> <dt> <a name='item12'>[12]</a> <a href ="/abs/2503.15984" title="Abstract" id="2503.15984"> arXiv:2503.15984 </a> (cross-list from cs.CV) [<a href="/pdf/2503.15984" title="Download PDF" id="pdf-2503.15984" aria-labelledby="pdf-2503.15984">pdf</a>, <a href="https://arxiv.org/html/2503.15984v1" title="View HTML" id="html-2503.15984" aria-labelledby="html-2503.15984" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15984" title="Other formats" id="oth-2503.15984" aria-labelledby="oth-2503.15984">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DIPLI: Deep Image Prior Lucky Imaging for Blind Astronomical Image Restoration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+S">Suraj Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Batsheva,+A">Anastasia Batsheva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rogov,+O+Y">Oleg Y. Rogov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouridane,+A">Ahmed Bouridane</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 7 figures, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Instrumentation and Methods for Astrophysics (astro-ph.IM); Artificial Intelligence (cs.AI); Image and Video Processing (eess.IV) </div> <p class='mathjax'> Contemporary image restoration and super-resolution techniques effectively harness deep neural networks, markedly outperforming traditional methods. However, astrophotography presents unique challenges for deep learning due to limited training data. This work explores hybrid strategies, such as the Deep Image Prior (DIP) model, which facilitates blind training but is susceptible to overfitting, artifact generation, and instability when handling noisy images. We propose enhancements to the DIP model's baseline performance through several advanced techniques. First, we refine the model to process multiple frames concurrently, employing the Back Projection method and the TVNet model. Next, we adopt a Markov approach incorporating Monte Carlo estimation, Langevin dynamics, and a variational input technique to achieve unbiased estimates with minimal variance and counteract overfitting effectively. Collectively, these modifications reduce the likelihood of noise learning and mitigate loss function fluctuations during training, enhancing result stability. We validated our algorithm across multiple image sets of astronomical and celestial objects, achieving performance that not only mitigates limitations of Lucky Imaging, a classical computer vision technique that remains a standard in astronomical image reconstruction but surpasses the original DIP model, state of the art transformer- and diffusion-based models, underscoring the significance of our improvements. </p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2503.16302" title="Abstract" id="2503.16302"> arXiv:2503.16302 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16302" title="Download PDF" id="pdf-2503.16302" aria-labelledby="pdf-2503.16302">pdf</a>, <a href="https://arxiv.org/html/2503.16302v1" title="View HTML" id="html-2503.16302" aria-labelledby="html-2503.16302" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16302" title="Other formats" id="oth-2503.16302" aria-labelledby="oth-2503.16302">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unleashing Vecset Diffusion Model for Fast Shape Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+Z">Zeqiang Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yunfei Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Z">Zibo Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haolin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+F">Fuyun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+H">Huiwen Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xianghui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qinxiang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jinwei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jie Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+C">Chunchao Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yue,+X">Xiangyu Yue</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Technical report </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Image and Video Processing (eess.IV) </div> <p class='mathjax'> 3D shape generation has greatly flourished through the development of so-called "native" 3D diffusion, particularly through the Vecset Diffusion Model (VDM). While recent advancements have shown promising results in generating high-resolution 3D shapes, VDM still struggles with high-speed generation. Challenges exist because of difficulties not only in accelerating diffusion sampling but also VAE decoding in VDM, areas under-explored in previous works. To address these challenges, we present FlashVDM, a systematic framework for accelerating both VAE and DiT in VDM. For DiT, FlashVDM enables flexible diffusion sampling with as few as 5 inference steps and comparable quality, which is made possible by stabilizing consistency distillation with our newly introduced Progressive Flow Distillation. For VAE, we introduce a lightning vecset decoder equipped with Adaptive KV Selection, Hierarchical Volume Decoding, and Efficient Network Design. By exploiting the locality of the vecset and the sparsity of shape surface in the volume, our decoder drastically lowers FLOPs, minimizing the overall decoding overhead. We apply FlashVDM to Hunyuan3D-2 to obtain Hunyuan3D-2 Turbo. Through systematic evaluation, we show that our model significantly outperforms existing fast 3D generation methods, achieving comparable performance to the state-of-the-art while reducing inference time by over 45x for reconstruction and 32x for generation. Code and models are available at <a href="https://github.com/Tencent/FlashVDM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 7 of 7 entries)</h3> <dt> <a name='item14'>[14]</a> <a href ="/abs/2209.12075" title="Abstract" id="2209.12075"> arXiv:2209.12075 </a> (replaced) [<a href="/pdf/2209.12075" title="Download PDF" id="pdf-2209.12075" aria-labelledby="pdf-2209.12075">pdf</a>, <a href="https://arxiv.org/html/2209.12075v3" title="View HTML" id="html-2209.12075" aria-labelledby="html-2209.12075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2209.12075" title="Other formats" id="oth-2209.12075" aria-labelledby="oth-2209.12075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> S^2-Transformer for Mask-Aware Hyperspectral Image Reconstruction </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+J">Jiamian Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+K">Kunpeng Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+Y">Yulun Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yuan,+X">Xin Yuan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tao,+Z">Zhiqiang Tao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by TPAMI </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Snapshot compressive imaging (SCI) surges as a novel way of capturing hyperspectral images. It operates an optical encoder to compress the 3D data into a 2D measurement and adopts a software decoder for the signal reconstruction. Recently, a representative SCI set-up of coded aperture snapshot compressive imager (CASSI) with Transformer reconstruction backend remarks high-fidelity sensing performance. However, dominant spatial and spectral attention designs show limitations in hyperspectral modeling. The spatial attention values describe the inter-pixel correlation but overlook the across-spectra variation within each pixel. The spectral attention size is unscalable to the token spatial size and thus bottlenecks information allocation. Besides, CASSI entangles the spatial and spectral information into a 2D measurement, placing a barrier for information disentanglement and modeling. In addition, CASSI blocks the light with a physical binary mask, yielding the masked data loss. To tackle above challenges, we propose a spatial-spectral (S2-) Transformer implemented by a paralleled attention design and a mask-aware learning strategy. Firstly, we systematically explore pros and cons of different spatial (-spectral) attention designs, based on which we find performing both attentions in parallel well disentangles and models the blended information. Secondly, the masked pixels induce higher prediction difficulty and should be treated differently from unmasked ones. We adaptively prioritize the loss penalty attributing to the mask structure by referring to the mask-encoded prediction as an uncertainty estimator. We theoretically discuss the distinct convergence tendencies between masked/unmasked regions of the proposed learning strategy. Extensive experiments demonstrate that on average, the results of the proposed method are superior over the state-of-the-art method. </p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2403.05906" title="Abstract" id="2403.05906"> arXiv:2403.05906 </a> (replaced) [<a href="/pdf/2403.05906" title="Download PDF" id="pdf-2403.05906" aria-labelledby="pdf-2403.05906">pdf</a>, <a href="https://arxiv.org/html/2403.05906v2" title="View HTML" id="html-2403.05906" aria-labelledby="html-2403.05906" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.05906" title="Other formats" id="oth-2403.05906" aria-labelledby="oth-2403.05906">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Segmentation Guided Sparse Transformer for Under-Display Camera Image Restoration </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Xue,+J">Jingyun Xue</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+T">Tao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dai,+P">Pengwen Dai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+K">Kaihao Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 10 figures, conference or other essential info </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Under-Display Camera (UDC) is an emerging technology that achieves full-screen display via hiding the camera under the display panel. However, the current implementation of UDC causes serious degradation. The incident light required for camera imaging undergoes attenuation and diffraction when passing through the display panel, leading to various artifacts in UDC imaging. Presently, the prevailing UDC image restoration methods predominantly utilize convolutional neural network architectures, whereas Transformer-based methods have exhibited superior performance in the majority of image restoration tasks. This is attributed to the Transformer's capability to sample global features for the local reconstruction of images, thereby achieving high-quality image restoration. In this paper, we observe that when using the Vision Transformer for UDC degraded image restoration, the global attention samples a large amount of redundant information and noise. Furthermore, compared to the ordinary Transformer employing dense attention, the Transformer utilizing sparse attention can alleviate the adverse impact of redundant information and noise. Building upon this discovery, we propose a Segmentation Guided Sparse Transformer method (SGSFormer) for the task of restoring high-quality images from UDC degraded images. Specifically, we utilize sparse self-attention to filter out redundant information and noise, directing the model's attention to focus on the features more relevant to the degraded regions in need of reconstruction. Moreover, we integrate the instance segmentation map as prior information to guide the sparse self-attention in filtering and focusing on the correct regions. </p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.14523" title="Abstract" id="2503.14523"> arXiv:2503.14523 </a> (replaced) [<a href="/pdf/2503.14523" title="Download PDF" id="pdf-2503.14523" aria-labelledby="pdf-2503.14523">pdf</a>, <a href="https://arxiv.org/html/2503.14523v2" title="View HTML" id="html-2503.14523" aria-labelledby="html-2503.14523" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14523" title="Other formats" id="oth-2503.14523" aria-labelledby="oth-2503.14523">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SDF-TopoNet: A Two-Stage Framework for Tubular Structure Segmentation via SDF Pre-training and Topology-Aware Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Wu,+S">Siyi Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhao,+L">Leyi Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ma,+H">Haotian Ma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Song,+X">Xinyuan Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Accurate segmentation of tubular and curvilinear structures, such as blood vessels, neurons, and road networks, is crucial in various applications. A key challenge is ensuring topological correctness while maintaining computational efficiency. Existing approaches often employ topological loss functions based on persistent homology, such as Betti error, to enforce structural consistency. However, these methods suffer from high computational costs and are insensitive to pixel-level accuracy, often requiring additional loss terms like Dice or MSE to compensate. To address these limitations, we propose \textbf{SDF-TopoNet}, an improved topology-aware segmentation framework that enhances both segmentation accuracy and training efficiency. Our approach introduces a novel two-stage training strategy. In the pre-training phase, we utilize the signed distance function (SDF) as an auxiliary learning target, allowing the model to encode topological information without directly relying on computationally expensive topological loss functions. In the fine-tuning phase, we incorporate a dynamic adapter alongside a refined topological loss to ensure topological correctness while mitigating overfitting and computational overhead. We evaluate our method on five benchmark datasets. Experimental results demonstrate that SDF-TopoNet outperforms existing methods in both topological accuracy and quantitative segmentation metrics, while significantly reducing training complexity. </p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.14538" title="Abstract" id="2503.14538"> arXiv:2503.14538 </a> (replaced) [<a href="/pdf/2503.14538" title="Download PDF" id="pdf-2503.14538" aria-labelledby="pdf-2503.14538">pdf</a>, <a href="https://arxiv.org/html/2503.14538v2" title="View HTML" id="html-2503.14538" aria-labelledby="html-2503.14538" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14538" title="Other formats" id="oth-2503.14538" aria-labelledby="oth-2503.14538">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Vision-Language Models for Acute Tuberculosis Diagnosis: A Multimodal Approach Combining Imaging and Clinical Data </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ganapthy,+A">Ananya Ganapthy</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shastry,+P">Praveen Shastry</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kumarasami,+N">Naveen Kumarasami</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=D,+A">Anandakumar D</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=R,+K">Keerthana R</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=M,+M">Mounigasri M</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=M,+V">Varshinipriya M</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Venkatesh,+K+P">Kishore Prasath Venkatesh</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Subramanian,+B">Bargava Subramanian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Sivasailam,+K">Kalyan Sivasailam</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Background: This study introduces a Vision-Language Model (VLM) leveraging SIGLIP and Gemma-3b architectures for automated acute tuberculosis (TB) screening. By integrating chest X-ray images and clinical notes, the model aims to enhance diagnostic accuracy and efficiency, particularly in resource-limited settings. <br>Methods: The VLM combines visual data from chest X-rays with clinical context to generate detailed, context-aware diagnostic reports. The architecture employs SIGLIP for visual encoding and Gemma-3b for decoding, ensuring effective representation of acute TB-specific pathologies and clinical insights. <br>Results: Key acute TB pathologies, including consolidation, cavities, and nodules, were detected with high precision (97percent) and recall (96percent). The model demonstrated strong spatial localization capabilities and robustness in distinguishing TB-positive cases, making it a reliable tool for acute TB diagnosis. <br>Conclusion: The multimodal capability of the VLM reduces reliance on radiologists, providing a scalable solution for acute TB screening. Future work will focus on improving the detection of subtle pathologies and addressing dataset biases to enhance its generalizability and application in diverse global healthcare settings. </p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2211.14312" title="Abstract" id="2211.14312"> arXiv:2211.14312 </a> (replaced) [<a href="/pdf/2211.14312" title="Download PDF" id="pdf-2211.14312" aria-labelledby="pdf-2211.14312">pdf</a>, <a href="https://arxiv.org/html/2211.14312v4" title="View HTML" id="html-2211.14312" aria-labelledby="html-2211.14312" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2211.14312" title="Other formats" id="oth-2211.14312" aria-labelledby="oth-2211.14312">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Karyotype AI for Precision Oncology </div> <div class='list-authors'><a href="https://arxiv.org/search/q-bio?searchtype=author&query=Shamsi,+Z">Zahra Shamsi</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Bryant,+D">Drew Bryant</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Wilson,+J">Jacob Wilson</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Qu,+X">Xiaoyu Qu</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Dubey,+A">Avinava Dubey</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Kothari,+K">Konik Kothari</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Dehghani,+M">Mostafa Dehghani</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Chavarha,+M">Mariya Chavarha</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Likhosherstov,+V">Valerii Likhosherstov</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Williams,+B">Brian Williams</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Frumkin,+M">Michael Frumkin</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Appelbaum,+F">Fred Appelbaum</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Choromanski,+K">Krzysztof Choromanski</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Bashir,+A">Ali Bashir</a>, <a href="https://arxiv.org/search/q-bio?searchtype=author&query=Fang,+M">Min Fang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Quantitative Methods (q-bio.QM)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Image and Video Processing (eess.IV) </div> <p class='mathjax'> We present a machine learning method capable of accurately detecting chromosome abnormalities that cause blood cancers directly from microscope images of the metaphase stage of cell division. The pipeline is built on a series of fine-tuned Vision Transformers. Current state of the art (and standard clinical practice) requires expensive, manual expert analysis, whereas our pipeline takes only 15 seconds per metaphase image. Using a novel pretraining-finetuning strategy to mitigate the challenge of data scarcity, we achieve a high precision-recall score of 94% AUC for the clinically significant del(5q) and t(9;22) anomalies. Our method also unlocks zero-shot detection of rare aberrations based on model latent embeddings. The ability to quickly, accurately, and scalably diagnose genetic abnormalities directly from metaphase images could transform karyotyping practice and improve patient outcomes. We will make code publicly available. </p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2411.14501" title="Abstract" id="2411.14501"> arXiv:2411.14501 </a> (replaced) [<a href="/pdf/2411.14501" title="Download PDF" id="pdf-2411.14501" aria-labelledby="pdf-2411.14501">pdf</a>, <a href="https://arxiv.org/html/2411.14501v4" title="View HTML" id="html-2411.14501" aria-labelledby="html-2411.14501" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14501" title="Other formats" id="oth-2411.14501" aria-labelledby="oth-2411.14501">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> U-Motion: Learned Point Cloud Video Compression with U-Structured Temporal Context Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+T">Tingyu Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yueyu Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+R">Ran Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yao Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Image and Video Processing (eess.IV) </div> <p class='mathjax'> Point cloud video (PCV) is a versatile 3D representation of dynamic scenes with emerging applications. This paper introduces U-Motion, a learning-based compression scheme for both PCV geometry and attributes. We propose a U-Structured inter-frame prediction framework, U-Inter, which performs explicit motion estimation and compensation (ME/MC) at different scales with varying levels of detail. It integrates Top-Down (Fine-to-Coarse) Motion Propagation, Bottom-Up Motion Predictive Coding and Multi-scale Group Motion Compensation to enable accurate motion estimation and efficient motion compression at each scale. In addition, we design a multi-scale spatial-temporal predictive coding module to capture the cross-scale spatial redundancy remaining after U-Inter prediction. We conduct experiments following the MPEG Common Test Condition for dense dynamic point clouds and demonstrate that U-Motion can achieve significant gains over MPEG G-PCC-GesTM v3.0 and recently published learning-based methods for both geometry and attribute compression. </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.12461" title="Abstract" id="2503.12461"> arXiv:2503.12461 </a> (replaced) [<a href="/pdf/2503.12461" title="Download PDF" id="pdf-2503.12461" aria-labelledby="pdf-2503.12461">pdf</a>, <a href="https://arxiv.org/html/2503.12461v2" title="View HTML" id="html-2503.12461" aria-labelledby="html-2503.12461" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12461" title="Other formats" id="oth-2503.12461" aria-labelledby="oth-2503.12461">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MambaIC: State Space Models for High-Performance Learned Image Compression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+F">Fanhu Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Hao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yihua Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Siyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+L">Ling Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yan Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to CVPR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Image and Video Processing (eess.IV) </div> <p class='mathjax'> A high-performance image compression algorithm is crucial for real-time information transmission across numerous fields. Despite rapid progress in image compression, computational inefficiency and poor redundancy modeling still pose significant bottlenecks, limiting practical applications. Inspired by the effectiveness of state space models (SSMs) in capturing long-range dependencies, we leverage SSMs to address computational inefficiency in existing methods and improve image compression from multiple perspectives. In this paper, we integrate the advantages of SSMs for better efficiency-performance trade-off and propose an enhanced image compression approach through refined context modeling, which we term MambaIC. Specifically, we explore context modeling to adaptively refine the representation of hidden states. Additionally, we introduce window-based local attention into channel-spatial entropy modeling to reduce potential spatial redundancy during compression, thereby increasing efficiency. Comprehensive qualitative and quantitative results validate the effectiveness and efficiency of our approach, particularly for high-resolution image compression. Code is released at <a href="https://github.com/AuroraZengfh/MambaIC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> </dl> <div class='paging'>Total of 20 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/eess.IV/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>