Computer Vision and Pattern Recognition
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Computer Vision and Pattern Recognition</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item66">Cross-lists</a></li> <li><a href="#item94">Replacements</a></li> </ul> <p>See <a id="recent-cs.CV" aria-labelledby="recent-cs.CV" href="/list/cs.CV/recent">recent</a> articles</p> <h3>Showing new listings for Friday, 21 February 2025</h3> <div class='paging'>Total of 144 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CV/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 65 of 65 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.14044" title="Abstract" id="2502.14044"> arXiv:2502.14044 </a> [<a href="/pdf/2502.14044" title="Download PDF" id="pdf-2502.14044" aria-labelledby="pdf-2502.14044">pdf</a>, <a href="https://arxiv.org/html/2502.14044v1" title="View HTML" id="html-2502.14044" aria-labelledby="html-2502.14044" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14044" title="Other formats" id="oth-2502.14044" aria-labelledby="oth-2502.14044">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Cognition and Explainability of Multimodal Foundation Models with Self-Synthesized Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yucheng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Quanzheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+N">Ninghao Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR 2025. 

[2] arXiv:2502.14061 [pdf, html, other]
Title: EfficientPose 6D: Scalable and Efficient 6D Object Pose Estimation
Authors: Zixuan Fang, Thomas Pöllabauer, Tristan Wirth, Sarah Berkei, Volker Knauthe, Arjan Kuijper
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)

In industrial applications requiring real-time feedback, such as quality control and robotic manipulation, the demand for high-speed and accurate pose estimation remains critical. Despite advances improving speed and accuracy in pose estimation, finding a balance between computational efficiency and accuracy poses significant challenges in dynamic environments. Most current algorithms lack scalability in estimation time, especially for diverse datasets, and the state-of-the-art (SOTA) methods are often too slow. This study focuses on developing a fast and scalable set of pose estimators based on GDRNPP to meet or exceed current benchmarks in accuracy and robustness, particularly addressing the efficiency-accuracy trade-off essential in real-time scenarios. We propose the AMIS algorithm to tailor the model used according to an application-specific trade-off between inference time and accuracy. We further show the effectiveness of the AMIS-based model choice on four prominent benchmark datasets (LM-O, YCB-V, T-LESS, and ITODD).
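
The abstract describes choosing a pose estimator according to an application-specific trade-off between inference time and accuracy. The snippet below is a hedged illustration of such a selection rule under a latency budget; the candidate names, latencies, and accuracies are invented for the example and this is not the AMIS algorithm from the paper.

```python
# Hypothetical sketch of selecting a pose estimator under an inference-time budget,
# in the spirit of the trade-off the abstract describes. All numbers are illustrative.
candidates = [
    {"name": "estimator-small",  "latency_ms": 12.0, "accuracy": 0.78},
    {"name": "estimator-medium", "latency_ms": 35.0, "accuracy": 0.86},
    {"name": "estimator-large",  "latency_ms": 90.0, "accuracy": 0.90},
]

def select_model(candidates, latency_budget_ms):
    """Return the most accurate candidate whose latency fits the budget."""
    feasible = [c for c in candidates if c["latency_ms"] <= latency_budget_ms]
    if not feasible:  # nothing fits: fall back to the fastest model
        return min(candidates, key=lambda c: c["latency_ms"])
    return max(feasible, key=lambda c: c["accuracy"])

print(select_model(candidates, latency_budget_ms=40.0))  # -> estimator-medium
```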

[3] arXiv:2502.14063 [pdf, html, other]
Title: PedDet: Adaptive Spectral Optimization for Multimodal Pedestrian Detection
Authors: Rui Zhao, Zeyu Zhang, Yi Xu, Yi Yao, Yan Huang, Wenxin Zhang, Zirui Song, Xiuying Chen, Yang Zhao
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Pedestrian detection in intelligent transportation systems has made significant progress but faces two critical challenges: (1) insufficient fusion of complementary information between visible and infrared spectra, particularly in complex scenarios, and (2) sensitivity to illumination changes, such as low-light or overexposed conditions, leading to degraded performance. To address these issues, we propose PedDet, an adaptive spectral optimization complementarity framework specifically enhanced and optimized for multispectral pedestrian detection. PedDet introduces the Multi-scale Spectral Feature Perception Module (MSFPM) to adaptively fuse visible and infrared features, enhancing robustness and flexibility in feature extraction. Additionally, the Illumination Robustness Feature Decoupling Module (IRFDM) improves detection stability under varying lighting by decoupling pedestrian and background features. We further design a contrastive alignment to enhance intermodal feature discrimination. Experiments on the LLVIP and MSDS datasets demonstrate that PedDet achieves state-of-the-art performance, improving mAP by 6.6% with superior detection accuracy even in low-light conditions, marking a significant step forward for road safety. Code will be available at https://github.com/AIGeeksGroup/PedDet.

[4] arXiv:2502.14064 [pdf, html, other]
Title: Triad: Vision Foundation Model for 3D Magnetic Resonance Imaging
Authors: Shansong Wang, Mojtaba Safari, Qiang Li, Chih-Wei Chang, Richard LJ Qiu, Justin Roper, David S. Yu, Xiaofeng Yang
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

Vision foundation models (VFMs) are pre-trained on extensive image datasets to learn general representations for diverse types of data. These models can subsequently be fine-tuned for specific downstream tasks, significantly boosting performance across a broad range of applications. However, existing vision foundation models that claim to be applicable to various radiology tasks are mostly pre-trained on 3D computed tomography (CT), which benefits from the availability of extensive 3D CT databases. Significant differences between CT and magnetic resonance imaging (MRI) in imaging principles, signal characteristics, and data distribution may hinder their practical performance and versatility in MRI-specific applications. Here, we propose Triad, a vision foundation model for 3D MRI. Triad adopts a widely used autoencoder architecture to learn robust representations from 131,170 3D MRI volumes and uses organ-independent imaging descriptions to constrain the semantic distribution of the visual modality. The pre-training dataset is called Triad-131K, which is currently the largest 3D MRI pre-training dataset. We evaluate Triad across three tasks, namely organ/tumor segmentation, organ/cancer classification, and medical image registration, under both within-domain and out-of-domain settings using 25 downstream datasets. By initializing models with Triad's pre-trained weights, nnUNet-Triad improves segmentation performance by 6.88% compared to nnUNet-Scratch across 17 datasets. Swin-B-Triad achieves a 3.97% improvement over Swin-B-Scratch in classification tasks across five datasets. SwinUNETR-Triad improves by 4.00% compared to SwinUNETR-Scratch in registration tasks across two datasets. Our study demonstrates that pre-training can maximize performance when the data modalities and organs of upstream and downstream tasks are consistent.

[5] arXiv:2502.14068 [pdf, html, other]
Title: A Racing Dataset and Baseline Model for Track Detection in Autonomous Racing
Authors: Shreya Ghosh, Yi-Huan Chen, Ching-Hsiang Huang, Abu Shafin Mohammad Mahdee Jameel, Chien Chou Ho, Aly El Gamal, Samuel Labi
Comments: Currently Under Review
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Image and Video Processing (eess.IV)

A significant challenge in racing-related research is the lack of publicly available datasets containing raw images with corresponding annotations for downstream tasks. In this paper, we introduce RoRaTrack, a novel dataset that contains annotated multi-camera image data from racing scenarios for track detection. The data is collected on a Dallara AV-21 at a racing circuit in Indiana, in collaboration with the Indy Autonomous Challenge (IAC). RoRaTrack addresses common problems such as blurriness due to high speed, color inversion from the camera, and absence of lane markings on the track. Consequently, we propose RaceGAN, a baseline model based on a Generative Adversarial Network (GAN) that effectively addresses these challenges. The proposed model demonstrates superior performance compared to current state-of-the-art machine learning models in track detection. The dataset and code for this work are available at http://github.com/RaceGAN.

[6] arXiv:2502.14070 [pdf, html, other]
Title: DiffExp: Efficient Exploration in Reward Fine-tuning for Text-to-Image Diffusion Models
Authors: Daewon Chae, June Suk Choi, Jinkyu Kim, Kimin Lee
Comments: AAAI 2025
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

Fine-tuning text-to-image diffusion models to maximize rewards has proven effective for enhancing model performance. However, reward fine-tuning methods often suffer from slow convergence due to online sample generation. Therefore, obtaining diverse samples with strong reward signals is crucial for improving sample efficiency and overall performance. In this work, we introduce DiffExp, a simple yet effective exploration strategy for reward fine-tuning of text-to-image models. Our approach employs two key strategies: (a) dynamically adjusting the scale of classifier-free guidance to enhance sample diversity, and (b) randomly weighting phrases of the text prompt to exploit high-quality reward signals. We demonstrate that these strategies significantly enhance exploration during online sample generation, improving the sample efficiency of recent reward fine-tuning methods, such as DDPO and AlignProp.
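
The two exploration strategies named in the abstract, randomizing the classifier-free guidance scale and randomly weighting prompt phrases, can be pictured with a short sketch. The generate() stub, the sampling ranges, and the "(phrase:weight)" notation below are illustrative assumptions, not the DiffExp implementation.

```python
# Minimal sketch of the two exploration strategies applied during online sample
# generation for reward fine-tuning. generate() stands in for a diffusion sampler.
import random

def weight_phrases(prompt, low=0.5, high=1.5):
    """Randomly reweight comma-separated phrases, e.g. '(a red car:1.30)' (illustrative syntax)."""
    phrases = [p.strip() for p in prompt.split(",")]
    return ", ".join(f"({p}:{random.uniform(low, high):.2f})" for p in phrases)

def generate(prompt, guidance_scale):
    """Stub for a text-to-image sampler; a real implementation would return an image."""
    return {"prompt": prompt, "cfg": guidance_scale}

prompt = "a red sports car, wet asphalt, sunset lighting"
rollouts = []
for _ in range(8):  # online rollouts feeding DDPO/AlignProp-style reward fine-tuning
    cfg = random.uniform(4.0, 12.0)          # (a) dynamically vary classifier-free guidance
    noisy_prompt = weight_phrases(prompt)    # (b) randomly weight prompt phrases
    rollouts.append(generate(noisy_prompt, cfg))

for r in rollouts[:2]:
    print(r)
```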

[7] arXiv:2502.14088 [pdf, html, other]
Title: Regression in EO: Are VLMs Up to the Challenge?
Authors: Xizhe Xue, Xiao Xiang Zhu
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Earth Observation (EO) data encompass a vast range of remotely sensed information, featuring multi-sensor and multi-temporal observations, and play an indispensable role in understanding our planet's dynamics. Recently, Vision Language Models (VLMs) have achieved remarkable success in perception and reasoning tasks, bringing new insights and opportunities to the EO field. However, their potential for EO applications, especially for scientific regression-related applications, remains largely unexplored. This paper bridges that gap by systematically examining the challenges and opportunities of adapting VLMs for EO regression tasks. The discussion first contrasts the distinctive properties of EO data with conventional computer vision datasets, then identifies four core obstacles in applying VLMs to EO regression: 1) the absence of dedicated benchmarks, 2) the discrete-versus-continuous representation mismatch, 3) error accumulation, and 4) the suboptimal nature of text-centric training objectives for numerical tasks. Next, a series of methodological insights and potential subtle pitfalls are explored. Lastly, we offer some promising future directions for designing robust, domain-aware solutions. Our findings highlight the promise of VLMs for scientific regression in EO, setting the stage for more precise and interpretable modeling of critical environmental processes.

[8] arXiv:2502.14099 [pdf, html, other]
Title: Point Cloud Geometry Scalable Coding Using a Resolution and Quality-conditioned Latents Probability Estimator
Authors: Daniele Mari, André F. R. Guarda, Nuno M. M. Rodrigues, Simone Milani, Fernando Pereira
Comments: Submitted to IEEE and currently under review
Subjects: Computer Vision and Pattern Recognition (cs.CV)

In the current age, users consume multimedia content in very heterogeneous scenarios in terms of network, hardware, and display capabilities. A naive solution to this problem is to encode multiple independent streams, each covering a different possible requirement of the clients, with an obvious negative impact on both storage and computational requirements. These drawbacks can be avoided by using codecs that enable scalability, i.e., the ability to generate a progressive bitstream, containing a base layer followed by multiple enhancement layers, that allows decoding the same bitstream to serve multiple reconstruction and visualization specifications. While scalable coding is a well-known and well-addressed feature in conventional image and video codecs, this paper focuses on a new and very different problem, namely the development of scalable coding solutions for deep learning-based Point Cloud (PC) coding. The peculiarities of this 3D representation make it hard to implement flexible solutions that do not compromise the other functionalities of the codec. This paper proposes a joint quality and resolution scalability scheme, named Scalable Resolution and Quality Hyperprior (SRQH), that, contrary to previous solutions, can model the relationship between latents obtained with models trained for different RD trade-offs and/or at different resolutions. Experimental results obtained by integrating SRQH into the emerging JPEG Pleno learning-based PC coding standard show that SRQH allows decoding the PC at different qualities and resolutions with a single bitstream while incurring only a limited RD penalty and a modest increase in complexity w.r.t. non-scalable JPEG PCC, which would require one bitstream per coding configuration.
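
To make the notion of a progressive bitstream concrete, the toy NumPy example below shows generic quality scalability with a base layer plus one enhancement layer obtained by residual quantization. It illustrates the underlying layered-coding idea only; it is not the SRQH scheme or JPEG Pleno PCC.

```python
# Toy illustration of quality scalability: a coarse base layer is always decodable,
# and an enhancement layer refines it when bandwidth allows. Generic residual coding,
# not the paper's method.
import numpy as np

rng = np.random.default_rng(0)
signal = rng.normal(size=16).astype(np.float32)   # stand-in for latent/geometry values

base_step, enh_step = 0.5, 0.1                    # coarse and fine quantization steps
base_layer = np.round(signal / base_step)         # transmitted first
residual = signal - base_layer * base_step
enh_layer = np.round(residual / enh_step)         # transmitted only if bandwidth allows

recon_base = base_layer * base_step               # decode base layer only
recon_full = recon_base + enh_layer * enh_step    # decode base + enhancement

mse = lambda x, y: float(np.mean((x - y) ** 2))
print("MSE base only      :", mse(signal, recon_base))
print("MSE base + enhance :", mse(signal, recon_full))   # strictly better quality
```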

[9] arXiv:2502.14113 [pdf, html, other]
Title: Object-centric Binding in Contrastive Language-Image Pretraining
Authors: Rim Assouel, Pietro Astolfi, Florian Bordes, Michal Drozdzal, Adriana Romero-Soriano
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

Recent advances in vision language models (VLMs) have been driven by contrastive models such as CLIP, which learn to associate visual information with corresponding text descriptions. However, these models have limitations in understanding complex compositional scenes involving multiple objects and their spatial relationships. To address these challenges, we propose a novel approach that diverges from commonly used strategies, which rely on the design of hard-negative augmentations. Instead, our work focuses on integrating inductive biases into pre-trained CLIP-like models to improve their compositional understanding without using any additional hard negatives. To that end, we introduce a binding module that connects a scene graph, derived from a text description, with a slot-structured image representation, facilitating a structured similarity assessment between the two modalities. We also leverage relationships as text-conditioned visual constraints, thereby capturing the intricate interactions between objects and their contextual relationships more effectively. Our resulting model not only enhances the performance of CLIP-based models in multi-object compositional understanding but also paves the way towards more accurate and sample-efficient image-text matching of complex scenes.

[10] arXiv:2502.14125 [pdf, html, other]
Title: Modular Prompt Learning Improves Vision-Language Models
Authors: Zhenhan Huang, Tejaswini Pedapati, Pin-Yu Chen, Jianxi Gao
Comments: 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Pre-trained vision-language models are able to interpret visual concepts and language semantics. Prompt learning, a method of constructing prompts for text encoders or image encoders, elicits the potential of pre-trained models and readily adapts them to new scenarios. Compared to fine-tuning, prompt learning enables the model to achieve comparable or better performance using fewer trainable parameters. Moreover, prompt learning freezes the pre-trained model and avoids the catastrophic forgetting issue seen in fine-tuning. Continuous prompts inserted into the input of every transformer layer (i.e., deep prompts) can improve the performance of pre-trained models on downstream tasks. For the $i$-th transformer layer, the inserted prompts replace the prompts inserted at the $(i-1)$-th layer. Although the self-attention mechanism contextualizes newly inserted prompts for the current layer and embeddings from the previous layer's output, removing all inserted prompts from the previous layer inevitably loses the information contained in those continuous prompts. In this work, we propose Modular Prompt Learning (MPL), which is designed to promote the preservation of information contained in the inserted prompts. We evaluate the proposed method on base-to-new generalization and cross-dataset tasks. Averaged over 11 datasets, our method achieves a 0.7% performance gain on the base-to-new generalization task compared to the state-of-the-art method. The largest improvement on an individual dataset is 10.7% (EuroSAT).
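
The distinction the abstract draws, between deep prompting that discards the previous layer's prompts and a variant that preserves them, can be sketched with a toy Transformer stack. The preservation rule below is a simplified stand-in for MPL, not the paper's architecture.

```python
# Toy contrast: standard deep prompting drops the previous layer's prompt outputs,
# while the "preserving" variant keeps carrying them forward. Illustrative only.
import torch
import torch.nn as nn

d_model, n_layers, n_prompts, seq_len = 64, 4, 4, 16
layers = nn.ModuleList(
    [nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True) for _ in range(n_layers)]
)
# One set of learnable continuous prompts per layer.
prompts = nn.ParameterList(
    [nn.Parameter(torch.randn(1, n_prompts, d_model)) for _ in range(n_layers)]
)

def deep_prompting(x, keep_previous=False):
    h = x
    carried = h.new_zeros(h.size(0), 0, h.size(2))  # previously inserted prompts to keep
    for layer, p in zip(layers, prompts):
        new_p = p.expand(h.size(0), -1, -1)
        n_carried = carried.size(1)
        out = layer(torch.cat([new_p, carried, h], dim=1))
        out_new = out[:, :n_prompts]
        out_carried = out[:, n_prompts:n_prompts + n_carried]
        h = out[:, n_prompts + n_carried:]
        # Standard deep prompting discards out_new and out_carried here; the
        # preserving variant keeps them so later layers still see that information.
        if keep_previous:
            carried = torch.cat([out_carried, out_new], dim=1)
        else:
            carried = h.new_zeros(h.size(0), 0, h.size(2))
    return h

x = torch.randn(2, seq_len, d_model)                  # patch/token embeddings
print(deep_prompting(x, keep_previous=False).shape)   # torch.Size([2, 16, 64])
print(deep_prompting(x, keep_previous=True).shape)    # torch.Size([2, 16, 64])
```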

[11] arXiv:2502.14129 [pdf, html, other]
Title: GlossGau: Efficient Inverse Rendering for Glossy Surface with Anisotropic Spherical Gaussian
Authors: Bang Du, Runfa Blark Li, Chen Du, Truong Nguyen
Subjects: Computer Vision and Pattern Recognition (cs.CV)

The reconstruction of 3D objects from calibrated photographs represents a fundamental yet intricate challenge in the domains of computer graphics and vision. Although neural reconstruction approaches based on Neural Radiance Fields (NeRF) have shown remarkable capabilities, their processing costs remain substantial. Recently, the advent of 3D Gaussian Splatting (3D-GS) has largely improved training efficiency and facilitated realistic rendering in real time. However, due to the limited ability of Spherical Harmonics (SH) to represent high-frequency information, 3D-GS falls short in reconstructing glossy objects. Researchers have turned to enhancing the specular expressiveness of 3D-GS through inverse rendering. Yet these methods often struggle to maintain training and rendering efficiency, undermining the benefits of Gaussian Splatting techniques. In this paper, we introduce GlossGau, an efficient inverse rendering framework that reconstructs scenes with glossy surfaces while maintaining training and rendering speeds comparable to vanilla 3D-GS. Specifically, we explicitly model the surface normals, Bidirectional Reflectance Distribution Function (BRDF) parameters, and incident lights, and use an Anisotropic Spherical Gaussian (ASG) to approximate the per-Gaussian Normal Distribution Function under the microfacet model. We utilize 2D Gaussian Splatting (2D-GS) as the foundational primitive and apply regularization to significantly alleviate the normal estimation challenge encountered in related works. Experiments demonstrate that GlossGau achieves competitive or superior reconstruction on datasets with glossy surfaces. Compared with previous GS-based works that address specular surfaces, our optimization time is considerably lower.
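
For reference, an Anisotropic Spherical Gaussian in the form commonly used in the graphics literature evaluates a direction against an orthonormal lobe frame with two bandwidth parameters, which is what lets it capture sharp, anisotropic specular lobes. The NumPy sketch below shows that evaluation; parameter values are arbitrary and this is not the GlossGau code.

```python
# ASG(v) = amplitude * max(v . lobe, 0) * exp(-lam*(v . tangent)^2 - mu*(v . bitangent)^2),
# where (tangent, bitangent, lobe) form an orthonormal frame. Minimal evaluation sketch.
import numpy as np

def asg(v, lobe, tangent, bitangent, amplitude=1.0, lam=50.0, mu=10.0):
    v = v / np.linalg.norm(v, axis=-1, keepdims=True)
    smooth = np.clip(v @ lobe, 0.0, None)            # clamp to the upper hemisphere
    return amplitude * smooth * np.exp(-lam * (v @ tangent) ** 2 - mu * (v @ bitangent) ** 2)

lobe      = np.array([0.0, 0.0, 1.0])   # lobe axis
tangent   = np.array([1.0, 0.0, 0.0])
bitangent = np.array([0.0, 1.0, 0.0])

dirs = np.array([[0.0, 0.0, 1.0],       # aligned with the lobe -> maximal response
                 [0.2, 0.0, 0.98],      # tilted along the "sharp" (lam) axis
                 [0.0, 0.2, 0.98]])     # tilted along the "wide" (mu) axis
print(asg(dirs, lobe, tangent, bitangent))
```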

[12] arXiv:2502.14140 [pdf, html, other]
Title: ModSkill: Physical Character Skill Modularization
Authors: Yiming Huang, Zhiyang Dou, Lingjie Liu
Subjects: Computer Vision and Pattern Recognition (cs.CV); Graphics (cs.GR); Robotics (cs.RO)

Human motion is highly diverse and dynamic, posing challenges for imitation learning algorithms that aim to generalize motor skills for controlling simulated characters. Previous methods typically rely on a universal full-body controller for tracking reference motion (tracking-based model) or a unified full-body skill embedding space (skill embedding). However, these approaches often struggle to generalize and scale to larger motion datasets. In this work, we introduce a novel skill learning framework, ModSkill, that decouples complex full-body skills into compositional, modular skills for independent body parts. Our framework features a skill modularization attention layer that processes policy observations into modular skill embeddings that guide low-level controllers for each body part. We also propose an Active Skill Learning approach with Generative Adaptive Sampling, using large motion generation models to adaptively enhance policy learning in challenging tracking scenarios. Our results show that this modularized skill learning framework, enhanced by generative sampling, outperforms existing methods in precise full-body motion tracking and enables reusable skill embeddings for diverse goal-driven tasks.

[13] arXiv:2502.14142 [pdf, other]
Title: Token Adaptation via Side Graph Convolution for Temporally and Spatially Efficient Fine-tuning of 3D Point Cloud Transformers
Authors: Takahiko Furuya
Comments: Currently under review
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Parameter-efficient fine-tuning (PEFT) of pre-trained 3D point cloud Transformers has emerged as a promising technique for 3D point cloud analysis. While existing PEFT methods attempt to minimize the number of tunable parameters, they still suffer from high temporal and spatial computational costs during fine-tuning. This paper proposes a novel PEFT algorithm for 3D point cloud Transformers, called Side Token Adaptation on a neighborhood Graph (STAG), to achieve superior temporal and spatial efficiency. STAG employs a graph convolutional side network that operates in parallel with a frozen backbone Transformer to adapt tokens to downstream tasks. STAG's side network realizes high efficiency through three key components: a connection with the backbone that enables reduced gradient computation, a parameter-sharing framework, and efficient graph convolution. Furthermore, we present Point Cloud Classification 13 (PCC13), a new benchmark comprising diverse publicly available 3D point cloud datasets, enabling comprehensive evaluation of PEFT methods. Extensive experiments using multiple pre-trained models and PCC13 demonstrate the effectiveness of STAG. Specifically, STAG maintains classification accuracy comparable to existing methods while reducing tunable parameters to only 0.43M and achieving significant reductions in both computational time and memory consumption for fine-tuning. Code and benchmark will be available at: https://github.com/takahikof/STAG

[14] arXiv:2502.14149 [pdf, html, other]
Title: PitVQA++: Vector Matrix-Low-Rank Adaptation for Open-Ended Visual Question Answering in Pituitary Surgery
Authors: Runlong He, Danyal Z. Khan, Evangelos B. Mazomenos, Hani J. Marcus, Danail Stoyanov, Matthew J. Clarkson, Mobarakol Islam
Comments: 9 pages
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

Vision-Language Models (VLMs) for visual question answering (VQA) offer a unique opportunity to enhance intra-operative decision-making, promote intuitive interactions, and significantly advance surgical education. However, the development of VLMs for surgical VQA is challenging due to limited datasets and the risk of overfitting and catastrophic forgetting during full fine-tuning of pretrained weights. While parameter-efficient techniques like Low-Rank Adaptation (LoRA) and Matrix of Rank Adaptation (MoRA) address adaptation challenges, their uniform parameter distribution overlooks the feature hierarchy in deep networks, where earlier layers, which learn general features, require more parameters than later ones. This work introduces PitVQA++ with an open-ended PitVQA dataset and vector matrix-low-rank adaptation (Vector-MoLoRA), an innovative VLM fine-tuning approach for adapting GPT-2 to pituitary surgery. Open-Ended PitVQA comprises around 101,803 frames from 25 procedural videos with 745,972 question-answer sentence pairs, covering key surgical elements such as phase and step recognition, context understanding, tool detection, localization, and interaction recognition. Vector-MoLoRA incorporates the principles of LoRA and MoRA to develop a matrix-low-rank adaptation strategy that employs vector ranking to allocate more parameters to earlier layers, gradually reducing them in the later layers. Our approach, validated on the Open-Ended PitVQA and EndoVis18-VQA datasets, effectively mitigates catastrophic forgetting while significantly enhancing performance over recent baselines. Furthermore, our risk-coverage analysis highlights its enhanced reliability and trustworthiness in handling uncertain predictions. Our source code and dataset are available at https://github.com/HRL-Mike/PitVQA-Plus.
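
The core allocation idea, larger adapter ranks in earlier layers and smaller ones later, can be sketched with plain LoRA modules and a linear rank schedule. The schedule and hyperparameters below are illustrative assumptions, not the Vector-MoLoRA parameterization from the paper.

```python
# Sketch of layer-wise rank allocation: earlier layers get larger low-rank adapters.
# Plain LoRA with a linear rank schedule, for illustration only.
import torch
import torch.nn as nn

n_layers, d_model = 12, 768
r_max, r_min = 32, 4   # rank for the first and last layers (illustrative values)

def rank_for_layer(i):
    """Linearly decrease the adapter rank from r_max (layer 0) to r_min (last layer)."""
    frac = i / (n_layers - 1)
    return int(round(r_max + frac * (r_min - r_max)))

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for param in self.base.parameters():
            param.requires_grad_(False)      # freeze the pretrained weights
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

adapters = nn.ModuleList(
    [LoRALinear(nn.Linear(d_model, d_model), rank_for_layer(i)) for i in range(n_layers)]
)
print([rank_for_layer(i) for i in range(n_layers)])   # ranks taper from 32 down to 4
x = torch.randn(2, d_model)
print(adapters[0](x).shape)                           # torch.Size([2, 768])
```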

[15] arXiv:2502.14156 [pdf, html, other]
Title: Mixed Signals: A Diverse Point Cloud Dataset for Heterogeneous LiDAR V2X Collaboration
Authors: Katie Z Luo, Minh-Quan Dao, Zhenzhen Liu, Mark Campbell, Wei-Lun Chao, Kilian Q. Weinberger, Ezio Malis, Vincent Fremont, Bharath Hariharan, Mao Shan, Stewart Worrall, Julie Stephany Berrio Perez
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Vehicle-to-everything (V2X) collaborative perception has emerged as a promising solution to address the limitations of single-vehicle perception systems. However, existing V2X datasets are limited in scope, diversity, and quality. To address these gaps, we present Mixed Signals, a comprehensive V2X dataset featuring 45.1k point clouds and 240.6k bounding boxes collected from three connected autonomous vehicles (CAVs) equipped with two different types of LiDAR sensors, plus a roadside unit with dual LiDARs. Our dataset provides precisely aligned point clouds and bounding box annotations across 10 classes, ensuring reliable data for perception training. We provide detailed statistical analysis of the quality of our dataset and extensively benchmark existing V2X methods on it. The Mixed Signals V2X dataset is one of the highest-quality, large-scale datasets publicly available for V2X perception research. Details are available on the project website: https://mixedsignalsdataset.cs.cornell.edu/.

[16] arXiv:2502.14168 [pdf, html, other]
Title: Deep learning based infrared small object segmentation: Challenges and future directions
Authors: Zhengeng Yang, Hongshan Yu, Jianjun Zhang, Qiang Tang, Ajmal Mian
Comments: This is a submitted version of a paper accepted by Information Fusion. For a better reading experience, please refer to the final published version in Information Fusion
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Infrared sensing is a core method for supporting unmanned systems, such as autonomous vehicles and drones. Recently, infrared sensors have been widely deployed on mobile and stationary platforms for detection and classification of objects from long distances and in wide fields of view. Given its success in the visible image analysis domain, deep learning has also been applied to object recognition in infrared images. However, techniques that have proven successful in visible light perception face new challenges in the infrared domain. These challenges include extremely low signal-to-noise ratios in infrared images, very small and blurred objects of interest, and limited availability of labeled/unlabeled training data due to the specialized nature of infrared sensors. Numerous methods have been proposed in the literature for the detection and classification of small objects in infrared images, achieving varied levels of success. There is a need for a survey paper that critically analyzes existing techniques in this domain, identifies unsolved challenges, and provides future research directions. This paper fills that gap and offers a concise and insightful review of deep learning-based methods. It also identifies the challenges faced by existing infrared object segmentation methods and provides a structured review of existing infrared perception methods from the perspective of these challenges, highlighting the motivations behind the various approaches. Finally, this review suggests promising future directions based on recent advancements within this domain.

[17] arXiv:2502.14184 [pdf, html, other]
Title: Bayesian SegNet for Semantic Segmentation with Improved Interpretation of Microstructural Evolution During Irradiation of Materials
Authors: Marjolein Oostrom, Alex Hagen, Nicole LaHaye, Karl Pazdernik
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)

Understanding the relationship between the evolution of microstructures of irradiated LiAlO2 pellets and tritium diffusion, retention, and release could improve predictions of tritium-producing burnable absorber rod performance. Given expert-labeled segmented images of irradiated and unirradiated pellets, we trained Deep Convolutional Neural Networks to segment images into defect, grain, and boundary classes. Qualitative microstructural information was calculated from these segmented images to facilitate the comparison of unirradiated and irradiated pellets. We tested modifications to improve the sensitivity of the model, including incorporating meta-data into the model and utilizing uncertainty quantification. The predicted segmentation was similar to the expert-labeled segmentation for most methods of microstructural quantification, including pixel proportion, defect area, and defect density. Overall, the high performance metrics for the best models for both irradiated and unirradiated images show that utilizing neural network models is a viable alternative to expert-labeled images.
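
Uncertainty quantification in Bayesian SegNet-style models is commonly obtained with Monte Carlo dropout: keep dropout active at test time, run several stochastic forward passes, and treat the spread of the predictions as a per-pixel uncertainty map. The toy network below illustrates that procedure; it is a generic sketch, not the authors' model.

```python
# Monte Carlo dropout for segmentation uncertainty: average softmax outputs over
# stochastic passes and report predictive entropy per pixel. Toy network only.
import torch
import torch.nn as nn

n_classes = 3  # e.g., defect / grain / boundary
net = nn.Sequential(
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
    nn.Dropout2d(p=0.5),
    nn.Conv2d(16, n_classes, 3, padding=1),
)

def mc_dropout_predict(net, image, passes=20):
    net.train()  # keep dropout stochastic at inference time
    with torch.no_grad():
        probs = torch.stack([net(image).softmax(dim=1) for _ in range(passes)])
    mean_probs = probs.mean(dim=0)                                      # (B, C, H, W)
    entropy = -(mean_probs * mean_probs.clamp_min(1e-8).log()).sum(1)   # predictive entropy
    return mean_probs.argmax(dim=1), entropy

image = torch.randn(1, 1, 64, 64)  # stand-in for a micrograph
seg, uncertainty = mc_dropout_predict(net, image)
print(seg.shape, uncertainty.shape)  # torch.Size([1, 64, 64]) torch.Size([1, 64, 64])
```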

[18] arXiv:2502.14190 [pdf, html, other]
Title: Stereo Image Coding for Machines with Joint Visual Feature Compression
Authors: Dengchao Jin, Jianjun Lei, Bo Peng, Zhaoqing Pan, Nam Ling, Qingming Huang
Subjects: Computer Vision and Pattern Recognition (cs.CV); Image and Video Processing (eess.IV)

2D image coding for machines (ICM) has achieved great success in coding efficiency, while less effort has been devoted to the stereo image field. To promote the efficiency of stereo image compression (SIC) and intelligent analysis, stereo image coding for machines (SICM) is formulated and explored in this paper. More specifically, a machine vision-oriented stereo feature compression network (MVSFC-Net) is proposed for SICM, where stereo visual features are effectively extracted, compressed, and transmitted for the 3D visual task. To efficiently compress stereo visual features in MVSFC-Net, a stereo multi-scale feature compression (SMFC) module is designed to gradually transform sparse stereo multi-scale features into compact joint visual representations by removing spatial, inter-view, and cross-scale redundancies simultaneously. Experimental results show that the proposed MVSFC-Net obtains superior compression efficiency as well as 3D visual task performance, when compared with the existing ICM anchors recommended by MPEG and the state-of-the-art SIC method.
</p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2502.14191" title="Abstract" id="2502.14191"> arXiv:2502.14191 </a> [<a href="/pdf/2502.14191" title="Download PDF" id="pdf-2502.14191" aria-labelledby="pdf-2502.14191">pdf</a>, <a href="https://arxiv.org/html/2502.14191v1" title="View HTML" id="html-2502.14191" aria-labelledby="html-2502.14191" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14191" title="Other formats" id="oth-2502.14191" aria-labelledby="oth-2502.14191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multimodal RewardBench: Holistic Evaluation of Reward Models for Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yasunaga,+M">Michihiro Yasunaga</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zettlemoyer,+L">Luke Zettlemoyer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghazvininejad,+M">Marjan Ghazvininejad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Dataset available at <a href="https://github.com/facebookresearch/multimodal_rewardbench" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Reward models play an essential role in training vision-language models (VLMs) by assessing output quality to enable aligning with human preferences. Despite their importance, the research community lacks comprehensive open benchmarks for evaluating multimodal reward models in VLMs. To address this gap, we introduce Multimodal RewardBench, an expert-annotated benchmark covering six domains: general correctness, preference, knowledge, reasoning, safety, and visual question-answering. Our dataset comprises 5,211 annotated (prompt, chosen response, rejected response) triplets collected from various VLMs. In evaluating a range of VLM judges, we find that even the top-performing models, Gemini 1.5 Pro and Claude 3.5 Sonnet, achieve only 72% overall accuracy. Notably, most models struggle in the reasoning and safety domains. These findings suggest that Multimodal RewardBench offers a challenging testbed for advancing reward model development across multiple domains. We release the benchmark at <a href="https://github.com/facebookresearch/multimodal_rewardbench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
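</p> <p class='mathjax'> Because the benchmark scores judges on (prompt, chosen response, rejected response) triplets, the reported overall accuracy reduces to the fraction of triplets in which the judge prefers the chosen response. A minimal sketch, assuming a hypothetical judge(prompt, response) callable that returns a scalar quality score: </p> <pre><code class="language-python">
def judge_accuracy(triplets, judge):
    """triplets: iterable of (prompt, chosen, rejected) strings.
    judge: hypothetical callable returning a scalar preference score."""
    correct, total = 0, 0
    for prompt, chosen, rejected in triplets:
        total += 1
        if judge(prompt, chosen) > judge(prompt, rejected):
            correct += 1
    return correct / max(total, 1)
</code></pre> <p class='mathjax'>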
</p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2502.14195" title="Abstract" id="2502.14195"> arXiv:2502.14195 </a> [<a href="/pdf/2502.14195" title="Download PDF" id="pdf-2502.14195" aria-labelledby="pdf-2502.14195">pdf</a>, <a href="/format/2502.14195" title="Other formats" id="oth-2502.14195" aria-labelledby="oth-2502.14195">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bridging Text and Vision: A Multi-View Text-Vision Registration Approach for Cross-Modal Place Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+T">Tianyi Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Pengjie Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+J">Jinwei Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Gang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+Z">Zihan Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Weijun Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 4 figures, conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Mobile robots necessitate advanced natural language understanding capabilities to accurately identify locations and perform tasks such as package delivery. However, traditional visual place recognition (VPR) methods rely solely on single-view visual information and cannot interpret human language descriptions. To overcome this challenge, we bridge text and vision by proposing a multiview (360° views of the surroundings) text-vision registration approach called Text4VPR for the place recognition task, which is the first method that exclusively utilizes textual descriptions to match a database of images. Text4VPR employs the frozen T5 language model to extract global textual embeddings. Additionally, it utilizes the Sinkhorn algorithm with a temperature coefficient to assign local tokens to their respective clusters, thereby aggregating visual descriptors from images. During the training stage, Text4VPR emphasizes the alignment between individual text-image pairs for precise textual description. In the inference stage, Text4VPR uses the Cascaded Cross-Attention Cosine Alignment (CCCA) to address the internal mismatch between text and image groups. Subsequently, Text4VPR performs precise place matching based on the descriptions of text-image groups. On Street360Loc, the first text-to-image VPR dataset we created, Text4VPR builds a robust baseline, achieving a leading top-1 accuracy of 57% and a leading top-10 accuracy of 92% within a 5-meter radius on the test set, which indicates that localization from textual descriptions to images is not only feasible but also holds significant potential for further advancement, as shown in Figure 1. 
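</p> <p class='mathjax'> The Sinkhorn step named above can be pictured with the generic log-domain Sinkhorn-Knopp iteration; the sketch below is that standard algorithm with a temperature scale, not necessarily the paper's exact formulation. Here scores is an N-by-K token-to-cluster similarity matrix. </p> <pre><code class="language-python">
import torch

def sinkhorn_assign(scores, temperature=0.1, n_iters=50):
    """Temperature-scaled Sinkhorn normalization of token-to-cluster logits.
    Returns a soft assignment matrix with balanced rows and columns."""
    log_p = scores / temperature
    for _ in range(n_iters):
        log_p = log_p - torch.logsumexp(log_p, dim=1, keepdim=True)  # normalize rows
        log_p = log_p - torch.logsumexp(log_p, dim=0, keepdim=True)  # normalize columns
    return log_p.exp()
</code></pre> <p class='mathjax'>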
</p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2502.14209" title="Abstract" id="2502.14209"> arXiv:2502.14209 </a> [<a href="/pdf/2502.14209" title="Download PDF" id="pdf-2502.14209" aria-labelledby="pdf-2502.14209">pdf</a>, <a href="https://arxiv.org/html/2502.14209v1" title="View HTML" id="html-2502.14209" aria-labelledby="html-2502.14209" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14209" title="Other formats" id="oth-2502.14209" aria-labelledby="oth-2502.14209">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Spatial and Frequency Domain Adaptive Fusion Network for Image Deblurring </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Hu Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+D">Depeng Dang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Image deblurring aims to reconstruct a latent sharp image from its corresponding blurred one. Although existing methods have achieved good performance, most of them operate exclusively in either the spatial domain or the frequency domain, rarely exploring solutions that fuse both domains. In this paper, we propose a spatial-frequency domain adaptive fusion network (SFAFNet) to address this limitation. Specifically, we design a gated spatial-frequency domain feature fusion block (GSFFBlock), which consists of three key components: a spatial domain information module, a frequency domain information dynamic generation module (FDGM), and a gated fusion module (GFM). The spatial domain information module employs the NAFBlock to integrate local information. Meanwhile, in the FDGM, we design a learnable low-pass filter that dynamically decomposes features into separate frequency subbands, capturing the image-wide receptive field and enabling the adaptive exploration of global contextual information. Additionally, to facilitate information flow and the learning of complementary representations, in the GFM we present a gating mechanism (GATE) to re-weight spatial and frequency domain features, which are then fused through the cross-attention mechanism (CAM). Experimental results demonstrate that our SFAFNet performs favorably compared to state-of-the-art approaches on commonly used benchmarks. 
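</p> <p class='mathjax'> As a rough illustration of the two ideas named above (a learnable low-pass decomposition and a gate that re-weights spatial versus frequency features), here is a minimal PyTorch sketch; the module names, the soft Fourier-domain cutoff, and the 1x1-conv gate are assumptions, not the paper's implementation. </p> <pre><code class="language-python">
import torch
import torch.nn as nn

class LearnableLowPass(nn.Module):
    """Split features into low/high frequency parts via a soft, learnable
    radial cutoff applied in the Fourier domain."""
    def __init__(self):
        super().__init__()
        self.cutoff = nn.Parameter(torch.tensor(0.25))   # normalized frequency radius

    def forward(self, x):                                # x: (B, C, H, W)
        B, C, H, W = x.shape
        freq = torch.fft.rfft2(x, norm="ortho")
        fy = torch.fft.fftfreq(H, device=x.device).abs().view(1, 1, H, 1)
        fx = torch.fft.rfftfreq(W, device=x.device).view(1, 1, 1, W // 2 + 1)
        radius = torch.sqrt(fy ** 2 + fx ** 2)
        mask = torch.sigmoid((self.cutoff - radius) * 50.0)   # soft low-pass mask
        low = torch.fft.irfft2(freq * mask, s=(H, W), norm="ortho")
        return low, x - low                              # low- and high-frequency parts

class GatedFusion(nn.Module):
    """Re-weight spatial-branch and frequency-branch features with a learned gate."""
    def __init__(self, channels):
        super().__init__()
        self.gate = nn.Sequential(nn.Conv2d(2 * channels, channels, 1), nn.Sigmoid())

    def forward(self, spatial_feat, freq_feat):
        g = self.gate(torch.cat([spatial_feat, freq_feat], dim=1))
        return g * spatial_feat + (1.0 - g) * freq_feat
</code></pre> <p class='mathjax'>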
</p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2502.14221" title="Abstract" id="2502.14221"> arXiv:2502.14221 </a> [<a href="/pdf/2502.14221" title="Download PDF" id="pdf-2502.14221" aria-labelledby="pdf-2502.14221">pdf</a>, <a href="https://arxiv.org/html/2502.14221v1" title="View HTML" id="html-2502.14221" aria-labelledby="html-2502.14221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14221" title="Other formats" id="oth-2502.14221" aria-labelledby="oth-2502.14221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> H3DE-Net: Efficient and Accurate 3D Landmark Detection in Medical Imaging </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Z">Zhen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Ronghao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaoqian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yangbo Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Suhua Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+X">Xiaoxin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Han Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Q">Qingsong Yao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> 3D landmark detection is a critical task in medical image analysis, and accurately detecting anatomical landmarks is essential for subsequent medical imaging tasks. However, mainstream deep learning methods in this field struggle to simultaneously capture fine-grained local features and model global spatial relationships, while maintaining a balance between accuracy and computational efficiency. Local feature extraction requires capturing fine-grained anatomical details, while global modeling requires understanding the spatial relationships within complex anatomical structures. The high-dimensional nature of 3D volumes further exacerbates these challenges, as landmarks are sparsely distributed, leading to significant computational costs. Therefore, achieving efficient and precise 3D landmark detection remains a pressing challenge in medical image analysis. <br>In this work, we propose a \textbf{H}ybrid \textbf{3}D \textbf{DE}tection \textbf{Net} (H3DE-Net), a novel framework that combines CNNs for local feature extraction with a lightweight attention mechanism designed to efficiently capture global dependencies in 3D volumetric data. This mechanism employs a hierarchical routing strategy to reduce computational cost while maintaining global context modeling. To our knowledge, H3DE-Net is the first 3D landmark detection model that integrates such a lightweight attention mechanism with CNNs. Additionally, integrating multi-scale feature fusion further enhances detection accuracy and robustness. Experimental results on a public CT dataset demonstrate that H3DE-Net achieves state-of-the-art (SOTA) performance, significantly improving accuracy and robustness, particularly in scenarios with missing landmarks or complex anatomical variations. We have already open-sourced our project, including code, data, and model weights. 
</p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2502.14226" title="Abstract" id="2502.14226"> arXiv:2502.14226 </a> [<a href="/pdf/2502.14226" title="Download PDF" id="pdf-2502.14226" aria-labelledby="pdf-2502.14226">pdf</a>, <a href="https://arxiv.org/html/2502.14226v1" title="View HTML" id="html-2502.14226" aria-labelledby="html-2502.14226" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14226" title="Other formats" id="oth-2502.14226" aria-labelledby="oth-2502.14226">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Designing Parameter and Compute Efficient Diffusion Transformers using Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sundaresha,+V">Vignesh Sundaresha</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Image and Video Processing (eess.IV) </div> <p class='mathjax'> Diffusion Transformers (DiTs) with billions of model parameters form the backbone of popular image and video generation models like DALL.E, Stable-Diffusion and SORA. Though these models are necessary in many low-latency applications like Augmented/Virtual Reality, they cannot be deployed on resource-constrained Edge devices (like Apple Vision Pro or Meta Ray-Ban glasses) due to their huge computational complexity. To overcome this, we turn to knowledge distillation and perform a thorough design-space exploration to achieve the best DiT for a given parameter size. In particular, we provide principles for how to choose design knobs such as depth, width, attention heads and distillation setup for a DiT. During the process, a three-way trade-off emerges between model performance, size and speed that is crucial for Edge implementation of diffusion. We also propose two distillation approaches - Teaching Assistant (TA) method and Multi-In-One (MI1) method - to perform feature distillation in the DiT context. Unlike existing solutions, we demonstrate and benchmark the efficacy of our approaches on practical Edge devices such as NVIDIA Jetson Orin Nano. 
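</p> <p class='mathjax'> Feature distillation of the kind discussed above is usually implemented by matching intermediate student activations to a frozen teacher through small projection layers. The sketch below is that generic recipe (the per-layer projections are hypothetical helpers, not the paper's code); a Teaching-Assistant variant would simply chain the same loss through an intermediate-sized model. </p> <pre><code class="language-python">
import torch.nn.functional as F

def feature_distill_loss(student_feats, teacher_feats, projections):
    """student_feats / teacher_feats: lists of intermediate activations.
    projections: per-layer learnable maps lifting student width to teacher width."""
    loss = 0.0
    for s, t, proj in zip(student_feats, teacher_feats, projections):
        loss = loss + F.mse_loss(proj(s), t.detach())   # teacher is kept frozen
    return loss / max(len(student_feats), 1)
</code></pre> <p class='mathjax'>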
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2502.14235" title="Abstract" id="2502.14235"> arXiv:2502.14235 </a> [<a href="/pdf/2502.14235" title="Download PDF" id="pdf-2502.14235" aria-labelledby="pdf-2502.14235">pdf</a>, <a href="https://arxiv.org/html/2502.14235v1" title="View HTML" id="html-2502.14235" aria-labelledby="html-2502.14235" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14235" title="Other formats" id="oth-2502.14235" aria-labelledby="oth-2502.14235">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OG-Gaussian: Occupancy Based Street Gaussians for Autonomous Driving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yedong Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xinran Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+Y">Yifan Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shiqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Heng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yilong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+J">Jianmin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yanyong Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Accurate and realistic 3D scene reconstruction enables the lifelike creation of autonomous driving simulation environments. With advancements in 3D Gaussian Splatting (3DGS), previous studies have applied it to reconstruct complex dynamic driving scenes. These methods typically require expensive LiDAR sensors and pre-annotated datasets of dynamic objects. To address these challenges, we propose OG-Gaussian, a novel approach that replaces LiDAR point clouds with Occupancy Grids (OGs) generated from surround-view camera images using Occupancy Prediction Network (ONet). Our method leverages the semantic information in OGs to separate dynamic vehicles from static street background, converting these grids into two distinct sets of initial point clouds for reconstructing both static and dynamic objects. Additionally, we estimate the trajectories and poses of dynamic objects through a learning-based approach, eliminating the need for complex manual annotations. Experiments on Waymo Open dataset demonstrate that OG-Gaussian is on par with the current state-of-the-art in terms of reconstruction quality and rendering speed, achieving an average PSNR of 35.13 and a rendering speed of 143 FPS, while significantly reducing computational costs and economic overhead. 
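</p> <p class='mathjax'> The step of turning a semantic occupancy grid into separate dynamic and static initialization point clouds can be pictured in a few lines of NumPy; the voxel-to-world conversion and the dynamic_ids label set below are illustrative assumptions, not the paper's code. </p> <pre><code class="language-python">
import numpy as np

def occupancy_to_point_clouds(occ, semantics, voxel_size, origin, dynamic_ids):
    """occ: (X, Y, Z) boolean occupancy grid; semantics: per-voxel class ids.
    Returns voxel-center point clouds for dynamic vehicles and static background."""
    idx = np.argwhere(occ)                            # occupied voxel indices, (N, 3)
    centers = np.asarray(origin) + (idx + 0.5) * voxel_size
    labels = semantics[idx[:, 0], idx[:, 1], idx[:, 2]]
    dyn_mask = np.isin(labels, dynamic_ids)
    return centers[dyn_mask], centers[~dyn_mask]      # (dynamic, static)
</code></pre> <p class='mathjax'>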
</p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2502.14267" title="Abstract" id="2502.14267"> arXiv:2502.14267 </a> [<a href="/pdf/2502.14267" title="Download PDF" id="pdf-2502.14267" aria-labelledby="pdf-2502.14267">pdf</a>, <a href="https://arxiv.org/html/2502.14267v1" title="View HTML" id="html-2502.14267" aria-labelledby="html-2502.14267" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14267" title="Other formats" id="oth-2502.14267" aria-labelledby="oth-2502.14267">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Money Recognition for the Visually Impaired: A Case Study on Sri Lankan Banknotes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bandara,+A">Akshaan Bandara</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Currency note recognition is a critical accessibility need for blind individuals, as identifying banknotes accurately can impact their independence and security in financial transactions. Several traditional and technological initiatives have been taken to date. Nevertheless, these approaches are less user-friendly and have made it more challenging for blind people to identify banknotes. This research proposes a user-friendly stand-alone system for the identification of Sri Lankan currency notes. A custom-created dataset of images of Sri Lankan currency notes was used to fine-tune an EfficientDet model. The currency note recognition model achieved 0.9847 AP on the validation dataset and performs exceptionally well in real-world scenarios. The high accuracy and the intuitive interface have enabled blind individuals to quickly and accurately identify currency denominations, ultimately encouraging accessibility and independence. 
</p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2502.14273" title="Abstract" id="2502.14273"> arXiv:2502.14273 </a> [<a href="/pdf/2502.14273" title="Download PDF" id="pdf-2502.14273" aria-labelledby="pdf-2502.14273">pdf</a>, <a href="https://arxiv.org/html/2502.14273v1" title="View HTML" id="html-2502.14273" aria-labelledby="html-2502.14273" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14273" title="Other formats" id="oth-2502.14273" aria-labelledby="oth-2502.14273">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLM-EvRep: Learning an LLM-Compatible Event Representation Using a Self-Supervised Framework </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zongyou Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+Q">Qiang Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+N">Nan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiaoming Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 2 figures,Companion Proceedings of the ACM Web Conference 2025 (WWW Companion '25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Multimedia (cs.MM) </div> <p class='mathjax'> Recent advancements in event-based recognition have demonstrated significant promise, yet most existing approaches rely on extensive training, limiting their adaptability for efficient processing of event-driven visual content. Meanwhile, large language models (LLMs) have exhibited remarkable zero-shot capabilities across diverse domains, but their application to event-based visual recognition remains largely unexplored. To bridge this gap, we propose \textbf{LLM-EvGen}, an event representation generator that produces LLM-compatible event representations \textbf{LLM-EvRep}, thereby enhancing the performance of LLMs on event recognition tasks. The generator is trained using a self-supervised framework, aligning the generated representations with semantic consistency and structural fidelity. Comprehensive experiments were conducted on three datasets: N-ImageNet, N-Caltech101, and N-MNIST. The results demonstrate that our method, \textbf{LLM-EvRep}, outperforms the event-to-video method, E2VID, by 15.93\%, 0.82\%, and 50.21\%, respectively, in recognition tasks when evaluated using GPT-4o. 
</p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2502.14279" title="Abstract" id="2502.14279"> arXiv:2502.14279 </a> [<a href="/pdf/2502.14279" title="Download PDF" id="pdf-2502.14279" aria-labelledby="pdf-2502.14279">pdf</a>, <a href="https://arxiv.org/html/2502.14279v1" title="View HTML" id="html-2502.14279" aria-labelledby="html-2502.14279" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14279" title="Other formats" id="oth-2502.14279" aria-labelledby="oth-2502.14279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OrchardDepth: Precise Metric Depth Estimation of Orchard Scene from Monocular Camera Images </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhichao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Williams,+H">Henry Williams</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=MacDonald,+B+A">Bruce A MacDonald</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 5 figures, Australasian Conference on Robotics and Automation, ACRA, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Monocular depth estimation is a fundamental task in robotic perception. Recently, with the development of more accurate and robust neural network models and different types of datasets, monocular depth estimation has significantly improved performance and efficiency. However, most of the research in this area focuses on very concentrated domains. In particular, most of the benchmarks in outdoor scenarios belong to urban environments for the improvement of autonomous driving devices, and these benchmarks have a massive disparity with the orchard/vineyard environment, which is hardly helpful for research in the primary industry. Therefore, we propose OrchardDepth, which fills the gap in the estimation of the metric depth of the monocular camera in the orchard/vineyard environment. In addition, we present a new retraining method to improve the training result by enforcing a consistency regularization between dense depth maps and sparse points. Our method improves the RMSE of depth estimation in the orchard environment from 1.5337 to 0.6738, demonstrating the validity of our method. 
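</p> <p class='mathjax'> The dense-to-sparse consistency term described above can be written as a simple masked regression loss; this is a minimal sketch of that idea (the names and the L1 penalty are assumptions), not the authors' exact regularizer. </p> <pre><code class="language-python">
import torch

def sparse_consistency_loss(pred_dense, sparse_depth, valid_mask):
    """Penalize disagreement between a predicted dense depth map and sparse
    reference depths, only where sparse points exist (valid_mask is 0/1)."""
    diff = (pred_dense - sparse_depth).abs()
    return (diff * valid_mask).sum() / valid_mask.sum().clamp(min=1.0)
</code></pre> <p class='mathjax'>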
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2502.14282" title="Abstract" id="2502.14282"> arXiv:2502.14282 </a> [<a href="/pdf/2502.14282" title="Download PDF" id="pdf-2502.14282" aria-labelledby="pdf-2502.14282">pdf</a>, <a href="https://arxiv.org/html/2502.14282v1" title="View HTML" id="html-2502.14282" aria-labelledby="html-2502.14282" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14282" title="Other formats" id="oth-2502.14282" aria-labelledby="oth-2502.14282">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haowei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haiyang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wanyan,+Y">Yuyang Wanyan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Junyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+M">Ming Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Ji Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+C">Chunfeng Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Changsheng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Weiming Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Fei Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In the field of MLLM-based GUI agents, compared to smartphones, the PC scenario not only features a more complex interactive environment, but also involves more intricate intra- and inter-app workflows. To address these issues, we propose a hierarchical agent framework named PC-Agent. Specifically, from the perception perspective, we devise an Active Perception Module (APM) to overcome the inadequate abilities of current MLLMs in perceiving screenshot content. From the decision-making perspective, to handle complex user instructions and interdependent subtasks more effectively, we propose a hierarchical multi-agent collaboration architecture that decomposes decision-making processes into Instruction-Subtask-Action levels. Within this architecture, three agents (i.e., Manager, Progress and Decision) are set up for instruction decomposition, progress tracking and step-by-step decision-making respectively. Additionally, a Reflection agent is adopted to enable timely bottom-up error feedback and adjustment. We also introduce a new benchmark PC-Eval with 25 real-world complex instructions. Empirical results on PC-Eval show that our PC-Agent achieves a 32% absolute improvement of task success rate over previous state-of-the-art methods. The code will be publicly available. 
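</p> <p class='mathjax'> The Instruction-Subtask-Action hierarchy with a reflection stage can be sketched as a plain control loop. Everything below (the manager, progress, decision, and reflection callables and the env interface) is a hypothetical skeleton meant only to make the decomposition concrete; it is not the released framework. </p> <pre><code class="language-python">
def run_hierarchical_agent(instruction, manager, progress, decision, reflection,
                           env, max_steps=50):
    """Decompose an instruction into subtasks, act step by step, and route
    bottom-up error feedback through a reflection stage."""
    plan = manager(instruction)                        # Instruction -> ordered subtasks
    for subtask in plan:
        done = False
        for _ in range(max_steps):
            screenshot = env.observe()
            action = decision(subtask, screenshot)     # step-by-step decision-making
            outcome = env.execute(action)
            feedback = reflection(subtask, action, outcome)   # error detection
            if feedback is not None:                   # bottom-up adjustment signal
                subtask = feedback                     # e.g. a revised subtask description
                continue
            if progress(subtask, outcome):             # subtask completed, move on
                done = True
                break
        if not done:
            raise RuntimeError("subtask not completed: " + str(subtask))
</code></pre> <p class='mathjax'>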
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2502.14314" title="Abstract" id="2502.14314"> arXiv:2502.14314 </a> [<a href="/pdf/2502.14314" title="Download PDF" id="pdf-2502.14314" aria-labelledby="pdf-2502.14314">pdf</a>, <a href="/format/2502.14314" title="Other formats" id="oth-2502.14314" aria-labelledby="oth-2502.14314">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ODVerse33: Is the New YOLO Version Always Better? A Multi Domain benchmark from YOLO v5 to v11 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+T">Tianyou Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Y">Yang Zhong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 4 figures, 7 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> You Only Look Once (YOLO) models have been widely used for building real-time object detectors across various domains. With the increasing frequency of new YOLO versions being released, key questions arise. Are the newer versions always better than their previous versions? What are the core innovations in each YOLO version and how do these changes translate into real-world performance gains? In this paper, we summarize the key innovations from YOLOv1 to YOLOv11, introduce a comprehensive benchmark called ODverse33, which includes 33 datasets spanning 11 diverse domains (Autonomous driving, Agricultural, Underwater, Medical, Videogame, Industrial, Aerial, Wildlife, Retail, Microscopic, and Security), and explore the practical impact of model improvements in real-world, multi-domain applications through extensive experimental results. We hope this study can provide some guidance to the extensive users of object detection models and give some references for future real-time object detector development. </p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2502.14316" title="Abstract" id="2502.14316"> arXiv:2502.14316 </a> [<a href="/pdf/2502.14316" title="Download PDF" id="pdf-2502.14316" aria-labelledby="pdf-2502.14316">pdf</a>, <a href="https://arxiv.org/html/2502.14316v1" title="View HTML" id="html-2502.14316" aria-labelledby="html-2502.14316" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14316" title="Other formats" id="oth-2502.14316" aria-labelledby="oth-2502.14316">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Textured 3D Regenerative Morphing with 3D Diffusion Prior </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Songlin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+Y">Yushi Lan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Honghua Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+X">Xingang Pan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Textured 3D morphing creates smooth and plausible interpolation sequences between two 3D objects, focusing on transitions in both shape and texture. 
This is important for creative applications like visual effects in filmmaking. Previous methods rely on establishing point-to-point correspondences and determining smooth deformation trajectories, which inherently restrict them to shape-only morphing on untextured, topologically aligned datasets. This restriction leads to labor-intensive preprocessing and poor generalization. To overcome these challenges, we propose a method for 3D regenerative morphing using a 3D diffusion prior. Unlike previous methods that depend on explicit correspondences and deformations, our method eliminates the additional need for obtaining correspondence and uses the 3D diffusion prior to generate morphing. Specifically, we introduce a 3D diffusion model and interpolate the source and target information at three levels: initial noise, model parameters, and condition features. We then explore an Attention Fusion strategy to generate more smooth morphing sequences. To further improve the plausibility of semantic interpolation and the generated 3D surfaces, we propose two strategies: (a) Token Reordering, where we match approximate tokens based on semantic analysis to guide implicit correspondences in the denoising process of the diffusion model, and (b) Low-Frequency Enhancement, where we enhance low-frequency signals in the tokens to improve the quality of generated surfaces. Experimental results show that our method achieves superior smoothness and plausibility in 3D morphing across diverse cross-category object pairs, offering a novel regenerative method for 3D morphing with textured representations. </p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2502.14332" title="Abstract" id="2502.14332"> arXiv:2502.14332 </a> [<a href="/pdf/2502.14332" title="Download PDF" id="pdf-2502.14332" aria-labelledby="pdf-2502.14332">pdf</a>, <a href="https://arxiv.org/html/2502.14332v1" title="View HTML" id="html-2502.14332" aria-labelledby="html-2502.14332" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14332" title="Other formats" id="oth-2502.14332" aria-labelledby="oth-2502.14332">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Collaborative Jade Recognition System for Mobile Devices Based on Lightweight and Large Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhenyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenjia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+P">Pengyu Zhu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> With the widespread adoption and development of mobile devices, vision-based recognition applications have become a hot topic in research. Jade, as an important cultural heritage and artistic item, has significant applications in fields such as jewelry identification and cultural relic preservation. However, existing jade recognition systems still face challenges in mobile implementation, such as limited computing resources, real-time requirements, and accuracy issues. 
To address these challenges, this paper proposes a jade recognition system based on size model collaboration, aiming to achieve efficient and accurate jade identification using mobile devices such as smartphones. First, we design a size model based on multi-scale image processing, extracting key visual information by analyzing jade's dimensions, shapes, and surface textures. Then, a collaborative multi-model classification framework is built by combining deep learning and traditional computer vision algorithms. This framework can effectively select and adjust models based on different jade characteristics, providing high accuracy results across various environments and devices. Experimental results show that the proposed system can provide high recognition accuracy and fast processing time on mobile devices, while consuming relatively low computational resources. The system not only holds great application potential but also provides new ideas and technical support for the intelligent development of jade identification. </p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2502.14344" title="Abstract" id="2502.14344"> arXiv:2502.14344 </a> [<a href="/pdf/2502.14344" title="Download PDF" id="pdf-2502.14344" aria-labelledby="pdf-2502.14344">pdf</a>, <a href="https://arxiv.org/html/2502.14344v1" title="View HTML" id="html-2502.14344" aria-labelledby="html-2502.14344" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14344" title="Other formats" id="oth-2502.14344" aria-labelledby="oth-2502.14344">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Accurate Binary Spiking Neural Networks: Learning with Adaptive Gradient Modulation Mechanism </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yu Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+W">Wenjie Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belatreche,+A">Ammar Belatreche</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+H">Honglin Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zijian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Malu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yang Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 8 figures, AAAI conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Binary Spiking Neural Networks (BSNNs) inherit the event-driven paradigm of SNNs, while also adopting the reduced storage burden of binarization techniques. These distinct advantages grant BSNNs lightweight and energy-efficient characteristics, rendering them ideal for deployment on resource-constrained edge devices. However, due to the binary synaptic weights and non-differentiable spike function, effectively training BSNNs remains an open question. 
In this paper, we conduct an in-depth analysis of the challenge for BSNN learning, namely the frequent weight sign flipping problem. To mitigate this issue, we propose an Adaptive Gradient Modulation Mechanism (AGMM), which is designed to reduce the frequency of weight sign flipping by adaptively adjusting the gradients during the learning process. The proposed AGMM can enable BSNNs to achieve faster convergence speed and higher accuracy, effectively narrowing the gap between BSNNs and their full-precision equivalents. We validate AGMM on both static and neuromorphic datasets, and results indicate that it achieves state-of-the-art results among BSNNs. This work substantially reduces storage demands and enhances SNNs' inherent energy efficiency, making them highly feasible for resource-constrained environments. </p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2502.14351" title="Abstract" id="2502.14351"> arXiv:2502.14351 </a> [<a href="/pdf/2502.14351" title="Download PDF" id="pdf-2502.14351" aria-labelledby="pdf-2502.14351">pdf</a>, <a href="https://arxiv.org/html/2502.14351v1" title="View HTML" id="html-2502.14351" aria-labelledby="html-2502.14351" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14351" title="Other formats" id="oth-2502.14351" aria-labelledby="oth-2502.14351">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SegAnyPET: Universal Promptable Segmentation from Positron Emission Tomography Images </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yichi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+L">Le Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenbo Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lanlan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuchen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+C">Chen Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+Y">Yuan Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Y">Yuan Qi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Positron Emission Tomography (PET) imaging plays a crucial role in modern medical diagnostics by revealing the metabolic processes within a patient's body, which is essential for quantification of therapy response and monitoring treatment progress. However, the segmentation of PET images presents unique challenges due to their lower contrast and less distinct boundaries compared to other structural medical modalities. Recent developments in segmentation foundation models have shown superior versatility across diverse natural image segmentation tasks. Despite the efforts of medical adaptations, these works primarily focus on structural medical images with detailed physiological structural information and exhibit poor generalization ability when adapted to molecular PET imaging. In this paper, we collect and construct PETS-5k, the largest PET segmentation dataset to date, comprising 5,731 three-dimensional whole-body PET images and encompassing over 1.3M 2D images. 
Based on the established dataset, we develop SegAnyPET, a modality-specific 3D foundation model for universal promptable segmentation from PET images. To address the challenge of discrepant annotation quality of PET images, we adopt a cross prompting confident learning (CPCL) strategy with an uncertainty-guided self-rectification process to robustly learn segmentation from high-quality labeled data and low-quality noisy labeled data. Experimental results demonstrate that SegAnyPET can correctly segment seen and unseen targets using only one or a few prompt points, outperforming state-of-the-art foundation models and task-specific fully supervised models with higher accuracy and strong generalization ability for universal segmentation. As the first foundation model for PET images, we believe that SegAnyPET will advance applications to various downstream tasks in molecular imaging. </p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2502.14355" title="Abstract" id="2502.14355"> arXiv:2502.14355 </a> [<a href="/pdf/2502.14355" title="Download PDF" id="pdf-2502.14355" aria-labelledby="pdf-2502.14355">pdf</a>, <a href="https://arxiv.org/html/2502.14355v1" title="View HTML" id="html-2502.14355" aria-labelledby="html-2502.14355" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14355" title="Other formats" id="oth-2502.14355" aria-labelledby="oth-2502.14355">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Triply Laplacian Scale Mixture Modeling for Seismic Data Noise Suppression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+S">Sirui Pan</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Zha,+Z">Zhiyuan Zha</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shigang Wang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yue Li</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Z">Zipei Fan</a> (2), <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gang Yan</a> (3), <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+B+T">Binh T. Nguyen</a> (4), <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+B">Bihan Wen</a> (5), <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Ce Zhu</a> (6) ((1) College of Communication Engineering, Jilin University, (2) School of Artificial Intelligence, Jilin University, (3) College of Computer Science and Technology, Jilin University, (4) Department of Computer Science, Faculty of Mathematics and Computer Science, University of Science, Vietnam National University, (5) School of Electrical and Electronic Engineering, Nanyang Technological University, (6) Glasgow College, University of Electronic Science and Technology of China)</div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Sparsity-based tensor recovery methods have shown great potential in suppressing seismic data noise. These methods exploit tensor sparsity measures capturing the low-dimensional structures inherent in seismic data tensors to remove noise by applying sparsity constraints through soft-thresholding or hard-thresholding operators. 
However, in these methods, considering that real seismic data are non-stationary and affected by noise, the variances of tensor coefficients are unknown and may be difficult to accurately estimate from the degraded seismic data, leading to undesirable noise suppression performance. In this paper, we propose a novel triply Laplacian scale mixture (TLSM) approach for seismic data noise suppression, which significantly improves the estimation accuracy of both the sparse tensor coefficients and hidden scalar parameters. To make the optimization problem manageable, an alternating direction method of multipliers (ADMM) algorithm is employed to solve the proposed TLSM-based seismic data noise suppression problem. Extensive experimental results on synthetic and field seismic data demonstrate that the proposed TLSM algorithm outperforms many state-of-the-art seismic data noise suppression methods in both quantitative and qualitative evaluations while providing exceptional computational efficiency. </p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2502.14360" title="Abstract" id="2502.14360"> arXiv:2502.14360 </a> [<a href="/pdf/2502.14360" title="Download PDF" id="pdf-2502.14360" aria-labelledby="pdf-2502.14360">pdf</a>, <a href="/format/2502.14360" title="Other formats" id="oth-2502.14360" aria-labelledby="oth-2502.14360">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Weed Detection using Convolutional Neural Network </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tripathi,+S+K">Santosh Kumar Tripathi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+S+P">Shivendra Pratap Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+D">Devansh Sharma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patekar,+H+U">Harshavardhan U Patekar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In this paper, we use convolutional neural networks (CNNs) for weed detection in agricultural land. We specifically investigate the application of two CNN layer types, Conv2d and dilated Conv2d, for weed detection in crop fields. The suggested method extracts features from the input photos using pre-trained models, which are subsequently adjusted for weed detection. The experiments used a sizable dataset of 15,336 segments: 3,249 of soil, 7,376 of soybean, 3,520 of grass, and 1,191 of broadleaf weeds. The findings show that the suggested approach can accurately and reliably detect weeds with an accuracy of 94%. This study has significant ramifications for lowering the usage of toxic herbicides and increasing the effectiveness of weed management in agriculture. 
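</p> <p class='mathjax'> As a concrete picture of the Conv2d versus dilated-Conv2d comparison described above, here is a minimal four-class classifier mixing standard and dilated 3x3 convolutions; it is an illustrative sketch under assumed layer sizes, not the authors' network. </p> <pre><code class="language-python">
import torch.nn as nn

class WeedNet(nn.Module):
    """Tiny 4-class classifier (soil / soybean / grass / broadleaf weeds)
    mixing standard and dilated 3x3 convolutions."""
    def __init__(self, num_classes=4):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=2, dilation=2), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=4, dilation=4), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):                          # x: (B, 3, H, W)
        return self.classifier(self.features(x).flatten(1))
</code></pre> <p class='mathjax'>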
</p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2502.14373" title="Abstract" id="2502.14373"> arXiv:2502.14373 </a> [<a href="/pdf/2502.14373" title="Download PDF" id="pdf-2502.14373" aria-labelledby="pdf-2502.14373">pdf</a>, <a href="https://arxiv.org/html/2502.14373v1" title="View HTML" id="html-2502.14373" aria-labelledby="html-2502.14373" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14373" title="Other formats" id="oth-2502.14373" aria-labelledby="oth-2502.14373">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CrossVTON: Mimicking the Logic Reasoning on Cross-category Virtual Try-on guided by Tri-zone Priors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+D">Donghao Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yujie Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+X">Xu Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xiaobin Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+B">Boyuan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chengming Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+T">Taisong Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chengjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Y">Yanwei Fu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Despite remarkable progress in image-based virtual try-on systems, generating realistic and robust fitting images for cross-category virtual try-on remains a challenging task. The primary difficulty arises from the absence of human-like reasoning, which involves addressing size mismatches between garments and models while recognizing and leveraging the distinct functionalities of various regions within the model images. To address this issue, we draw inspiration from human cognitive processes and disentangle the complex reasoning required for cross-category try-on into a structured framework. This framework systematically decomposes the model image into three distinct regions: try-on, reconstruction, and imagination zones. Each zone plays a specific role in accommodating the garment and facilitating realistic synthesis. To endow the model with robust reasoning capabilities for cross-category scenarios, we propose an iterative data constructor. This constructor encompasses diverse scenarios, including intra-category try-on, any-to-dress transformations (replacing any garment category with a dress), and dress-to-any transformations (replacing a dress with another garment category). Utilizing the generated dataset, we introduce a tri-zone priors generator that intelligently predicts the try-on, reconstruction, and imagination zones by analyzing how the input garment is expected to align with the model image. Guided by these tri-zone priors, our proposed method, CrossVTON, achieves state-of-the-art performance, surpassing existing baselines in both qualitative and quantitative evaluations. Notably, it demonstrates superior capability in handling cross-category virtual try-on, meeting the complex demands of real-world applications. 
</p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2502.14377" title="Abstract" id="2502.14377"> arXiv:2502.14377 </a> [<a href="/pdf/2502.14377" title="Download PDF" id="pdf-2502.14377" aria-labelledby="pdf-2502.14377">pdf</a>, <a href="https://arxiv.org/html/2502.14377v1" title="View HTML" id="html-2502.14377" aria-labelledby="html-2502.14377" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14377" title="Other formats" id="oth-2502.14377" aria-labelledby="oth-2502.14377">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RelaCtrl: Relevance-Guided Efficient Control for Diffusion Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+K">Ke Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+A">Ao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jiasong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhanjie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+X">Xuanhua He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shanyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+B">Bo Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leng,+D">Dawei Leng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yuhui Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jie Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The Diffusion Transformer plays a pivotal role in advancing text-to-image and text-to-video generation, owing primarily to its inherent scalability. However, existing controlled diffusion transformer methods incur significant parameter and computational overheads and suffer from inefficient resource allocation due to their failure to account for the varying relevance of control information across different transformer layers. To address this, we propose the Relevance-Guided Efficient Controllable Generation framework, RelaCtrl, enabling efficient and resource-optimized integration of control signals into the Diffusion Transformer. First, we evaluate the relevance of each layer in the Diffusion Transformer to the control information by assessing the "ControlNet Relevance Score"-i.e., the impact of skipping each control layer on both the quality of generation and the control effectiveness during inference. Based on the strength of the relevance, we then tailor the positioning, parameter scale, and modeling capacity of the control layers to reduce unnecessary parameters and redundant computations. Additionally, to further improve efficiency, we replace the self-attention and FFN in the commonly used copy block with the carefully designed Two-Dimensional Shuffle Mixer (TDSM), enabling efficient implementation of both the token mixer and channel mixer. Both qualitative and quantitative experimental results demonstrate that our approach achieves superior performance with only 15% of the parameters and computational complexity compared to PixArt-delta. 
More examples are available at <a href="https://relactrl.github.io/RelaCtrl/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2502.14397" title="Abstract" id="2502.14397"> arXiv:2502.14397 </a> [<a href="/pdf/2502.14397" title="Download PDF" id="pdf-2502.14397" aria-labelledby="pdf-2502.14397">pdf</a>, <a href="https://arxiv.org/html/2502.14397v1" title="View HTML" id="html-2502.14397" aria-labelledby="html-2502.14397" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14397" title="Other formats" id="oth-2502.14397" aria-labelledby="oth-2502.14397">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PhotoDoodle: Learning Artistic Image Editing from Few-Shot Pairwise Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shijie Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yiren Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuxuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+H">Hailong Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xueyin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shou,+M+Z">Mike Zheng Shou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaming Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We introduce PhotoDoodle, a novel image editing framework designed to facilitate photo doodling by enabling artists to overlay decorative elements onto photographs. Photo doodling is challenging because the inserted elements must appear seamlessly integrated with the background, requiring realistic blending, perspective alignment, and contextual coherence. Additionally, the background must be preserved without distortion, and the artist's unique style must be captured efficiently from limited training data. These requirements are not addressed by previous methods that primarily focus on global style transfer or regional inpainting. The proposed method, PhotoDoodle, employs a two-stage training strategy. Initially, we train a general-purpose image editing model, OmniEditor, using large-scale data. Subsequently, we fine-tune this model with EditLoRA using a small, artist-curated dataset of before-and-after image pairs to capture distinct editing styles and techniques. To enhance consistency in the generated results, we introduce a positional encoding reuse mechanism. Additionally, we release a PhotoDoodle dataset featuring six high-quality styles. Extensive experiments demonstrate the advanced performance and robustness of our method in customized image editing, opening new possibilities for artistic creation. 
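</p> <p class='mathjax'> EditLoRA-style few-shot adaptation builds on the standard LoRA idea of adding a trainable low-rank update to frozen pretrained weights. The sketch below shows that generic mechanism in PyTorch; the layer names and hyperparameters are assumptions, not the paper's configuration. </p> <pre><code class="language-python">
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wrap a frozen nn.Linear with a trainable low-rank residual update."""
    def __init__(self, base, rank=8, alpha=16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False               # keep the pretrained weights frozen
        self.down = nn.Linear(base.in_features, rank, bias=False)
        self.up = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.up.weight)            # start as a no-op update
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * self.up(self.down(x))
</code></pre> <p class='mathjax'>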
</p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2502.14412" title="Abstract" id="2502.14412"> arXiv:2502.14412 </a> [<a href="/pdf/2502.14412" title="Download PDF" id="pdf-2502.14412" aria-labelledby="pdf-2502.14412">pdf</a>, <a href="https://arxiv.org/html/2502.14412v1" title="View HTML" id="html-2502.14412" aria-labelledby="html-2502.14412" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14412" title="Other formats" id="oth-2502.14412" aria-labelledby="oth-2502.14412">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating Precise Geolocation Inference Capabilities of Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jay,+N">Neel Jay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+H+M">Hieu Minh Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hoang,+T+D">Trung Dung Hoang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haimes,+J">Jacob Haimes</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> AAAI 2025 Workshop DATASAFE </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Cryptography and Security (cs.CR); Machine Learning (cs.LG) </div> <p class='mathjax'> The prevalence of Vision-Language Models (VLMs) raises important questions about privacy in an era where visual information is increasingly available. While foundation VLMs demonstrate broad knowledge and learned capabilities, we specifically investigate their ability to infer geographic location from previously unseen image data. This paper introduces a benchmark dataset collected from Google Street View that represents its global distribution of coverage. Foundation models are evaluated on single-image geolocation inference, with many achieving median distance errors of <300 km. We further evaluate VLM "agents" with access to supplemental tools, observing up to a 30.6% decrease in distance error. Our findings establish that modern foundation VLMs can act as powerful image geolocation tools, without being specifically trained for this task. When coupled with increasing accessibility of these models, our findings have greater implications for online privacy. We discuss these risks, as well as future work in this area. 
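As a minimal sketch of how the sub-300 km median distance errors reported above can be computed from predicted and ground-truth coordinates (the haversine formula is standard; the sample coordinates are placeholders):
<pre><code>
import math
from statistics import median

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two (lat, lon) points in degrees."""
    r = 6371.0  # mean Earth radius in km
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp, dl = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

# Placeholder predictions and ground truth as (lat, lon) pairs.
preds = [(48.85, 2.35), (35.68, 139.69), (40.71, -74.01)]
truth = [(48.20, 2.90), (34.05, 139.00), (41.00, -75.00)]
errors = [haversine_km(*p, *t) for p, t in zip(preds, truth)]
print(f"median distance error: {median(errors):.1f} km")
</code></pre>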
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2502.14433" title="Abstract" id="2502.14433"> arXiv:2502.14433 </a> [<a href="/pdf/2502.14433" title="Download PDF" id="pdf-2502.14433" aria-labelledby="pdf-2502.14433">pdf</a>, <a href="https://arxiv.org/html/2502.14433v1" title="View HTML" id="html-2502.14433" aria-labelledby="html-2502.14433" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14433" title="Other formats" id="oth-2502.14433" aria-labelledby="oth-2502.14433">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Daily Land Surface Temperature Reconstruction in Landsat Cross-Track Areas Using Deep Ensemble Learning With Uncertainty Quantification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shengjie Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Siqin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lu Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Many real-world applications rely on land surface temperature (LST) data at high spatiotemporal resolution. In complex urban areas, LST exhibits significant variations, fluctuating dramatically within and across city blocks. Landsat provides high spatial resolution data at 100 meters but is limited by long revisit time, with cloud cover further disrupting data collection. Here, we propose DELAG, a deep ensemble learning method that integrates annual temperature cycles and Gaussian processes, to reconstruct Landsat LST in complex urban areas. Leveraging the cross-track characteristics and dual-satellite operation of Landsat since 2021, we further enhance data availability to 4 scenes every 16 days. We select New York City, London and Hong Kong from three different continents as study areas. Experiments show that DELAG successfully reconstructed LST in the three cities under clear-sky (RMSE = 0.73-0.96 K) and heavily-cloudy (RMSE = 0.84-1.62 K) situations, superior to existing methods. Additionally, DELAG can quantify uncertainty that enhances LST reconstruction reliability. We further tested the reconstructed LST to estimate near-surface air temperature, achieving results (RMSE = 1.48-2.11 K) comparable to those derived from clear-sky LST (RMSE = 1.63-2.02 K). The results demonstrate the successful reconstruction through DELAG and highlight the broader applications of LST reconstruction for estimating accurate air temperature. Our study thus provides a novel and practical method for Landsat LST reconstruction, particularly suited for complex urban areas within Landsat cross-track areas, taking one step toward addressing complex climate events at high spatiotemporal resolution. 
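As a minimal sketch of the two ingredients named above, an annual temperature cycle model plus a Gaussian process over its residuals (the functional form, kernels, and synthetic data are illustrative only, not DELAG itself):
<pre><code>
import numpy as np
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

def atc(day, mean_t, amp, phase):
    """Annual temperature cycle: a sinusoid over day of year."""
    return mean_t + amp * np.sin(2 * np.pi * day / 365.25 + phase)

rng = np.random.default_rng(0)
days = np.sort(rng.choice(365, size=40, replace=False)).astype(float)  # clear-sky acquisitions
lst = atc(days, 295.0, 12.0, -1.6) + rng.normal(0, 1.0, days.size)     # synthetic LST in Kelvin

params, _ = curve_fit(atc, days, lst, p0=[290.0, 10.0, 0.0])           # fit the ATC
residuals = lst - atc(days, *params)

gp = GaussianProcessRegressor(kernel=RBF(30.0) + WhiteKernel(1.0))     # model residual structure
gp.fit(days[:, None], residuals)

query = np.arange(365.0)
recon = atc(query, *params) + gp.predict(query[:, None])               # reconstructed daily LST
print(recon.shape)  # (365,)
</code></pre>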
</p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2502.14442" title="Abstract" id="2502.14442"> arXiv:2502.14442 </a> [<a href="/pdf/2502.14442" title="Download PDF" id="pdf-2502.14442" aria-labelledby="pdf-2502.14442">pdf</a>, <a href="https://arxiv.org/html/2502.14442v1" title="View HTML" id="html-2502.14442" aria-labelledby="html-2502.14442" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14442" title="Other formats" id="oth-2502.14442" aria-labelledby="oth-2502.14442">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stochastic Resonance Improves the Detection of Low Contrast Images in Deep Learning Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ludwig,+S">Siegfried Ludwig</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> MSc Course Project </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Stochastic resonance describes the utility of noise in improving the detectability of weak signals in certain types of systems. It has been observed widely in natural and engineered settings, but its utility in image classification with rate-based neural networks has not been studied extensively. In this analysis a simple LSTM recurrent neural network is trained for digit recognition and classification. During the test phase, image contrast is reduced to a point where the model fails to recognize the presence of a stimulus. Controlled noise is added to partially recover classification performance. The results indicate the presence of stochastic resonance in rate-based recurrent neural networks. </p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2502.14454" title="Abstract" id="2502.14454"> arXiv:2502.14454 </a> [<a href="/pdf/2502.14454" title="Download PDF" id="pdf-2502.14454" aria-labelledby="pdf-2502.14454">pdf</a>, <a href="https://arxiv.org/html/2502.14454v1" title="View HTML" id="html-2502.14454" aria-labelledby="html-2502.14454" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14454" title="Other formats" id="oth-2502.14454" aria-labelledby="oth-2502.14454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploiting Deblurring Networks for Radiance Fields </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+H">Haeyun Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Heemin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Janghyeok Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+S">Sunghyun Cho</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In this paper, we propose DeepDeblurRF, a novel radiance field deblurring approach that can synthesize high-quality novel views from blurred training views with significantly reduced training time. DeepDeblurRF leverages deep neural network (DNN)-based deblurring modules to enjoy their deblurring performance and computational efficiency. 
To effectively combine DNN-based deblurring and radiance field construction, we propose a novel radiance field (RF)-guided deblurring and an iterative framework that performs RF-guided deblurring and radiance field construction in an alternating manner. Moreover, DeepDeblurRF is compatible with various scene representations, such as voxel grids and 3D Gaussians, expanding its applicability. We also present BlurRF-Synth, the first large-scale synthetic dataset for training radiance field deblurring frameworks. We conduct extensive experiments on both camera motion blur and defocus blur, demonstrating that DeepDeblurRF achieves state-of-the-art novel-view synthesis quality with significantly reduced training time. </p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2502.14471" title="Abstract" id="2502.14471"> arXiv:2502.14471 </a> [<a href="/pdf/2502.14471" title="Download PDF" id="pdf-2502.14471" aria-labelledby="pdf-2502.14471">pdf</a>, <a href="https://arxiv.org/html/2502.14471v1" title="View HTML" id="html-2502.14471" aria-labelledby="html-2502.14471" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14471" title="Other formats" id="oth-2502.14471" aria-labelledby="oth-2502.14471">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Integrating Extra Modality Helps Segmentor Find Camouflaged Objects Well </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+C">Chengyu Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Chunming He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+L">Longxiang Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Chenyang Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yuqi Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chubin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+G">Guoxia Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiu Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 5 figures, 6 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Camouflaged Object Segmentation (COS) remains a challenging problem due to the subtle visual differences between camouflaged objects and backgrounds. Owing to the exceedingly limited visual cues available from the visible spectrum, previous RGB single-modality approaches often struggle to achieve satisfactory results, prompting the exploration of multimodal data to enhance detection accuracy. In this work, we present UniCOS, a novel framework that effectively leverages diverse data modalities to improve segmentation performance. UniCOS comprises two key components: a multimodal segmentor, UniSEG, and a cross-modal knowledge learning module, UniLearner. UniSEG employs a state space fusion mechanism to integrate cross-modal features within a unified state space, enhancing contextual understanding and improving robustness to the integration of heterogeneous data. Additionally, it includes a fusion-feedback mechanism that facilitates feature extraction. 
UniLearner exploits multimodal data unrelated to the COS task to improve the segmentation ability of the COS models by generating pseudo-modal content and cross-modal semantic associations. Extensive experiments demonstrate that UniSEG outperforms existing Multimodal COS (MCOS) segmentors, regardless of whether real or pseudo-multimodal COS data is available. Moreover, in scenarios where multimodal COS data is unavailable but multimodal non-COS data is accessible, UniLearner effectively exploits these data to enhance segmentation performance. Our code will be made publicly available on <a href="https://github.com/cnyvfang/UniCOS" rel="external noopener nofollow" class="link-external link-https">GitHub</a>. </p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2502.14493" title="Abstract" id="2502.14493"> arXiv:2502.14493 </a> [<a href="/pdf/2502.14493" title="Download PDF" id="pdf-2502.14493" aria-labelledby="pdf-2502.14493">pdf</a>, <a href="https://arxiv.org/html/2502.14493v1" title="View HTML" id="html-2502.14493" aria-labelledby="html-2502.14493" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14493" title="Other formats" id="oth-2502.14493" aria-labelledby="oth-2502.14493">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CrossFuse: Learning Infrared and Visible Image Fusion by Cross-Sensor Top-K Vision Alignment and Beyond </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yukai Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+C">Cidan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+Z">Zhipeng Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yin Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xian,+X">Xiaoyu Xian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+L">Liang Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> IEEE T-CSVT. We mainly discuss the out-of-distribution challenges in infrared and visible image fusion </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Infrared and visible image fusion (IVIF) is increasingly applied in critical fields such as video surveillance and autonomous driving systems. Significant progress has been made in deep learning-based fusion methods. However, these models frequently encounter out-of-distribution (OOD) scenes in real-world applications, which severely impact their performance and reliability. Therefore, addressing the challenge of OOD data is crucial for the safe deployment of these models in open-world environments. Unlike existing research, our focus is on the challenges posed by OOD data in real-world applications and on enhancing the robustness and generalization of models. In this paper, we propose an infrared-visible fusion framework based on Multi-View Augmentation. For external data augmentation, Top-k Selective Vision Alignment is employed to mitigate distribution shifts between datasets by performing RGB-wise transformations on visible images. This strategy effectively introduces augmented samples, enhancing the adaptability of the model to complex real-world scenarios. 
Additionally, for internal data augmentation, self-supervised learning is established using Weak-Aggressive Augmentation. This enables the model to learn more robust and general feature representations during the fusion process, thereby improving robustness and generalization. Extensive experiments demonstrate that the proposed method exhibits superior performance and robustness across various conditions and environments. Our approach significantly enhances the reliability and stability of IVIF tasks in practical applications. </p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2502.14495" title="Abstract" id="2502.14495"> arXiv:2502.14495 </a> [<a href="/pdf/2502.14495" title="Download PDF" id="pdf-2502.14495" aria-labelledby="pdf-2502.14495">pdf</a>, <a href="https://arxiv.org/html/2502.14495v1" title="View HTML" id="html-2502.14495" aria-labelledby="html-2502.14495" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14495" title="Other formats" id="oth-2502.14495" aria-labelledby="oth-2502.14495">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Nearshore Underwater Target Detection Meets UAV-borne Hyperspectral Remote Sensing: A Novel Hybrid-level Contrastive Learning Framework and Benchmark Dataset </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+J">Jiahao Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+C">Chuanhong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xingyue Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+D">Dehui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bin,+K">Kangcheng Bin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+P">Ping Zhong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> UAV-borne hyperspectral remote sensing has emerged as a promising approach for underwater target detection (UTD). However, its effectiveness is hindered by spectral distortions in nearshore environments, which compromise the accuracy of traditional hyperspectral UTD (HUTD) methods that rely on bathymetric models. These distortions lead to significant uncertainty in target and background spectra, challenging the detection process. To address this, we propose the Hyperspectral Underwater Contrastive Learning Network (HUCLNet), a novel framework that integrates contrastive learning with a self-paced learning paradigm for robust HUTD in nearshore regions. HUCLNet extracts discriminative features from distorted hyperspectral data through contrastive learning, while the self-paced learning strategy selectively prioritizes the most informative samples. Additionally, a reliability-guided clustering strategy enhances the robustness of learned representations. To evaluate the method's effectiveness, we construct a novel nearshore HUTD benchmark dataset, ATR2-HUTD, covering three diverse scenarios with varying water types, turbidity, and target types. 
Extensive experiments demonstrate that HUCLNet significantly outperforms state-of-the-art methods. The dataset and code will be publicly available at: <a href="https://github.com/qjh1996/HUTD" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2502.14503" title="Abstract" id="2502.14503"> arXiv:2502.14503 </a> [<a href="/pdf/2502.14503" title="Download PDF" id="pdf-2502.14503" aria-labelledby="pdf-2502.14503">pdf</a>, <a href="https://arxiv.org/html/2502.14503v1" title="View HTML" id="html-2502.14503" aria-labelledby="html-2502.14503" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14503" title="Other formats" id="oth-2502.14503" aria-labelledby="oth-2502.14503">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LXLv2: Enhanced LiDAR Excluded Lean 3D Object Detection with Fusion of 4D Radar and Camera </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+W">Weiyi Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Z">Zean Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Q">Qiuchi Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+F">Fengchun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+B">Bing Zhu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by IEEE Robotics and Automation Letters </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> As the previous state-of-the-art 4D radar-camera fusion-based 3D object detection method, LXL utilizes the predicted image depth distribution maps and radar 3D occupancy grids to assist the sampling-based image view transformation. However, the depth prediction lacks accuracy and consistency, and the concatenation-based fusion in LXL impedes the model robustness. In this work, we propose LXLv2, where modifications are made to overcome the limitations and improve the performance. Specifically, considering the position error in radar measurements, we devise a one-to-many depth supervision strategy via radar points, where the radar cross section (RCS) value is further exploited to adjust the supervision area for object-level depth consistency. Additionally, a channel and spatial attention-based fusion module named CSAFusion is introduced to improve feature adaptiveness. Experimental results on the View-of-Delft and TJ4DRadSet datasets show that the proposed LXLv2 can outperform LXL in detection accuracy, inference speed and robustness, demonstrating the effectiveness of the model. 
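As a rough illustration of the general idea of one-to-many radar depth supervision with an RCS-dependent supervision area (the projection, radius rule, and loss below are assumptions made for illustration, not the paper's formulation):
<pre><code>
import torch

def radar_depth_supervision(pred_depth, radar_uv, radar_depth, radar_rcs,
                            base_radius=2, rcs_scale=1.0):
    """L1 depth loss where each radar point supervises a small image neighbourhood
    whose radius grows with the point's radar cross section (RCS).

    pred_depth:  (H, W)  predicted depth map
    radar_uv:    (N, 2)  integer pixel coordinates of projected radar points
    radar_depth: (N,)    radar range measurements
    radar_rcs:   (N,)    RCS values used to scale the supervised area
    """
    h, w = pred_depth.shape
    loss, count = pred_depth.new_zeros(()), 0
    for (u, v), d, rcs in zip(radar_uv.tolist(), radar_depth.tolist(), radar_rcs.tolist()):
        r = int(base_radius + rcs_scale * max(rcs, 0.0))   # larger targets supervise more pixels
        u0, u1 = max(u - r, 0), min(u + r + 1, w)
        v0, v1 = max(v - r, 0), min(v + r + 1, h)
        patch = pred_depth[v0:v1, u0:u1]
        loss = loss + (patch - d).abs().mean()
        count += 1
    return loss / max(count, 1)

pred = torch.rand(128, 256) * 50.0
uv = torch.tensor([[30, 64], [200, 90]])
depth = torch.tensor([12.5, 31.0])
rcs = torch.tensor([0.5, 3.0])
print(radar_depth_supervision(pred, uv, depth, rcs).item())
</code></pre>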
</p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2502.14504" title="Abstract" id="2502.14504"> arXiv:2502.14504 </a> [<a href="/pdf/2502.14504" title="Download PDF" id="pdf-2502.14504" aria-labelledby="pdf-2502.14504">pdf</a>, <a href="https://arxiv.org/html/2502.14504v1" title="View HTML" id="html-2502.14504" aria-labelledby="html-2502.14504" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14504" title="Other formats" id="oth-2502.14504" aria-labelledby="oth-2502.14504">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PLPHP: Per-Layer Per-Head Vision Token Pruning for Efficient Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Y">Yu Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaiyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chenran Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chen Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xinlei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaoping Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities across a range of multimodal tasks. However, their inference efficiency is constrained by the large number of visual tokens processed during decoding. To address this challenge, we propose Per-Layer Per-Head Vision Token Pruning (PLPHP), a two-level fine-grained pruning method including Layer-Level Retention Rate Allocation and Head-Level Vision Token Pruning. Motivated by the Vision Token Re-attention phenomenon across decoder layers, we dynamically adjust token retention rates layer by layer. Layers that exhibit stronger attention to visual information preserve more vision tokens, while layers with lower vision attention are aggressively pruned. Furthermore, PLPHP applies pruning at the attention head level, enabling different heads within the same layer to independently retain critical context. Experiments on multiple benchmarks demonstrate that PLPHP delivers an 18% faster decoding speed and reduces the Key-Value Cache (KV Cache) size by over 50%, all at the cost of 0.46% average performance drop, while also achieving notable performance improvements in multi-image tasks. These results highlight the effectiveness of fine-grained token pruning and contribute to advancing the efficiency and scalability of LVLMs. Our source code will be made publicly available. 
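As a minimal sketch of head-level vision-token pruning under a per-layer retention rate, assuming per-head attention scores over the vision tokens are already available (shapes and the retention values below are illustrative, not the paper's exact procedure):
<pre><code>
import torch

def prune_vision_tokens(kv, attn, retention_rate):
    """Keep, per attention head, the top-k vision-token key/value pairs by attention mass.

    kv:   (heads, num_vision_tokens, dim)  cached keys or values for vision tokens
    attn: (heads, num_vision_tokens)       attention each head pays to each vision token
    """
    heads, n, dim = kv.shape
    k = max(1, int(n * retention_rate))
    idx = attn.topk(k, dim=-1).indices                     # per-head token selection
    return torch.gather(kv, 1, idx.unsqueeze(-1).expand(heads, k, dim))

heads, n, dim = 8, 576, 64
kv = torch.randn(heads, n, dim)
attn = torch.rand(heads, n)
# Layers that attend strongly to vision tokens keep more of them, others keep fewer.
print(prune_vision_tokens(kv, attn, 0.8).shape)  # torch.Size([8, 460, 64])
print(prune_vision_tokens(kv, attn, 0.3).shape)  # torch.Size([8, 172, 64])
</code></pre>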
</p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2502.14520" title="Abstract" id="2502.14520"> arXiv:2502.14520 </a> [<a href="/pdf/2502.14520" title="Download PDF" id="pdf-2502.14520" aria-labelledby="pdf-2502.14520">pdf</a>, <a href="https://arxiv.org/html/2502.14520v1" title="View HTML" id="html-2502.14520" aria-labelledby="html-2502.14520" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14520" title="Other formats" id="oth-2502.14520" aria-labelledby="oth-2502.14520">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning Temporal 3D Semantic Scene Completion via Optical Flow Guidance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Meng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruihui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yunchuan Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Z">Zhuo Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kenli Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> 3D Semantic Scene Completion (SSC) provides comprehensive scene geometry and semantics for autonomous driving perception, which is crucial for enabling accurate and reliable decision-making. However, existing SSC methods are limited to capturing sparse information from the current frame or naively stacking multi-frame temporal features, thereby failing to acquire effective scene context. These approaches ignore critical motion dynamics and struggle to achieve temporal consistency. To address the above challenges, we propose a novel temporal SSC method FlowScene: Learning Temporal 3D Semantic Scene Completion via Optical Flow Guidance. By leveraging optical flow, FlowScene can integrate motion, different viewpoints, occlusions, and other contextual cues, thereby significantly improving the accuracy of 3D scene completion. Specifically, our framework introduces two key components: (1) a Flow-Guided Temporal Aggregation module that aligns and aggregates temporal features using optical flow, capturing motion-aware context and deformable structures; and (2) an Occlusion-Guided Voxel Refinement module that injects occlusion masks and temporally aggregated features into 3D voxel space, adaptively refining voxel representations for explicit geometric modeling. Experimental results demonstrate that FlowScene achieves state-of-the-art performance on the SemanticKITTI and SSCBench-KITTI-360 benchmarks. 
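As a minimal sketch of the basic operation behind flow-guided temporal aggregation, warping a previous frame's feature map with an optical-flow field via bilinear sampling (the tensors here are random placeholders; FlowScene's aggregation module is more involved):
<pre><code>
import torch
import torch.nn.functional as F

def warp_with_flow(feat_prev, flow):
    """Warp features of the previous frame toward the current frame.

    feat_prev: (B, C, H, W) features from frame t-1
    flow:      (B, 2, H, W) per-pixel displacement in pixels, (dx, dy)
    """
    b, _, h, w = feat_prev.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    grid = torch.stack((xs, ys), dim=0).float().unsqueeze(0).expand(b, -1, -1, -1)
    tgt = grid + flow                                   # sampling locations in the source frame
    tgt_x = 2.0 * tgt[:, 0] / (w - 1) - 1.0             # normalise to [-1, 1] for grid_sample
    tgt_y = 2.0 * tgt[:, 1] / (h - 1) - 1.0
    sample_grid = torch.stack((tgt_x, tgt_y), dim=-1)   # (B, H, W, 2)
    return F.grid_sample(feat_prev, sample_grid, mode="bilinear", align_corners=True)

feat_prev = torch.randn(1, 64, 32, 96)
flow = torch.randn(1, 2, 32, 96)
print(warp_with_flow(feat_prev, flow).shape)  # torch.Size([1, 64, 32, 96])
</code></pre>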
</p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2502.14573" title="Abstract" id="2502.14573"> arXiv:2502.14573 </a> [<a href="/pdf/2502.14573" title="Download PDF" id="pdf-2502.14573" aria-labelledby="pdf-2502.14573">pdf</a>, <a href="https://arxiv.org/html/2502.14573v1" title="View HTML" id="html-2502.14573" aria-labelledby="html-2502.14573" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14573" title="Other formats" id="oth-2502.14573" aria-labelledby="oth-2502.14573">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-supervised Monocular Depth Estimation Robust to Reflective Surface Leveraged by Triplet Mining </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+W">Wonhyeok Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+K">Kyumin Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+W">Wei Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+M">Minwoo Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Im,+S">Sunghoon Im</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Self-supervised monocular depth estimation (SSMDE) aims to predict the dense depth map of a monocular image, by learning depth from RGB image sequences, eliminating the need for ground-truth depth labels. Although this approach simplifies data acquisition compared to supervised methods, it struggles with reflective surfaces, as they violate the assumptions of Lambertian reflectance, leading to inaccurate training on such surfaces. To tackle this problem, we propose a novel training strategy for an SSMDE by leveraging triplet mining to pinpoint reflective regions at the pixel level, guided by the camera geometry between different viewpoints. The proposed reflection-aware triplet mining loss specifically penalizes the inappropriate photometric error minimization on the localized reflective regions while preserving depth accuracy in non-reflective areas. We also incorporate a reflection-aware knowledge distillation method that enables a student model to selectively learn the pixel-level knowledge from reflective and non-reflective regions. This results in robust depth estimation across areas. Evaluation results on multiple datasets demonstrate that our method effectively enhances depth quality on reflective surfaces and outperforms state-of-the-art SSMDE baselines. 
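As a simplified stand-in for the idea of suppressing photometric supervision on reflective pixels (a plain masked L1 loss rather than the paper's reflection-aware triplet mining loss; the mask here is a random placeholder for a cross-view reflection estimate):
<pre><code>
import torch

def masked_photometric_loss(pred, target, reflection_mask):
    """L1 photometric error averaged over pixels not flagged as reflective.

    pred, target:    (B, 3, H, W) synthesized and observed images
    reflection_mask: (B, 1, H, W) 1.0 where a pixel is judged reflective, else 0.0
    """
    l1 = (pred - target).abs().mean(dim=1, keepdim=True)   # per-pixel photometric error
    valid = 1.0 - reflection_mask                           # keep only non-reflective pixels
    return (l1 * valid).sum() / valid.sum().clamp(min=1.0)

pred = torch.rand(2, 3, 64, 64)
target = torch.rand(2, 3, 64, 64)
mask = (torch.rand(2, 1, 64, 64) > 0.9).float()             # placeholder reflection mask
print(masked_photometric_loss(pred, target, mask).item())
</code></pre>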
</p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2502.14616" title="Abstract" id="2502.14616"> arXiv:2502.14616 </a> [<a href="/pdf/2502.14616" title="Download PDF" id="pdf-2502.14616" aria-labelledby="pdf-2502.14616">pdf</a>, <a href="https://arxiv.org/html/2502.14616v1" title="View HTML" id="html-2502.14616" aria-labelledby="html-2502.14616" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14616" title="Other formats" id="oth-2502.14616" aria-labelledby="oth-2502.14616">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Monocular Depth Estimation and Segmentation for Transparent Object with Iterative Semantic and Geometric Fusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiangyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+H">Hongxuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Y">Yuxin Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yuhao Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sui,+W">Wei Sui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+W">Wei Zou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICRA(2025). The code is accessible through: <a href="https://github.com/L-J-Yuan/MODEST" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Transparent object perception is indispensable for numerous robotic tasks. However, accurately segmenting and estimating the depth of transparent objects remain challenging due to complex optical properties. Existing methods primarily delve into only one task using extra inputs or specialized sensors, neglecting the valuable interactions among tasks and the subsequent refinement process, leading to suboptimal and blurry predictions. To address these issues, we propose a monocular framework, which is the first to excel in both segmentation and depth estimation of transparent objects, with only a single-image input. Specifically, we devise a novel semantic and geometric fusion module, effectively integrating the multi-scale information between tasks. In addition, drawing inspiration from human perception of objects, we further incorporate an iterative strategy, which progressively refines initial features for clearer results. Experiments on two challenging synthetic and real-world datasets demonstrate that our model surpasses state-of-the-art monocular, stereo, and multi-view methods by a large margin of about 38.8%-46.2% with only a single RGB input. Codes and models are publicly available at <a href="https://github.com/L-J-Yuan/MODEST" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2502.14659" title="Abstract" id="2502.14659"> arXiv:2502.14659 </a> [<a href="/pdf/2502.14659" title="Download PDF" id="pdf-2502.14659" aria-labelledby="pdf-2502.14659">pdf</a>, <a href="https://arxiv.org/html/2502.14659v1" title="View HTML" id="html-2502.14659" aria-labelledby="html-2502.14659" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14659" title="Other formats" id="oth-2502.14659" aria-labelledby="oth-2502.14659">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MAGO-SP: Detection and Correction of Water-Fat Swaps in Magnitude-Only VIBE MRI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Graf,+R">Robert Graf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=M%C3%B6ller,+H">Hendrik Möller</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Starck,+S">Sophie Starck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Atad,+M">Matan Atad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Braun,+P">Philipp Braun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stelter,+J">Jonathan Stelter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peters,+A">Annette Peters</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krist,+L">Lilian Krist</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Willich,+S+N">Stefan N. Willich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=V%C3%B6lzke,+H">Henry Völzke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=B%C3%BClow,+R">Robin Bülow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berger,+K">Klaus Berger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pischon,+T">Tobias Pischon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niendorf,+T">Thoralf Niendorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Paetzold,+J">Johannes Paetzold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karampinos,+D">Dimitrios Karampinos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rueckert,+D">Daniel Rueckert</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kirschke,+J">Jan Kirschke</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Volume Interpolated Breath-Hold Examination (VIBE) MRI generates images suitable for water and fat signal composition estimation. While the two-point VIBE provides water-fat-separated images, the six-point VIBE allows estimation of the effective transversal relaxation rate R2* and the proton density fat fraction (PDFF), which are imaging markers for health and disease. Ambiguity during signal reconstruction can lead to water-fat swaps. This shortcoming challenges the application of VIBE-MRI for automated PDFF analyses of large-scale clinical data and of population studies. This study develops an automated pipeline to detect and correct water-fat swaps in non-contrast-enhanced VIBE images. Our three-step pipeline begins with training a segmentation network to classify volumes as "fat-like" or "water-like," using synthetic water-fat swaps generated by merging fat and water volumes with Perlin noise. 
Next, a denoising diffusion image-to-image network predicts water volumes as signal priors for correction. Finally, we integrate this prior into a physics-constrained model to recover accurate water and fat signals. Our approach achieves a < 1% error rate in water-fat swap detection for a 6-point VIBE. Notably, swaps disproportionately affect individuals in the Underweight and Class 3 Obesity BMI categories. Our correction algorithm ensures accurate solution selection in chemical phase MRIs, enabling reliable PDFF estimation. This forms a solid technical foundation for automated large-scale population imaging analysis. </p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2502.14676" title="Abstract" id="2502.14676"> arXiv:2502.14676 </a> [<a href="/pdf/2502.14676" title="Download PDF" id="pdf-2502.14676" aria-labelledby="pdf-2502.14676">pdf</a>, <a href="https://arxiv.org/html/2502.14676v1" title="View HTML" id="html-2502.14676" aria-labelledby="html-2502.14676" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14676" title="Other formats" id="oth-2502.14676" aria-labelledby="oth-2502.14676">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BP-SGCN: Behavioral Pseudo-Label Informed Sparse Graph Convolution Network for Pedestrian and Heterogeneous Trajectory Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruochen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Katsigiannis,+S">Stamos Katsigiannis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+T">Tae-Kyun Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shum,+H+P+H">Hubert P. H. Shum</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Trajectory prediction allows better decision-making in applications of autonomous vehicles or surveillance by predicting the short-term future movement of traffic agents. It is classified into pedestrian or heterogeneous trajectory prediction. The former exploits the relatively consistent behavior of pedestrians, but is limited in real-world scenarios with heterogeneous traffic agents such as cyclists and vehicles. The latter typically relies on extra class label information to distinguish the heterogeneous agents, but such labels are costly to annotate and cannot be generalized to represent different behaviors within the same class of agents. In this work, we introduce the behavioral pseudo-labels that effectively capture the behavior distributions of pedestrians and heterogeneous agents solely based on their motion features, significantly improving the accuracy of trajectory prediction. To implement the framework, we propose the Behavioral Pseudo-Label Informed Sparse Graph Convolution Network (BP-SGCN) that learns pseudo-labels and uses them to inform a trajectory predictor. For optimization, we propose a cascaded training scheme, in which we first learn the pseudo-labels in an unsupervised manner, and then perform end-to-end fine-tuning on the labels to increase trajectory prediction accuracy. Experiments show that our pseudo-labels effectively model different behavior clusters and improve trajectory prediction. 
Our proposed BP-SGCN outperforms existing methods using both pedestrian (ETH/UCY, pedestrian-only SDD) and heterogeneous agent datasets (SDD, Argoverse 1). </p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2502.14721" title="Abstract" id="2502.14721"> arXiv:2502.14721 </a> [<a href="/pdf/2502.14721" title="Download PDF" id="pdf-2502.14721" aria-labelledby="pdf-2502.14721">pdf</a>, <a href="https://arxiv.org/html/2502.14721v1" title="View HTML" id="html-2502.14721" aria-labelledby="html-2502.14721" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14721" title="Other formats" id="oth-2502.14721" aria-labelledby="oth-2502.14721">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-dataset synergistic in supervised learning to pre-label structural components in point clouds from shell construction scenes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rauch,+L">Lukas Rauch</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Braml,+T">Thomas Braml</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 8 figures, 7 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The significant effort required to annotate data for new training datasets hinders computer vision research and machine learning in the construction industry. This work explores adapting standard datasets and the latest transformer model architectures for point cloud semantic segmentation in the context of shell construction sites. Unlike common approaches focused on object segmentation of building interiors and furniture, this study addressed the challenges of segmenting complex structural components in Architecture, Engineering, and Construction (AEC). We establish a baseline through supervised training and a custom validation dataset, evaluate the cross-domain inference with large-scale indoor datasets, and utilize transfer learning to maximize segmentation performance with minimal new data. The findings indicate that with minimal fine-tuning, pre-trained transformer architectures offer an effective strategy for building component segmentation. Our results are promising for automating the annotation of new, previously unseen data when creating larger training resources and for the segmentation of frequently recurring objects. 
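As a minimal sketch of the transfer-learning recipe of freezing a pre-trained encoder and fine-tuning a small head on limited labels (the toy per-point encoder and class count are placeholders standing in for a pre-trained point transformer checkpoint):
<pre><code>
import torch
import torch.nn as nn

# Stand-in "pre-trained" per-point encoder; in practice this would be a point
# transformer checkpoint trained on a large indoor dataset.
backbone = nn.Sequential(nn.Linear(6, 128), nn.ReLU(), nn.Linear(128, 128), nn.ReLU())
head = nn.Linear(128, 5)        # new head for a handful of structural classes

for p in backbone.parameters():  # freeze pre-trained weights, fine-tune only the head
    p.requires_grad = False

optim = torch.optim.AdamW(head.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

points = torch.rand(4096, 6)            # xyz + rgb for one training scan
labels = torch.randint(0, 5, (4096,))   # per-point class annotations

for step in range(10):                  # short fine-tuning loop on minimal new data
    logits = head(backbone(points))
    loss = criterion(logits, labels)
    optim.zero_grad()
    loss.backward()
    optim.step()
print(float(loss))
</code></pre>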
</p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2502.14740" title="Abstract" id="2502.14740"> arXiv:2502.14740 </a> [<a href="/pdf/2502.14740" title="Download PDF" id="pdf-2502.14740" aria-labelledby="pdf-2502.14740">pdf</a>, <a href="https://arxiv.org/html/2502.14740v1" title="View HTML" id="html-2502.14740" aria-labelledby="html-2502.14740" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14740" title="Other formats" id="oth-2502.14740" aria-labelledby="oth-2502.14740">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> YOLOv12: A Breakdown of the Key Architectural Features </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Alif,+M+A+R">Mujadded Al Rabbani Alif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hussain,+M">Muhammad Hussain</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper presents an architectural analysis of YOLOv12, a significant advancement in single-stage, real-time object detection building upon the strengths of its predecessors while introducing key improvements. The model incorporates an optimised backbone (R-ELAN), 7x7 separable convolutions, and FlashAttention-driven area-based attention, improving feature extraction, efficiency, and detection robustness. With multiple model variants, similar to its predecessors, YOLOv12 offers scalable solutions for both latency-sensitive and high-accuracy applications. Experimental results show consistent gains in mean average precision (mAP) and inference speed, making YOLOv12 a compelling choice for applications in autonomous systems, security, and real-time analytics. By achieving an optimal balance between computational efficiency and performance, YOLOv12 sets a new benchmark for real-time computer vision, facilitating deployment across diverse hardware platforms, from edge devices to high-performance clusters. 
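As a minimal sketch of a 7x7 depthwise-separable convolution block of the kind mentioned above (the standard depthwise-then-pointwise pattern; the exact block used in YOLOv12 will differ):
<pre><code>
import torch
import torch.nn as nn

class SeparableConv7x7(nn.Module):
    """Depthwise 7x7 followed by pointwise 1x1 convolution (a standard separable conv)."""
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=7, padding=3,
                                   groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.pointwise(self.depthwise(x))))

block = SeparableConv7x7(64, 128)
print(block(torch.randn(1, 64, 80, 80)).shape)  # torch.Size([1, 128, 80, 80])

# Parameter comparison against a dense 7x7 conv with the same channel widths:
dense = nn.Conv2d(64, 128, kernel_size=7, padding=3, bias=False)
count = lambda m: sum(p.numel() for p in m.parameters())
print(count(block), "vs", count(dense))  # far fewer parameters in the separable block
</code></pre>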
</p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2502.14779" title="Abstract" id="2502.14779"> arXiv:2502.14779 </a> [<a href="/pdf/2502.14779" title="Download PDF" id="pdf-2502.14779" aria-labelledby="pdf-2502.14779">pdf</a>, <a href="https://arxiv.org/html/2502.14779v1" title="View HTML" id="html-2502.14779" aria-labelledby="html-2502.14779" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14779" title="Other formats" id="oth-2502.14779" aria-labelledby="oth-2502.14779">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DC-ControlNet: Decoupling Inter- and Intra-Element Conditions in Image Generation with Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongji Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+W">Wencheng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yucheng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+J">Jianbing Shen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In this paper, we introduce DC (Decouple)-ControlNet, a highly flexible and precisely controllable framework for multi-condition image generation. The core idea behind DC-ControlNet is to decouple control conditions, transforming global control into a hierarchical system that integrates distinct elements, contents, and layouts. This enables users to mix these individual conditions with greater flexibility, leading to more efficient and accurate image generation control. Previous ControlNet-based models rely solely on global conditions, which affect the entire image and lack the ability of element- or region-specific control. This limitation reduces flexibility and can cause condition misunderstandings in multi-conditional image generation. To address these challenges, we propose both intra-element and Inter-element Controllers in DC-ControlNet. The Intra-Element Controller handles different types of control signals within individual elements, accurately describing the content and layout characteristics of the object. For interactions between elements, we introduce the Inter-Element Controller, which accurately handles multi-element interactions and occlusion based on user-defined relationships. Extensive evaluations show that DC-ControlNet significantly outperforms existing ControlNet models and Layout-to-Image generative models in terms of control flexibility and precision in multi-condition control. 
</p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2502.14786" title="Abstract" id="2502.14786"> arXiv:2502.14786 </a> [<a href="/pdf/2502.14786" title="Download PDF" id="pdf-2502.14786" aria-labelledby="pdf-2502.14786">pdf</a>, <a href="https://arxiv.org/html/2502.14786v1" title="View HTML" id="html-2502.14786" aria-labelledby="html-2502.14786" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14786" title="Other formats" id="oth-2502.14786" aria-labelledby="oth-2502.14786">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tschannen,+M">Michael Tschannen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gritsenko,+A">Alexey Gritsenko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naeem,+M+F">Muhammad Ferjad Naeem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alabdulmohsin,+I">Ibrahim Alabdulmohsin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Parthasarathy,+N">Nikhil Parthasarathy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Evans,+T">Talfan Evans</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beyer,+L">Lucas Beyer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+Y">Ye Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mustafa,+B">Basil Mustafa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=H%C3%A9naff,+O">Olivier Hénaff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Harmsen,+J">Jeremiah Harmsen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Steiner,+A">Andreas Steiner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhai,+X">Xiaohua Zhai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Model checkpoints are available at <a href="https://github.com/google-research/big_vision/tree/main/big_vision/configs/proj/image_text/README_siglip2.md" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We introduce SigLIP 2, a family of new multilingual vision-language encoders that build on the success of the original SigLIP. In this second iteration, we extend the original image-text training objective with several prior, independently developed techniques into a unified recipe -- this includes captioning-based pretraining, self-supervised losses (self-distillation, masked prediction) and online data curation. With these changes, SigLIP 2 models outperform their SigLIP counterparts at all model scales in core capabilities, including zero-shot classification, image-text retrieval, and transfer performance when extracting visual representations for Vision-Language Models (VLMs). Furthermore, the new training recipe leads to significant improvements on localization and dense prediction tasks. We also train variants which support multiple resolutions and preserve the input's native aspect ratio. 
Finally, we train on a more diverse data-mixture that includes de-biasing techniques, leading to much better multilingual understanding and improved fairness. To allow users to trade off inference cost with performance, we release model checkpoints at four sizes: ViT-B (86M), L (303M), So400m (400M), and g (1B). </p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2502.14789" title="Abstract" id="2502.14789"> arXiv:2502.14789 </a> [<a href="/pdf/2502.14789" title="Download PDF" id="pdf-2502.14789" aria-labelledby="pdf-2502.14789">pdf</a>, <a href="https://arxiv.org/html/2502.14789v1" title="View HTML" id="html-2502.14789" aria-labelledby="html-2502.14789" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14789" title="Other formats" id="oth-2502.14789" aria-labelledby="oth-2502.14789">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Structurally Disentangled Feature Fields Distillation for 3D Understanding and Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Levy,+Y">Yoel Levy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shavin,+D">David Shavin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lang,+I">Itai Lang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Benaim,+S">Sagie Benaim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Recent work has demonstrated the ability to leverage or distill pre-trained 2D features obtained using large pre-trained 2D models into 3D features, enabling impressive 3D editing and understanding capabilities using only 2D supervision. Although impressive, models assume that 3D features are captured using a single feature field and often make a simplifying assumption that features are view-independent. In this work, we propose instead to capture 3D features using multiple disentangled feature fields that capture different structural components of 3D features involving view-dependent and view-independent components, which can be learned from 2D feature supervision only. Subsequently, each element can be controlled in isolation, enabling semantic and structural understanding and editing capabilities. For instance, using a user click, one can segment 3D features corresponding to a given object and then segment, edit, or remove their view-dependent (reflective) properties. We evaluate our approach on the task of 3D segmentation and demonstrate a set of novel understanding and editing tasks. 
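As a minimal sketch of splitting a 3D feature field into view-independent and view-dependent components that can be edited in isolation (the tiny MLPs and their sizes are placeholders, not the paper's architecture):
<pre><code>
import torch
import torch.nn as nn

class DisentangledFeatureField(nn.Module):
    """Feature at a 3D point x seen from direction d: f(x, d) = f_vi(x) + f_vd(x, d)."""
    def __init__(self, feat_dim=64, hidden=128):
        super().__init__()
        self.view_independent = nn.Sequential(
            nn.Linear(3, hidden), nn.ReLU(), nn.Linear(hidden, feat_dim))
        self.view_dependent = nn.Sequential(
            nn.Linear(3 + 3, hidden), nn.ReLU(), nn.Linear(hidden, feat_dim))

    def forward(self, x, d):
        f_vi = self.view_independent(x)                        # intrinsic / semantic part
        f_vd = self.view_dependent(torch.cat([x, d], dim=-1))  # view-dependent (reflective) part
        return f_vi + f_vd, f_vi, f_vd                         # each part can be handled in isolation

field = DisentangledFeatureField()
x = torch.rand(1024, 3)                                            # sampled 3D points
d = torch.nn.functional.normalize(torch.randn(1024, 3), dim=-1)    # viewing directions
f, f_vi, f_vd = field(x, d)
print(f.shape, f_vi.shape, f_vd.shape)
</code></pre>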
</p> </div> </dd> <dt> <a name='item58'>[58]</a> <a href ="/abs/2502.14792" title="Abstract" id="2502.14792"> arXiv:2502.14792 </a> [<a href="/pdf/2502.14792" title="Download PDF" id="pdf-2502.14792" aria-labelledby="pdf-2502.14792">pdf</a>, <a href="https://arxiv.org/html/2502.14792v1" title="View HTML" id="html-2502.14792" aria-labelledby="html-2502.14792" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14792" title="Other formats" id="oth-2502.14792" aria-labelledby="oth-2502.14792">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RendBEV: Semantic Novel View Synthesis for Self-Supervised Bird's Eye View Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Monteagudo,+H+P">Henrique Piñeiro Monteagudo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taccari,+L">Leonardo Taccari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pjetri,+A">Aurel Pjetri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sambo,+F">Francesco Sambo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Salti,+S">Samuele Salti</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at WACV 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Bird's Eye View (BEV) semantic maps have recently garnered a lot of attention as a useful representation of the environment to tackle assisted and autonomous driving tasks. However, most of the existing work focuses on the fully supervised setting, training networks on large annotated datasets. In this work, we present RendBEV, a new method for the self-supervised training of BEV semantic segmentation networks, leveraging differentiable volumetric rendering to receive supervision from semantic perspective views computed by a 2D semantic segmentation model. Our method enables zero-shot BEV semantic segmentation, and already delivers competitive results in this challenging setting. When used as pretraining to then fine-tune on labeled BEV ground-truth, our method significantly boosts performance in low-annotation regimes, and sets a new state of the art when fine-tuning on all available labels. 
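As a minimal sketch of the standard differentiable volumetric rendering used to composite per-sample predictions along a camera ray, the kind of operation that lets 2D semantic views supervise a 3D representation (RendBEV's specific formulation is not reproduced here):
<pre><code>
import torch

def render_semantics_along_rays(sigmas, sems, deltas):
    """Composite per-sample semantic logits along rays with volume-rendering weights.

    sigmas: (R, S)    densities at S samples on each of R rays
    sems:   (R, S, C) per-sample semantic logits (C classes)
    deltas: (R, S)    distances between consecutive samples
    """
    alphas = 1.0 - torch.exp(-sigmas * deltas)                        # opacity of each sample
    trans = torch.cumprod(torch.cat([torch.ones_like(alphas[:, :1]),
                                     1.0 - alphas + 1e-10], dim=1), dim=1)[:, :-1]
    weights = trans * alphas                                          # w_i = T_i * alpha_i
    return (weights.unsqueeze(-1) * sems).sum(dim=1)                  # (R, C) rendered semantics

rays, samples, classes = 4, 64, 8
sigmas = torch.rand(rays, samples)
sems = torch.randn(rays, samples, classes)
deltas = torch.full((rays, samples), 0.05)
print(render_semantics_along_rays(sigmas, sems, deltas).shape)  # torch.Size([4, 8])
</code></pre>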
</p> </div> </dd> <dt> <a name='item59'>[59]</a> <a href ="/abs/2502.14799" title="Abstract" id="2502.14799"> arXiv:2502.14799 </a> [<a href="/pdf/2502.14799" title="Download PDF" id="pdf-2502.14799" aria-labelledby="pdf-2502.14799">pdf</a>, <a href="https://arxiv.org/html/2502.14799v1" title="View HTML" id="html-2502.14799" aria-labelledby="html-2502.14799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14799" title="Other formats" id="oth-2502.14799" aria-labelledby="oth-2502.14799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey on Text-Driven 360-Degree Panorama Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+X">Xiaoyu Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+W">Weihao Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+J">Jing-Hao Xue</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The advent of text-driven 360-degree panorama generation, enabling the synthesis of 360-degree panoramic images directly from textual descriptions, marks a transformative advancement in immersive visual content creation. This innovation significantly simplifies the traditionally complex process of producing such content. Recent progress in text-to-image diffusion models has accelerated the rapid development in this emerging field. This survey presents a comprehensive review of text-driven 360-degree panorama generation, offering an in-depth analysis of state-of-the-art algorithms and their expanding applications in 360-degree 3D scene generation. Furthermore, we critically examine current limitations and propose promising directions for future research. A curated project page with relevant resources and research papers is available at <a href="https://littlewhitesea.github.io/Text-Driven-Pano-Gen/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item60'>[60]</a> <a href ="/abs/2502.14801" title="Abstract" id="2502.14801"> arXiv:2502.14801 </a> [<a href="/pdf/2502.14801" title="Download PDF" id="pdf-2502.14801" aria-labelledby="pdf-2502.14801">pdf</a>, <a href="https://arxiv.org/html/2502.14801v1" title="View HTML" id="html-2502.14801" aria-labelledby="html-2502.14801" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14801" title="Other formats" id="oth-2502.14801" aria-labelledby="oth-2502.14801">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AVD2: Accident Video Diffusion for Accident Video Description </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Cheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+K">Keyuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuang,+M">Mingqiao Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Huan-ang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+B">Bu Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Hao Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICRA 2025, Project Page: <a href="https://an-answer-tree.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Traffic accidents present complex challenges for autonomous driving, often featuring unpredictable scenarios that hinder accurate system interpretation and responses. Nonetheless, prevailing methodologies fall short in elucidating the causes of accidents and proposing preventive measures due to the paucity of training data specific to accident scenarios. In this work, we introduce AVD2 (Accident Video Diffusion for Accident Video Description), a novel framework that enhances accident scene understanding by generating accident videos that align with detailed natural language descriptions and reasoning, resulting in the contributed EMM-AU (Enhanced Multi-Modal Accident Video Understanding) dataset. Empirical results reveal that the integration of the EMM-AU dataset establishes state-of-the-art performance across both automated metrics and human evaluations, markedly advancing the domains of accident analysis and prevention.
Project resources are available at <a href="https://an-answer-tree.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item61'>[61]</a> <a href ="/abs/2502.14827" title="Abstract" id="2502.14827"> arXiv:2502.14827 </a> [<a href="/pdf/2502.14827" title="Download PDF" id="pdf-2502.14827" aria-labelledby="pdf-2502.14827">pdf</a>, <a href="/format/2502.14827" title="Other formats" id="oth-2502.14827" aria-labelledby="oth-2502.14827">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Advanced Techniques for Visual Question Answering: A Comprehensive Comparison </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Baby,+A">Aiswarya Baby</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koshy,+T+T">Tintu Thankom Koshy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, No figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Emerging Technologies (cs.ET); Machine Learning (cs.LG) </div> <p class='mathjax'> Visual Question Answering (VQA) has emerged as a pivotal task in the intersection of computer vision and natural language processing, requiring models to understand and reason about visual content in response to natural language questions. Analyzing VQA datasets is essential for developing robust models that can handle the complexities of multimodal reasoning. Several approaches have been developed to examine these datasets, each offering distinct perspectives on question diversity, answer distribution, and visual-textual correlations. Despite significant progress, existing VQA models face challenges related to dataset bias, limited model complexity, commonsense reasoning gaps, rigid evaluation methods, and generalization to real world scenarios. This paper presents a comprehensive comparative study of five advanced VQA models: ABC-CNN, KICNLE, Masked Vision and Language Modeling, BLIP-2, and OFA, each employing distinct methodologies to address these challenges. 
</p> </div> </dd> <dt> <a name='item62'>[62]</a> <a href ="/abs/2502.14831" title="Abstract" id="2502.14831"> arXiv:2502.14831 </a> [<a href="/pdf/2502.14831" title="Download PDF" id="pdf-2502.14831" aria-labelledby="pdf-2502.14831">pdf</a>, <a href="https://arxiv.org/html/2502.14831v1" title="View HTML" id="html-2502.14831" aria-labelledby="html-2502.14831" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14831" title="Other formats" id="oth-2502.14831" aria-labelledby="oth-2502.14831">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving the Diffusability of Autoencoders </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Skorokhodov,+I">Ivan Skorokhodov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Girish,+S">Sharath Girish</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+B">Benran Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Menapace,+W">Willi Menapace</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdal,+R">Rameen Abdal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tulyakov,+S">Sergey Tulyakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Siarohin,+A">Aliaksandr Siarohin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, 22 figures, 9 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Latent diffusion models have emerged as the leading approach for generating high-quality images and videos, utilizing compressed latent representations to reduce the computational burden of the diffusion process. While recent advancements have primarily focused on scaling diffusion backbones and improving autoencoder reconstruction quality, the interaction between these components has received comparatively less attention. In this work, we perform a spectral analysis of modern autoencoders and identify inordinate high-frequency components in their latent spaces, which are especially pronounced in the autoencoders with a large bottleneck channel size. We hypothesize that this high-frequency component interferes with the coarse-to-fine nature of the diffusion synthesis process and hinders the generation quality. To mitigate the issue, we propose scale equivariance: a simple regularization strategy that aligns latent and RGB spaces across frequencies by enforcing scale equivariance in the decoder. It requires minimal code changes and only up to 20K autoencoder fine-tuning steps, yet significantly improves generation quality, reducing FID by 19% for image generation on ImageNet-1K 256x256 and FVD by at least 44% for video generation on Kinetics-700 17x256x256. 
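<br><br>A minimal sketch of a scale-equivariance-style regularizer in the spirit of the abstract: decoding a downscaled latent should approximate a downscaled decode of the full latent, which discourages spurious high-frequency content in the latent space. The decoder, scale factor, and loss are placeholders, not the paper's configuration:
<pre>
# Hypothetical sketch of a scale-equivariance regularizer for an autoencoder decoder.
import torch
import torch.nn as nn
import torch.nn.functional as F

decoder = nn.Sequential(nn.Conv2d(4, 16, 3, padding=1), nn.ReLU(),
                        nn.Upsample(scale_factor=8), nn.Conv2d(16, 3, 3, padding=1))

def scale_equivariance_loss(z, scale=0.5):
    x_full = decoder(z)                                   # decode the full-resolution latent
    z_down = F.interpolate(z, scale_factor=scale, mode="bilinear", align_corners=False)
    x_from_down = decoder(z_down)                         # decode the downscaled latent
    x_down = F.interpolate(x_full, scale_factor=scale, mode="bilinear", align_corners=False)
    return F.mse_loss(x_from_down, x_down)                # the two paths should agree

z = torch.randn(2, 4, 32, 32, requires_grad=True)
reg = scale_equivariance_loss(z)
reg.backward()
</pre>
In practice such a term would be added, with a small weight, to the usual reconstruction objective during the short autoencoder fine-tuning stage.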
</p> </div> </dd> <dt> <a name='item63'>[63]</a> <a href ="/abs/2502.14834" title="Abstract" id="2502.14834"> arXiv:2502.14834 </a> [<a href="/pdf/2502.14834" title="Download PDF" id="pdf-2502.14834" aria-labelledby="pdf-2502.14834">pdf</a>, <a href="/format/2502.14834" title="Other formats" id="oth-2502.14834" aria-labelledby="oth-2502.14834">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LongWriter-V: Enabling Ultra-Long and High-Fidelity Generation in Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+S">Shangqing Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yucheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang-Li,+D">Daniel Zhang-Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yushi Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jifan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuhao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+L">Lei Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huiqin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Bin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Juanzi Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Existing Large Vision-Language Models (LVLMs) can process inputs with context lengths up to 128k visual and text tokens, yet they struggle to generate coherent outputs beyond 1,000 words. We find that the primary limitation is the absence of long output examples during supervised fine-tuning (SFT). To tackle this issue, we introduce LongWriter-V-22k, an SFT dataset comprising 22,158 examples, each with multiple input images, an instruction, and corresponding outputs ranging from 0 to 10,000 words. Moreover, to achieve long outputs that maintain high-fidelity to the input images, we apply Direct Preference Optimization (DPO) to the SFT model. Given the high cost of collecting human feedback for lengthy outputs (e.g., 3,000 words), we propose IterDPO, which breaks long outputs into segments and uses iterative corrections to form preference pairs with the original outputs. Additionally, we develop MMLongBench-Write, a benchmark featuring six tasks to evaluate the long-generation capabilities of VLMs. Our 7B parameter model, trained with LongWriter-V-22k and IterDPO, achieves impressive performance on this benchmark, outperforming larger proprietary models like GPT-4o.
Code and data: <a href="https://github.com/THU-KEG/LongWriter-V" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item64'>[64]</a> <a href ="/abs/2502.14846" title="Abstract" id="2502.14846"> arXiv:2502.14846 </a> [<a href="/pdf/2502.14846" title="Download PDF" id="pdf-2502.14846" aria-labelledby="pdf-2502.14846">pdf</a>, <a href="https://arxiv.org/html/2502.14846v1" title="View HTML" id="html-2502.14846" aria-labelledby="html-2502.14846" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14846" title="Other formats" id="oth-2502.14846" aria-labelledby="oth-2502.14846">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling Text-Rich Image Understanding via Code-Guided Synthetic Multimodal Data Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yue Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patel,+A">Ajay Patel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deitke,+M">Matt Deitke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+T">Tanmay Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weihs,+L">Luca Weihs</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Head,+A">Andrew Head</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yatskar,+M">Mark Yatskar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Callison-Burch,+C">Chris Callison-Burch</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishna,+R">Ranjay Krishna</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kembhavi,+A">Aniruddha Kembhavi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Clark,+C">Christopher Clark</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 19 figures, 9 tables, website: <a href="https://yueyang1996.github.io/cosyn/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Reasoning about images with rich text, such as charts and documents, is a critical application of vision-language models (VLMs). However, VLMs often struggle in these domains due to the scarcity of diverse text-rich vision-language data. To address this challenge, we present CoSyn, a framework that leverages the coding capabilities of text-only large language models (LLMs) to automatically create synthetic text-rich multimodal data. Given input text describing a target domain (e.g., "nutrition fact labels"), CoSyn prompts an LLM to generate code (Python, HTML, LaTeX, etc.) for rendering synthetic images. With the underlying code as textual representations of the synthetic images, CoSyn can generate high-quality instruction-tuning data, again relying on a text-only LLM. Using CoSyn, we constructed a dataset comprising 400K images and 2.7M rows of vision-language instruction-tuning data. Comprehensive experiments on seven benchmarks demonstrate that models trained on our synthetic data achieve state-of-the-art performance among competitive open-source models, including Llama 3.2, and surpass proprietary models such as GPT-4V and Gemini 1.5 Flash. 
Furthermore, CoSyn can produce synthetic pointing data, enabling VLMs to ground information within input images, showcasing its potential for developing multimodal agents capable of acting in real-world environments. </p> </div> </dd> <dt> <a name='item65'>[65]</a> <a href ="/abs/2502.14865" title="Abstract" id="2502.14865"> arXiv:2502.14865 </a> [<a href="/pdf/2502.14865" title="Download PDF" id="pdf-2502.14865" aria-labelledby="pdf-2502.14865">pdf</a>, <a href="https://arxiv.org/html/2502.14865v1" title="View HTML" id="html-2502.14865" aria-labelledby="html-2502.14865" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14865" title="Other formats" id="oth-2502.14865" aria-labelledby="oth-2502.14865">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Time Travel: A Comprehensive Benchmark to Evaluate LMMs on Historical and Cultural Artifacts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ghaboura,+S">Sara Ghaboura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=More,+K">Ketan More</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thawkar,+R">Ritesh Thawkar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alghallabi,+W">Wafa Alghallabi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thawakar,+O">Omkar Thawakar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+F+S">Fahad Shahbaz Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cholakkal,+H">Hisham Cholakkal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+S">Salman Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anwer,+R+M">Rao Muhammad Anwer</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Understanding historical and cultural artifacts demands human expertise and advanced computational techniques, yet the process remains complex and time-intensive. While large multimodal models offer promising support, their evaluation and improvement require a standardized benchmark. To address this, we introduce TimeTravel, a benchmark of 10,250 expert-verified samples spanning 266 distinct cultures across 10 major historical regions. Designed for AI-driven analysis of manuscripts, artworks, inscriptions, and archaeological discoveries, TimeTravel provides a structured dataset and robust evaluation framework to assess AI models' capabilities in classification, interpretation, and historical comprehension. By integrating AI with historical research, TimeTravel fosters AI-powered tools for historians, archaeologists, researchers, and cultural tourists to extract valuable insights while ensuring technology contributes meaningfully to historical discovery and cultural heritage preservation. We evaluate contemporary AI models on TimeTravel, highlighting their strengths and identifying areas for improvement. Our goal is to establish AI as a reliable partner in preserving cultural heritage, ensuring that technological advancements contribute meaningfully to historical discovery. 
Our code is available at: \url{<a href="https://github.com/mbzuai-oryx/TimeTravel" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. </p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 28 of 28 entries)</h3> <dt> <a name='item66'>[66]</a> <a href ="/abs/2502.13974" title="Abstract" id="2502.13974"> arXiv:2502.13974 </a> (cross-list from eess.IV) [<a href="/pdf/2502.13974" title="Download PDF" id="pdf-2502.13974" aria-labelledby="pdf-2502.13974">pdf</a>, <a href="https://arxiv.org/html/2502.13974v1" title="View HTML" id="html-2502.13974" aria-labelledby="html-2502.13974" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13974" title="Other formats" id="oth-2502.13974" aria-labelledby="oth-2502.13974">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Segmentation-free integration of nuclei morphology and spatial transcriptomics for retinal images </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Chelebian,+E">Eduard Chelebian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dasgupta,+P">Pratiti Dasgupta</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Samadi,+Z">Zainalabedin Samadi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=W%C3%A4hlby,+C">Carolina Wählby</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Askary,+A">Amjad Askary</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> This study introduces SEFI (SEgmentation-Free Integration), a novel method for integrating morphological features of cell nuclei with spatial transcriptomics data. Cell segmentation poses a significant challenge in the analysis of spatial transcriptomics data, as tissue-specific structural complexities and densely packed cells in certain regions make it difficult to develop a universal approach. SEFI addresses this by utilizing self-supervised learning to extract morphological features from fluorescent nuclear staining images, enhancing the clustering of gene expression data without requiring segmentation. We demonstrate SEFI on spatially resolved gene expression profiles of the developing retina, acquired using multiplexed single molecule Fluorescence In Situ Hybridization (smFISH). SEFI is publicly available at <a href="https://github.com/eduardchelebian/sefi" rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
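<br><br>A small illustrative sketch (not the released SEFI code) of the segmentation-free idea: self-supervised morphology embeddings of the nuclear-stain image are concatenated with per-location gene-expression vectors before clustering, so morphology guides the clusters without any cell segmentation; the encoder output, weighting, and clustering settings are assumed:
<pre>
# Hypothetical sketch: combine image-derived morphology embeddings with gene
# expression vectors and cluster the joint representation.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
n_spots = 500
gene_expression = rng.poisson(1.0, size=(n_spots, 100)).astype(float)  # smFISH-style counts
morphology_embed = rng.normal(size=(n_spots, 32))   # from a self-supervised image encoder

def integrate_and_cluster(expr, morph, n_clusters=8, weight=1.0):
    expr_z = (expr - expr.mean(0)) / (expr.std(0) + 1e-8)
    morph_z = (morph - morph.mean(0)) / (morph.std(0) + 1e-8)
    joint = np.concatenate([expr_z, weight * morph_z], axis=1)
    return KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(joint)

labels = integrate_and_cluster(gene_expression, morphology_embed)
</pre>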
</p> </div> </dd> <dt> <a name='item67'>[67]</a> <a href ="/abs/2502.14023" title="Abstract" id="2502.14023"> arXiv:2502.14023 </a> (cross-list from cs.LG) [<a href="/pdf/2502.14023" title="Download PDF" id="pdf-2502.14023" aria-labelledby="pdf-2502.14023">pdf</a>, <a href="https://arxiv.org/html/2502.14023v1" title="View HTML" id="html-2502.14023" aria-labelledby="html-2502.14023" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14023" title="Other formats" id="oth-2502.14023" aria-labelledby="oth-2502.14023">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dynamic Activation with Knowledge Distillation for Energy-Efficient Spiking NN Ensembles </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Konstantaropoulos,+O">Orestis Konstantaropoulos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mallios,+T">Theodoris Mallios</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Papadopouli,+M">Maria Papadopouli</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Neural and Evolutionary Computing (cs.NE) </div> <p class='mathjax'> While foundation AI models excel at tasks like classification and decision-making, their high energy consumption makes them unsuitable for energy-constrained applications. Inspired by the brain's efficiency, spiking neural networks (SNNs) have emerged as a viable alternative due to their event-driven nature and compatibility with neuromorphic chips. This work introduces a novel system that combines knowledge distillation and ensemble learning to bridge the performance gap between artificial neural networks (ANNs) and SNNs. A foundation AI model acts as a teacher network, guiding smaller student SNNs organized into an ensemble, called Spiking Neural Ensemble (SNE). SNE enables the disentanglement of the teacher's knowledge, allowing each student to specialize in predicting a distinct aspect of it, while processing the same input. The core innovation of SNE is the adaptive activation of a subset of SNN models of an ensemble, leveraging knowledge-distillation, enhanced with an informed-partitioning (disentanglement) of the teacher's feature space. By dynamically activating only a subset of these student SNNs, the system balances accuracy and energy efficiency, achieving substantial energy savings with minimal accuracy loss. Moreover, SNE is significantly more efficient than the teacher network, reducing computational requirements by up to 20x with only a 2% drop in accuracy on the CIFAR-10 dataset. This disentanglement procedure achieves an accuracy improvement of up to 2.4% on the CIFAR-10 dataset compared to other partitioning schemes. Finally, we comparatively analyze SNE performance under noisy conditions, demonstrating enhanced robustness compared to its ANN teacher. In summary, SNE offers a promising new direction for energy-constrained applications. 
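<br><br>A simplified sketch of the distillation-with-partitioning idea: the teacher's feature vector is split into disjoint slices, each ensemble member learns to predict one slice, and only a subset of members is activated at inference. Spiking dynamics are omitted here (plain MLPs stand in for the student SNNs), and all sizes and names are assumptions:
<pre>
# Hypothetical sketch: knowledge distillation into an ensemble with an informed
# partition of the teacher's feature space and dynamic activation of a subset.
import torch
import torch.nn as nn
import torch.nn.functional as F

num_students, feat_dim, slice_dim = 4, 64, 16
teacher = nn.Sequential(nn.Flatten(), nn.Linear(32 * 32 * 3, feat_dim))
students = nn.ModuleList([
    nn.Sequential(nn.Flatten(), nn.Linear(32 * 32 * 3, 32), nn.ReLU(), nn.Linear(32, slice_dim))
    for _ in range(num_students)
])

def distillation_loss(x):
    with torch.no_grad():
        t_feat = teacher(x)                                    # (B, feat_dim)
    losses = []
    for i, student in enumerate(students):
        target = t_feat[:, i * slice_dim:(i + 1) * slice_dim]  # slice assigned to student i
        losses.append(F.mse_loss(student(x), target))
    return sum(losses)

def ensemble_predict(x, active=(0, 2)):
    # dynamically activate only a subset of students to save energy
    return {i: students[i](x) for i in active}

x = torch.randn(8, 3, 32, 32)
distillation_loss(x).backward()
partial_outputs = ensemble_predict(x)
</pre>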
</p> </div> </dd> <dt> <a name='item68'>[68]</a> <a href ="/abs/2502.14090" title="Abstract" id="2502.14090"> arXiv:2502.14090 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14090" title="Download PDF" id="pdf-2502.14090" aria-labelledby="pdf-2502.14090">pdf</a>, <a href="https://arxiv.org/html/2502.14090v1" title="View HTML" id="html-2502.14090" aria-labelledby="html-2502.14090" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14090" title="Other formats" id="oth-2502.14090" aria-labelledby="oth-2502.14090">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MambaLiteSR: Image Super-Resolution with Low-Rank Mamba using Knowledge Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Aalishah,+R">Romina Aalishah</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Navardi,+M">Mozhgan Navardi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Mohsenin,+T">Tinoosh Mohsenin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Special Session: Generative AI on Edge, 26th International Symposium on Quality Electronic Design (ISQED'25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Generative Artificial Intelligence (AI) has gained significant attention in recent years, revolutionizing various applications across industries. Among these, advanced vision models for image super-resolution are in high demand, particularly for deployment on edge devices where real-time processing is crucial. However, deploying such models on edge devices is challenging due to limited computing power and memory. In this paper, we present MambaLiteSR, a novel lightweight image Super-Resolution (SR) model that utilizes the architecture of Vision Mamba. It integrates State Space Blocks and a reconstruction module for efficient feature extraction. To optimize efficiency without affecting performance, MambaLiteSR employs knowledge distillation to transfer key insights from a larger Mamba-based teacher model to a smaller student model via hyperparameter tuning. Through mathematical analysis of model parameters and their impact on PSNR, we identify key factors and adjust them accordingly. Our comprehensive evaluation shows that MambaLiteSR outperforms state-of-the-art edge SR methods by reducing power consumption while maintaining competitive PSNR and SSIM scores across benchmark datasets. It also reduces power usage during training via low-rank approximation. Moreover, MambaLiteSR reduces parameters with minimal performance loss, enabling efficient deployment of generative AI models on resource-constrained devices. Deployment on the embedded NVIDIA Jetson Orin Nano confirms the superior balance of MambaLiteSR size, latency, and efficiency. Experiments show that MambaLiteSR achieves performance comparable to both the baseline and other edge models while using 15% fewer parameters. It also improves power consumption by up to 58% compared to state-of-the-art SR edge models, all while maintaining low energy use during training. 
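<br><br>A minimal sketch of the low-rank approximation ingredient mentioned above, replacing a dense linear layer with a rank-r factorization obtained by truncated SVD; the chosen rank and layer are placeholders rather than the paper's settings:
<pre>
# Hypothetical sketch: compress a linear layer with a truncated-SVD factorization.
import torch
import torch.nn as nn

def low_rank_factorize(linear, rank):
    W = linear.weight.data                       # (out_features, in_features)
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    U_r = U[:, :rank] * S[:rank]                 # absorb singular values into U
    V_r = Vh[:rank, :]
    down = nn.Linear(W.shape[1], rank, bias=False)
    up = nn.Linear(rank, W.shape[0], bias=True)
    down.weight.data.copy_(V_r)
    up.weight.data.copy_(U_r)
    if linear.bias is not None:
        up.bias.data.copy_(linear.bias.data)
    return nn.Sequential(down, up)               # rank-r replacement of the original layer

layer = nn.Linear(256, 256)
compact = low_rank_factorize(layer, rank=32)
x = torch.randn(4, 256)
approx_error = (layer(x) - compact(x)).abs().mean()
</pre>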
</p> </div> </dd> <dt> <a name='item69'>[69]</a> <a href ="/abs/2502.14092" title="Abstract" id="2502.14092"> arXiv:2502.14092 </a> (cross-list from cs.RO) [<a href="/pdf/2502.14092" title="Download PDF" id="pdf-2502.14092" aria-labelledby="pdf-2502.14092">pdf</a>, <a href="https://arxiv.org/html/2502.14092v1" title="View HTML" id="html-2502.14092" aria-labelledby="html-2502.14092" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14092" title="Other formats" id="oth-2502.14092" aria-labelledby="oth-2502.14092">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hybrid Visual Servoing of Tendon-driven Continuum Robots </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Danesh,+R">Rana Danesh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Janabi-Sharifi,+F">Farrokh Janabi-Sharifi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aghili,+F">Farhad Aghili</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV); Systems and Control (eess.SY) </div> <p class='mathjax'> This paper introduces a novel Hybrid Visual Servoing (HVS) approach for controlling tendon-driven continuum robots (TDCRs). The HVS system combines Image-Based Visual Servoing (IBVS) with Deep Learning-Based Visual Servoing (DLBVS) to overcome the limitations of each method and improve overall performance. IBVS offers higher accuracy and faster convergence in feature-rich environments, while DLBVS enhances robustness against disturbances and offers a larger workspace. By enabling smooth transitions between IBVS and DLBVS, the proposed HVS ensures effective control in dynamic, unstructured environments. The effectiveness of this approach is validated through simulations and real-world experiments, demonstrating that HVS achieves reduced iteration time, faster convergence, lower final error, and smoother performance compared to DLBVS alone, while maintaining DLBVS's robustness in challenging conditions such as occlusions, lighting changes, actuator noise, and physical impacts. 
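<br><br>A toy sketch of one way two servoing controllers could be blended smoothly, favouring the IBVS command when image features are reliable and the learned (DLBVS) command otherwise; the confidence measure, gate, and gains are assumptions for illustration, not the paper's switching rule:
<pre>
# Hypothetical sketch: sigmoid-gated blend of IBVS and DLBVS velocity commands.
import numpy as np

def blend_commands(v_ibvs, v_dlbvs, feature_confidence, sharpness=10.0, threshold=0.5):
    # gate in [0, 1]: high feature confidence favours IBVS, low favours DLBVS
    w = 1.0 / (1.0 + np.exp(-sharpness * (feature_confidence - threshold)))
    return w * v_ibvs + (1.0 - w) * v_dlbvs

v_ibvs = np.array([0.02, 0.0, 0.01, 0.0, 0.0, 0.005])        # 6-DoF twist from IBVS
v_dlbvs = np.array([0.015, 0.002, 0.01, 0.0, 0.001, 0.004])  # 6-DoF twist from the network
print(blend_commands(v_ibvs, v_dlbvs, feature_confidence=0.8))
</pre>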
</p> </div> </dd> <dt> <a name='item70'>[70]</a> <a href ="/abs/2502.14178" title="Abstract" id="2502.14178"> arXiv:2502.14178 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14178" title="Download PDF" id="pdf-2502.14178" aria-labelledby="pdf-2502.14178">pdf</a>, <a href="https://arxiv.org/html/2502.14178v1" title="View HTML" id="html-2502.14178" aria-labelledby="html-2502.14178" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14178" title="Other formats" id="oth-2502.14178" aria-labelledby="oth-2502.14178">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NeRF-3DTalker: Neural Radiance Field with 3D Prior Aided Audio Disentanglement for Talking Head Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoxing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhilei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+C">Chongke Bi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Talking head synthesis aims to synthesize a lip-synchronized talking head video from audio. Recently, the capability of NeRF to enhance the realism and texture details of synthesized talking heads has attracted the attention of researchers. However, most current NeRF methods based on audio are exclusively concerned with the rendering of frontal faces. These methods are unable to generate clear talking heads in novel views. Another prevalent challenge in current 3D talking head synthesis is the difficulty in aligning acoustic and visual spaces, which often results in suboptimal lip-syncing of the generated talking heads. To address these issues, we propose Neural Radiance Field with 3D Prior Aided Audio Disentanglement for Talking Head Synthesis (NeRF-3DTalker). Specifically, the proposed method employs 3D prior information to synthesize clear talking heads with free views. Additionally, we propose a 3D Prior Aided Audio Disentanglement module, which is designed to disentangle the audio into two distinct categories: features related to 3D-aware speech movements and features related to speaking style. Moreover, to reposition the generated frames that are distant from the speaker's motion space in the real space, we have devised a local-global Standardized Space. This method normalizes the irregular positions in the generated frames from both global and local semantic perspectives. Through comprehensive qualitative and quantitative experiments, it has been demonstrated that our NeRF-3DTalker outperforms the state of the art in synthesizing realistic talking head videos, exhibiting superior image quality and lip synchronization. Project page: <a href="https://nerf-3dtalker.github.io/NeRF-3Dtalker" rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
</p> </div> </dd> <dt> <a name='item71'>[71]</a> <a href ="/abs/2502.14214" title="Abstract" id="2502.14214"> arXiv:2502.14214 </a> (cross-list from cs.LG) [<a href="/pdf/2502.14214" title="Download PDF" id="pdf-2502.14214" aria-labelledby="pdf-2502.14214">pdf</a>, <a href="https://arxiv.org/html/2502.14214v1" title="View HTML" id="html-2502.14214" aria-labelledby="html-2502.14214" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14214" title="Other formats" id="oth-2502.14214" aria-labelledby="oth-2502.14214">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Asymmetric Co-Training for Source-Free Few-Shot Domain Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gengxu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuan Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Source-free unsupervised domain adaptation (SFUDA) has gained significant attention as an alternative to traditional unsupervised domain adaptation (UDA), which relies on the constant availability of labeled source data. However, SFUDA approaches come with inherent limitations that are frequently overlooked. These challenges include performance degradation when the unlabeled target data fails to meet critical assumptions, such as having a closed-set label distribution identical to that of the source domain, or when sufficient unlabeled target data is unavailable-a common situation in real-world applications. To address these issues, we propose an asymmetric co-training (ACT) method specifically designed for the SFFSDA scenario. SFFSDA presents a more practical alternative to SFUDA, as gathering a few labeled target instances is more feasible than acquiring large volumes of unlabeled target data in many real-world contexts. Our ACT method begins by employing a weak-strong augmentation to enhance data diversity. Then we use a two-step optimization process to train the target model. In the first step, we optimize the label smoothing cross-entropy loss, the entropy of the class-conditional distribution, and the reverse-entropy loss to bolster the model's discriminative ability while mitigating overfitting. The second step focuses on reducing redundancy in the output space by minimizing classifier determinacy disparity. Extensive experiments across four benchmarks demonstrate the superiority of our ACT approach, which outperforms state-of-the-art SFUDA methods and transfer learning techniques. Our findings suggest that adapting a source pre-trained model using only a small amount of labeled target data offers a practical and dependable solution. The code is available at <a href="https://github.com/gengxuli/ACT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
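<br><br>A rough sketch of a first-step objective of the kind described above, combining label-smoothing cross-entropy with an entropy term over the batch-averaged class distribution and a reverse cross-entropy term; the exact definitions and weights here are placeholders, not the paper's formulation:
<pre>
# Hypothetical sketch of a combined first-step loss for few-shot target adaptation.
import torch
import torch.nn.functional as F

def first_step_loss(logits, labels, smoothing=0.1, lam_ent=0.1, lam_rev=0.1):
    ce = F.cross_entropy(logits, labels, label_smoothing=smoothing)
    probs = logits.softmax(dim=1)
    mean_probs = probs.mean(dim=0)
    marginal_entropy = -(mean_probs * (mean_probs + 1e-8).log()).sum()   # encourage diverse predictions
    one_hot = F.one_hot(labels, logits.shape[1]).float().clamp(min=1e-4)
    reverse_ce = -(probs * one_hot.log()).sum(dim=1).mean()              # reverse cross-entropy term
    return ce - lam_ent * marginal_entropy + lam_rev * reverse_ce

logits = torch.randn(16, 7, requires_grad=True)
labels = torch.randint(0, 7, (16,))
first_step_loss(logits, labels).backward()
</pre>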
</p> </div> </dd> <dt> <a name='item72'>[72]</a> <a href ="/abs/2502.14247" title="Abstract" id="2502.14247"> arXiv:2502.14247 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14247" title="Download PDF" id="pdf-2502.14247" aria-labelledby="pdf-2502.14247">pdf</a>, <a href="https://arxiv.org/html/2502.14247v1" title="View HTML" id="html-2502.14247" aria-labelledby="html-2502.14247" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14247" title="Other formats" id="oth-2502.14247" aria-labelledby="oth-2502.14247">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pandora3D: A Comprehensive Framework for High-Quality 3D Shape and Texture Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiayu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+T">Taizhang Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Weixuan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xibin Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Ziang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Senbo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shenzhou Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Weizhe Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hongdong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+P">Pan Ji</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Tencent XR 3D Gen </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> This report presents a comprehensive framework for generating high-quality 3D shapes and textures from diverse input prompts, including single images, multi-view images, and text descriptions. The framework consists of 3D shape generation and texture generation. (1). The 3D shape generation pipeline employs a Variational Autoencoder (VAE) to encode implicit 3D geometries into a latent space and a diffusion network to generate latents conditioned on input prompts, with modifications to enhance model capacity. An alternative Artist-Created Mesh (AM) generation approach is also explored, yielding promising results for simpler geometries. (2). Texture generation involves a multi-stage process starting with frontal images generation followed by multi-view images generation, RGB-to-PBR texture conversion, and high-resolution multi-view texture refinement. A consistency scheduler is plugged into every stage, to enforce pixel-wise consistency among multi-view textures during inference, ensuring seamless integration. <br>The pipeline demonstrates effective handling of diverse input formats, leveraging advanced neural architectures and novel methodologies to produce high-quality 3D content. This report details the system architecture, experimental results, and potential future directions to improve and expand the framework. The source code and pretrained weights are released at: \url{<a href="https://github.com/Tencent/Tencent-XR-3DGen" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item73'>[73]</a> <a href ="/abs/2502.14260" title="Abstract" id="2502.14260"> arXiv:2502.14260 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14260" title="Download PDF" id="pdf-2502.14260" aria-labelledby="pdf-2502.14260">pdf</a>, <a href="https://arxiv.org/html/2502.14260v1" title="View HTML" id="html-2502.14260" aria-labelledby="html-2502.14260" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14260" title="Other formats" id="oth-2502.14260" aria-labelledby="oth-2502.14260">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EyeBench: A Call for More Rigorous Evaluation of Retinal Image Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Zhu,+W">Wenhui Zhu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dong,+X">Xuanzhao Dong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+X">Xin Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Xiong,+Y">Yujian Xiong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+X">Xiwen Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Qiu,+P">Peijie Qiu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Vasa,+V+K">Vamsi Krishna Vasa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yang,+Z">Zhangsihao Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Su,+Y">Yi Su</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dumitrascu,+O">Oana Dumitrascu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Y">Yalin Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Over the past decade, generative models have achieved significant success in enhancing fundus images. However, the evaluation of these models still presents a considerable challenge. A comprehensive evaluation benchmark for fundus image enhancement is indispensable for three main reasons: 1) The existing denoising metrics (e.g., PSNR, SSIM) are hard to extend to downstream real-world clinical research (e.g., Vessel morphology consistency). 2) There is a lack of comprehensive evaluation for both paired and unpaired enhancement methods, along with the need for expert protocols to accurately assess clinical value. 3) An ideal evaluation system should provide insights to inform future developments of fundus image enhancement. To this end, we propose a novel comprehensive benchmark, EyeBench, to provide insights that align enhancement models with clinical needs, offering a foundation for future work to improve the clinical relevance and applicability of generative models for fundus image enhancement. EyeBench has three appealing properties: 1) multi-dimensional clinical alignment downstream evaluation: In addition to evaluating the enhancement task, we provide several clinically significant downstream tasks for fundus images, including vessel segmentation, DR grading, denoising generalization, and lesion segmentation.
2) Medical expert-guided evaluation design: We introduce a novel dataset that promotes comprehensive and fair comparisons between paired and unpaired methods and includes a manual evaluation protocol by medical experts. 3) Valuable insights: Our benchmark study provides a comprehensive and rigorous evaluation of existing methods across different downstream tasks, assisting medical experts in making informed choices. Additionally, we offer further analysis of the challenges faced by existing methods. The code is available at \url{<a href="https://github.com/Retinal-Research/EyeBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </p> </div> </dd> <dt> <a name='item74'>[74]</a> <a href ="/abs/2502.14363" title="Abstract" id="2502.14363"> arXiv:2502.14363 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14363" title="Download PDF" id="pdf-2502.14363" aria-labelledby="pdf-2502.14363">pdf</a>, <a href="https://arxiv.org/html/2502.14363v1" title="View HTML" id="html-2502.14363" aria-labelledby="html-2502.14363" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14363" title="Other formats" id="oth-2502.14363" aria-labelledby="oth-2502.14363">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Topology-Aware Wavelet Mamba for Airway Structure Segmentation in Postoperative Recurrent Nasopharyngeal Carcinoma CT Scans </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Huang,+H">Haishan Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liang,+P">Pengchen Liang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Lin,+N">Naier Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+L">Luxi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Pu,+B">Bin Pu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+J">Jianguo Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chang,+Q">Qing Chang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shen,+X">Xia Shen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ran,+G">Guo Ran</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 11 figures, 6 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Nasopharyngeal carcinoma (NPC) patients often undergo radiotherapy and chemotherapy, which can lead to postoperative complications such as limited mouth opening and joint stiffness, particularly in recurrent cases that require re-surgery. These complications can affect airway function, making accurate postoperative airway risk assessment essential for managing patient care. Accurate segmentation of airway-related structures in postoperative CT scans is crucial for assessing these risks. This study introduces TopoWMamba (Topology-aware Wavelet Mamba), a novel segmentation model specifically designed to address the challenges of postoperative airway risk evaluation in recurrent NPC patients. TopoWMamba combines wavelet-based multi-scale feature extraction, state-space sequence modeling, and topology-aware modules to segment airway-related structures in CT scans robustly.
By leveraging the Wavelet-based Mamba Block (WMB) for hierarchical frequency decomposition and the Snake Conv VSS (SCVSS) module to preserve anatomical continuity, TopoWMamba effectively captures both fine-grained boundaries and global structural context, crucial for accurate segmentation in complex postoperative scenarios. Through extensive testing on the NPCSegCT dataset, TopoWMamba achieves an average Dice score of 88.02%, outperforming existing models such as UNet, Attention UNet, and SwinUNet. Additionally, TopoWMamba is tested on the SegRap 2023 Challenge dataset, where it shows a significant improvement in trachea segmentation with a Dice score of 95.26%. The proposed model provides a strong foundation for automated segmentation, enabling more accurate postoperative airway risk evaluation. </p> </div> </dd> <dt> <a name='item75'>[75]</a> <a href ="/abs/2502.14370" title="Abstract" id="2502.14370"> arXiv:2502.14370 </a> (cross-list from cs.LG) [<a href="/pdf/2502.14370" title="Download PDF" id="pdf-2502.14370" aria-labelledby="pdf-2502.14370">pdf</a>, <a href="https://arxiv.org/html/2502.14370v1" title="View HTML" id="html-2502.14370" aria-labelledby="html-2502.14370" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14370" title="Other formats" id="oth-2502.14370" aria-labelledby="oth-2502.14370">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PPO-MI: Efficient Black-Box Model Inversion via Proximal Policy Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shou,+X">Xinpeng Shou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, submitting to ICML 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Model inversion attacks pose a significant privacy risk by attempting to reconstruct private training data from trained models. Most of the existing methods either depend on gradient estimation or require white-box access to model parameters, which limits their applicability in practical scenarios. In this paper, we propose PPO-MI, a novel reinforcement learning-based framework for black-box model inversion attacks. Our approach formulates the inversion task as a Markov Decision Process, where an agent navigates the latent space of a generative model to reconstruct private training samples using only model predictions. By employing Proximal Policy Optimization (PPO) with a momentum-based state transition mechanism, along with a reward function balancing prediction accuracy and exploration, PPO-MI ensures efficient latent space exploration and high query efficiency. Extensive experiments illustrate that PPO-MI outperforms existing methods while requiring less attack knowledge, and it is robust across various model architectures and datasets. These results underline its effectiveness and generalizability in practical black-box scenarios, raising important considerations for the privacy vulnerabilities of deployed machine learning models.
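<br><br>A schematic sketch of the latent-space MDP described above: states are generator latents, actions perturb them through a momentum-based transition, and the reward mixes target-class confidence with an exploration bonus; the generator, classifier, and coefficients below are stand-ins, not the paper's components:
<pre>
# Hypothetical sketch: momentum-based latent transitions and a mixed reward for
# black-box model inversion framed as an MDP.
import torch

def step(z, velocity, action, momentum=0.9):
    velocity = momentum * velocity + action          # momentum-based state transition
    return z + velocity, velocity

def reward(class_probs, target_class, z, visited, beta=0.01):
    acc_term = class_probs[target_class]             # confidence of the private target class
    if visited:
        dists = torch.stack([torch.norm(z - v) for v in visited])
        explore_term = dists.min()                   # bonus for moving away from visited states
    else:
        explore_term = torch.tensor(0.0)
    return acc_term + beta * explore_term

z, velocity, visited = torch.zeros(128), torch.zeros(128), []
for _ in range(3):
    action = 0.05 * torch.randn(128)                 # in practice sampled from the PPO policy
    z, velocity = step(z, velocity, action)
    probs = torch.softmax(torch.randn(1000), dim=0)  # stand-in for black-box model predictions
    r = reward(probs, target_class=7, z=z, visited=visited)
    visited.append(z.clone())
</pre>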
</p> </div> </dd> <dt> <a name='item76'>[76]</a> <a href ="/abs/2502.14376" title="Abstract" id="2502.14376"> arXiv:2502.14376 </a> (cross-list from cs.CL) [<a href="/pdf/2502.14376" title="Download PDF" id="pdf-2502.14376" aria-labelledby="pdf-2502.14376">pdf</a>, <a href="https://arxiv.org/html/2502.14376v1" title="View HTML" id="html-2502.14376" aria-labelledby="html-2502.14376" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14376" title="Other formats" id="oth-2502.14376" aria-labelledby="oth-2502.14376">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Similarity Paradigm Through Textual Regularization Without Forgetting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+F">Fangming Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fong,+J">Jan Fong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+R">Rongfei Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+X">Xinmei Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jun Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Prompt learning has emerged as a promising method for adapting pre-trained visual-language models (VLMs) to a range of downstream tasks. While optimizing the context can be effective for improving performance on specific tasks, it can often lead to poor generalization performance on unseen classes or datasets sampled from different distributions. It may be attributed to the fact that textual prompts tend to overfit downstream data distributions, leading to the forgetting of generalized knowledge derived from hand-crafted prompts. In this paper, we propose a novel method called Similarity Paradigm with Textual Regularization (SPTR) for prompt learning without forgetting. SPTR is a two-pronged design based on hand-crafted prompts that is an inseparable framework. 1) To avoid forgetting general textual knowledge, we introduce the optimal transport as a textual regularization to finely ensure approximation with hand-crafted features and tuning textual features. 2) In order to continuously unleash the general ability of multiple hand-crafted prompts, we propose a similarity paradigm for natural alignment score and adversarial alignment score to improve model robustness for generalization. Both modules share a common objective in addressing generalization issues, aiming to maximize the generalization capability derived from multiple hand-crafted prompts. Four representative tasks (i.e., non-generalization few-shot learning, base-to-novel generalization, cross-dataset generalization, domain generalization) across 11 datasets demonstrate that SPTR outperforms existing prompt learning methods. 
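<br><br>A minimal sketch of an optimal-transport-style textual regularizer: a few Sinkhorn iterations align tuned prompt features with a set of hand-crafted prompt features so that learned prompts stay close to general textual knowledge; the feature shapes, cost, and entropic regularization strength are assumptions, not the paper's exact regularizer:
<pre>
# Hypothetical sketch: entropic OT (Sinkhorn) cost between tuned and hand-crafted
# prompt features, used as a regularization term.
import torch
import torch.nn.functional as F

def sinkhorn_ot_cost(tuned, handcrafted, eps=0.05, iters=50):
    a = F.normalize(tuned, dim=-1)
    b = F.normalize(handcrafted, dim=-1)
    C = 1.0 - a @ b.t()                              # cosine-distance cost matrix
    K = torch.exp(-C / eps)
    r = torch.full((C.shape[0],), 1.0 / C.shape[0])  # uniform marginals
    c = torch.full((C.shape[1],), 1.0 / C.shape[1])
    v = torch.ones(C.shape[1]) / C.shape[1]
    for _ in range(iters):
        u = r / (K @ v + 1e-8)
        v = c / (K.t() @ u + 1e-8)
    P = torch.diag(u) @ K @ torch.diag(v)            # transport plan
    return (P * C).sum()

tuned = torch.randn(4, 512, requires_grad=True)      # learned prompt features
handcrafted = torch.randn(7, 512)                    # features of hand-crafted prompts
sinkhorn_ot_cost(tuned, handcrafted).backward()
</pre>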
</p> </div> </dd> <dt> <a name='item77'>[77]</a> <a href ="/abs/2502.14401" title="Abstract" id="2502.14401"> arXiv:2502.14401 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14401" title="Download PDF" id="pdf-2502.14401" aria-labelledby="pdf-2502.14401">pdf</a>, <a href="https://arxiv.org/html/2502.14401v1" title="View HTML" id="html-2502.14401" aria-labelledby="html-2502.14401" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14401" title="Other formats" id="oth-2502.14401" aria-labelledby="oth-2502.14401">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedFuncta: Modality-Agnostic Representations Based on Efficient Neural Fields </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Friedrich,+P">Paul Friedrich</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bieder,+F">Florentin Bieder</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cattin,+P+C">Philippe C. Cattin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code and Dataset: <a href="https://github.com/pfriedri/medfuncta" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Recent research in medical image analysis with deep learning almost exclusively focuses on grid- or voxel-based data representations. We challenge this common choice by introducing MedFuncta, a modality-agnostic continuous data representation based on neural fields. We demonstrate how to scale neural fields from single instances to large datasets by exploiting redundancy in medical signals and by applying an efficient meta-learning approach with a context reduction scheme. We further address the spectral bias in commonly used SIREN activations by introducing an $\omega_0$-schedule, improving reconstruction quality and convergence speed. We validate our proposed approach on a large variety of medical signals of different dimensions and modalities (1D: ECG; 2D: Chest X-ray, Retinal OCT, Fundus Camera, Dermatoscope, Colon Histopathology, Cell Microscopy; 3D: Brain MRI, Lung CT) and successfully demonstrate that we can solve relevant downstream tasks on these representations. We additionally release a large-scale dataset of > 550k annotated neural fields to promote research in this direction.
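<br><br>A small sketch of a SIREN-style network whose frequency parameter $\omega_0$ follows a per-layer schedule (higher in early layers, lower in later ones); the schedule values and initialization below are illustrative, not the paper's:
<pre>
# Hypothetical sketch: SIREN layers with a per-layer omega_0 schedule.
import math
import torch
import torch.nn as nn

class SineLayer(nn.Module):
    def __init__(self, in_dim, out_dim, omega_0):
        super().__init__()
        self.omega_0 = omega_0
        self.linear = nn.Linear(in_dim, out_dim)
        bound = math.sqrt(6.0 / in_dim) / omega_0
        nn.init.uniform_(self.linear.weight, -bound, bound)

    def forward(self, x):
        return torch.sin(self.omega_0 * self.linear(x))

def build_siren(in_dim=2, hidden=64, out_dim=1, omega_schedule=(30.0, 20.0, 10.0)):
    layers = [SineLayer(in_dim, hidden, omega_schedule[0])]
    for w0 in omega_schedule[1:]:
        layers.append(SineLayer(hidden, hidden, w0))
    layers.append(nn.Linear(hidden, out_dim))
    return nn.Sequential(*layers)

net = build_siren()
coords = torch.rand(1024, 2)          # e.g. normalized pixel coordinates
pred = net(coords)                    # reconstructed signal values
</pre>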
</p> </div> </dd> <dt> <a name='item78'>[78]</a> <a href ="/abs/2502.14418" title="Abstract" id="2502.14418"> arXiv:2502.14418 </a> (cross-list from eess.AS) [<a href="/pdf/2502.14418" title="Download PDF" id="pdf-2502.14418" aria-labelledby="pdf-2502.14418">pdf</a>, <a href="https://arxiv.org/html/2502.14418v1" title="View HTML" id="html-2502.14418" aria-labelledby="html-2502.14418" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14418" title="Other formats" id="oth-2502.14418" aria-labelledby="oth-2502.14418">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Role of the Pretraining and the Adaptation data sizes for low-resource real-time MRI video segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Tholan,+M+T">Masoud Thajudeen Tholan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Hegde,+V">Vinayaka Hegde</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Sharma,+C">Chetan Sharma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ghosh,+P+K">Prasanta Kumar Ghosh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computer Vision and Pattern Recognition (cs.CV); Signal Processing (eess.SP) </div> <p class='mathjax'> Real-time Magnetic Resonance Imaging (rtMRI) is frequently used in speech production studies as it provides a complete view of the vocal tract during articulation. This study investigates the effectiveness of rtMRI in analyzing vocal tract movements by employing the SegNet and UNet models for Air-Tissue Boundary (ATB) segmentation tasks. We pretrained several base models using increasing numbers of subjects and videos and assessed their performance on two datasets. The first consists of unseen subjects with unseen videos from the same data source, on which the pretrained models perform 0.33% and 0.91% better than the matched condition (in Pixel-wise Classification Accuracy (PCA) and Dice Coefficient, respectively). The second comprises unseen videos from a new data source, on which we obtain 99.63% and 98.09% (PCA and Dice Coefficient, respectively) of the matched condition performance. Here, matched condition performance refers to the performance of a model trained only on the test subjects, which serves as a benchmark for the other models. Our findings highlight the significance of fine-tuning and adapting models with limited data. Notably, we demonstrated that effective model adaptation can be achieved with as few as 15 rtMRI frames from any new dataset.
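For reference, the two reported metrics can be computed for binary boundary masks as in the generic sketch below (not the authors' evaluation code; mask sizes are arbitrary).
<pre>
# Reference implementations of pixel-wise classification accuracy and the Dice
# coefficient for binary segmentation masks.
import numpy as np

def pixel_accuracy(pred, target):
    pred, target = pred.astype(bool), target.astype(bool)
    return (pred == target).mean()

def dice_coefficient(pred, target, eps=1e-7):
    pred, target = pred.astype(bool), target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    return (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)

pred = np.random.rand(68, 68) > 0.5      # toy predicted mask
gt = np.random.rand(68, 68) > 0.5        # toy ground-truth mask
print(pixel_accuracy(pred, gt), dice_coefficient(pred, gt))
</pre>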
</p> </div> </dd> <dt> <a name='item79'>[79]</a> <a href ="/abs/2502.14420" title="Abstract" id="2502.14420"> arXiv:2502.14420 </a> (cross-list from cs.RO) [<a href="/pdf/2502.14420" title="Download PDF" id="pdf-2502.14420" aria-labelledby="pdf-2502.14420">pdf</a>, <a href="https://arxiv.org/html/2502.14420v1" title="View HTML" id="html-2502.14420" aria-labelledby="html-2502.14420" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14420" title="Other formats" id="oth-2502.14420" aria-labelledby="oth-2502.14420">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ChatVLA: Unified Multimodal Understanding and Robot Control with Vision-Language-Action Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhongyi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yichen Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+M">Minjie Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Junjie Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+N">Ning Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhiyuan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+W">Weibin Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+R">Ran Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Y">Yaxin Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+C">Chaomin Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Feifei Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Humans possess a unified cognitive ability to perceive, comprehend, and interact with the physical world. Why can't large language models replicate this holistic understanding? Through a systematic analysis of existing training paradigms in vision-language-action models (VLA), we identify two key challenges: spurious forgetting, where robot training overwrites crucial visual-text alignments, and task interference, where competing control and understanding tasks degrade performance when trained jointly. To overcome these limitations, we propose ChatVLA, a novel framework featuring Phased Alignment Training, which incrementally integrates multimodal data after initial control mastery, and a Mixture-of-Experts architecture to minimize task interference. ChatVLA demonstrates competitive performance on visual question-answering datasets and significantly surpasses state-of-the-art vision-language-action (VLA) methods on multimodal understanding benchmarks. Notably, it achieves a six times higher performance on MMMU and scores 47.2% on MMStar with a more parameter-efficient design than ECoT. Furthermore, ChatVLA demonstrates superior performance on 25 real-world robot manipulation tasks compared to existing VLA methods like OpenVLA. Our findings highlight the potential of our unified framework for achieving both robust multimodal understanding and effective robot control. 
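As a hedged illustration of the Mixture-of-Experts component (not ChatVLA's actual architecture), a toy expert-routing feed-forward block might look like the following; the expert count, dimensions, and dense softmax routing are all assumptions.
<pre>
# Toy mixture-of-experts block of the kind used to separate competing
# "understanding" and "control" computation paths.
import torch
import torch.nn as nn

class TinyMoE(nn.Module):
    def __init__(self, dim=256, n_experts=4, hidden=512):
        super().__init__()
        self.router = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(n_experts)
        ])

    def forward(self, x):                                      # x: (batch, tokens, dim)
        weights = torch.softmax(self.router(x), dim=-1)        # (b, t, E) routing weights
        outs = torch.stack([e(x) for e in self.experts], dim=-1)  # (b, t, dim, E)
        return (outs * weights.unsqueeze(-2)).sum(-1)          # weighted expert mixture

y = TinyMoE()(torch.randn(2, 16, 256))
</pre>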
</p> </div> </dd> <dt> <a name='item80'>[80]</a> <a href ="/abs/2502.14462" title="Abstract" id="2502.14462"> arXiv:2502.14462 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14462" title="Download PDF" id="pdf-2502.14462" aria-labelledby="pdf-2502.14462">pdf</a>, <a href="https://arxiv.org/html/2502.14462v1" title="View HTML" id="html-2502.14462" aria-labelledby="html-2502.14462" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14462" title="Other formats" id="oth-2502.14462" aria-labelledby="oth-2502.14462">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Single-image Reflectance and Transmittance Estimation from Any Flatbed Scanner </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rodriguez-Pardo,+C">Carlos Rodriguez-Pardo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pascual-Hernandez,+D">David Pascual-Hernandez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rodriguez-Vazquez,+J">Javier Rodriguez-Vazquez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lopez-Moreno,+J">Jorge Lopez-Moreno</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garces,+E">Elena Garces</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to Computers & Graphics </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Flatbed scanners have emerged as promising devices for high-resolution, single-image material capture. However, existing approaches assume very specific conditions, such as uniform diffuse illumination, which are only available in certain high-end devices, hindering their scalability and cost. In contrast, in this work, we introduce a method inspired by intrinsic image decomposition, which accurately removes both shading and specularity, effectively allowing captures with any flatbed scanner. 
Further, we extend previous work on single-image material reflectance capture with the estimation of opacity and transmittance, critical components of full material appearance (SVBSDF), improving the results for any material captured with a flatbed scanner, at very high resolution and accuracy. </p> </div> </dd> <dt> <a name='item81'>[81]</a> <a href ="/abs/2502.14487" title="Abstract" id="2502.14487"> arXiv:2502.14487 </a> (cross-list from cs.LG) [<a href="/pdf/2502.14487" title="Download PDF" id="pdf-2502.14487" aria-labelledby="pdf-2502.14487">pdf</a>, <a href="https://arxiv.org/html/2502.14487v1" title="View HTML" id="html-2502.14487" aria-labelledby="html-2502.14487" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14487" title="Other formats" id="oth-2502.14487" aria-labelledby="oth-2502.14487">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Temporal Misalignment and Probabilistic Neurons </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bojkovi%C4%87,+V">Velibor Bojković</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xiaofeng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+B">Bin Gu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Spiking Neural Networks (SNNs) offer a more energy-efficient alternative to Artificial Neural Networks (ANNs) by mimicking biological neural principles, establishing them as a promising approach to mitigate the increasing energy demands of large-scale neural models. However, fully harnessing the capabilities of SNNs remains challenging due to their discrete signal processing and temporal dynamics. ANN-SNN conversion has emerged as a practical approach, enabling SNNs to achieve competitive performance on complex machine learning tasks. In this work, we identify a phenomenon in the ANN-SNN conversion framework, termed temporal misalignment, in which random spike rearrangement across SNN layers leads to performance improvements. Based on this observation, we introduce biologically plausible two-phase probabilistic (TPP) spiking neurons, further enhancing the conversion process. We demonstrate the advantages of our proposed method both theoretically and empirically through comprehensive experiments on CIFAR-10/100, CIFAR10-DVS, and ImageNet across a variety of architectures, achieving state-of-the-art results.
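As a generic illustration of rate-to-spike conversion with stochastic firing (the paper's two-phase probabilistic neuron is not reproduced here), a Bernoulli spike generator over T timesteps can be sketched as follows; the clamping of activations to [0, 1] as rates is an assumption.
<pre>
# Illustrative rate-coded stochastic spiking: an ANN activation is treated as a
# firing rate and converted to Bernoulli spike trains over T timesteps.
import torch

def stochastic_spikes(ann_activation, T=32):
    rate = ann_activation.clamp(0.0, 1.0)                    # interpret as rate in [0, 1]
    spikes = torch.bernoulli(rate.unsqueeze(0).expand(T, *rate.shape))
    return spikes                                            # (T, ...) binary spike tensor

act = torch.sigmoid(torch.randn(4, 10))                      # toy ANN activations
spk = stochastic_spikes(act, T=64)
recovered_rate = spk.mean(0)                                 # spike-count estimate of the rate
</pre>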
</p> </div> </dd> <dt> <a name='item82'>[82]</a> <a href ="/abs/2502.14514" title="Abstract" id="2502.14514"> arXiv:2502.14514 </a> (cross-list from cs.RO) [<a href="/pdf/2502.14514" title="Download PDF" id="pdf-2502.14514" aria-labelledby="pdf-2502.14514">pdf</a>, <a href="https://arxiv.org/html/2502.14514v1" title="View HTML" id="html-2502.14514" aria-labelledby="html-2502.14514" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14514" title="Other formats" id="oth-2502.14514" aria-labelledby="oth-2502.14514">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Mobile Robotic Approach to Autonomous Surface Scanning in Legal Medicine </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Grube,+S">Sarah Grube</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Latus,+S">Sarah Latus</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fischer,+M">Martin Fischer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raudonis,+V">Vidas Raudonis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Heinemann,+A">Axel Heinemann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ondruschka,+B">Benjamin Ondruschka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schlaefer,+A">Alexander Schlaefer</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted and accepted for presentation at CARS 2025. This preprint has not undergone peer review or post-submission revisions. The final version of this work will appear in the official CARS 2025 proceedings </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV); Systems and Control (eess.SY) </div> <p class='mathjax'> Purpose: Comprehensive legal medicine documentation includes both an internal and an external examination of the corpse. Typically, this documentation is conducted manually during conventional autopsy. A systematic digital documentation would be desirable, especially for the external examination of wounds, which is becoming more relevant for legal medicine analysis. For this purpose, RGB surface scanning has been introduced. While a manual full surface scan using a handheld camera is time-consuming and operator-dependent, floor- or ceiling-mounted robotic systems require substantial space and a dedicated room. Hence, we consider whether a mobile robotic system can be used for external documentation. Methods: We develop a mobile robotic system that enables full-body RGB-D surface scanning. Our work includes a detailed configuration space analysis to identify the environmental parameters that need to be considered to successfully perform a surface scan. We validate our findings through an experimental study in the lab and demonstrate the system's application in a legal medicine environment. Results: Our configuration space analysis shows that a good trade-off between coverage and time is reached with three robot base positions, leading to a coverage of 94.96%. Experiments validate the effectiveness of the system in accurately capturing body surface geometry with an average surface coverage of 96.90 ± 3.16% and 92.45 ± 1.43% for a body phantom and actual corpses, respectively.
Conclusion: This work demonstrates the potential of a mobile robotic system to automate RGB-D surface scanning in legal medicine, complementing the use of post-mortem CT scans for inner documentation. Our results indicate that the proposed system can contribute to more efficient and autonomous legal medicine documentation, reducing the need for manual intervention. </p> </div> </dd> <dt> <a name='item83'>[83]</a> <a href ="/abs/2502.14584" title="Abstract" id="2502.14584"> arXiv:2502.14584 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14584" title="Download PDF" id="pdf-2502.14584" aria-labelledby="pdf-2502.14584">pdf</a>, <a href="https://arxiv.org/html/2502.14584v1" title="View HTML" id="html-2502.14584" aria-labelledby="html-2502.14584" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14584" title="Other formats" id="oth-2502.14584" aria-labelledby="oth-2502.14584">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Vision Foundation Models in Medical Image Analysis: Advances and Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Liang,+P">Pengchen Liang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Pu,+B">Bin Pu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Huang,+H">Haishan Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+Y">Yiwei Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+H">Hualiang Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ma,+W">Weibo Ma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chang,+Q">Qing Chang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, 1 figure </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The rapid development of Vision Foundation Models (VFMs), particularly Vision Transformers (ViT) and Segment Anything Model (SAM), has sparked significant advances in the field of medical image analysis. These models have demonstrated exceptional capabilities in capturing long-range dependencies and achieving high generalization in segmentation tasks. However, adapting these large models to medical image analysis presents several challenges, including domain differences between medical and natural images, the need for efficient model adaptation strategies, and the limitations of small-scale medical datasets. This paper reviews the state-of-the-art research on the adaptation of VFMs to medical image segmentation, focusing on the challenges of domain adaptation, model compression, and federated learning. We discuss the latest developments in adapter-based improvements, knowledge distillation techniques, and multi-scale contextual feature modeling, and propose future directions to overcome these bottlenecks. Our analysis highlights the potential of VFMs, along with emerging methodologies such as federated learning and model compression, to revolutionize medical image analysis and enhance clinical applications. The goal of this work is to provide a comprehensive overview of current approaches and suggest key areas for future research that can drive the next wave of innovation in medical image segmentation. 
</p> </div> </dd> <dt> <a name='item84'>[84]</a> <a href ="/abs/2502.14638" title="Abstract" id="2502.14638"> arXiv:2502.14638 </a> (cross-list from cs.CL) [<a href="/pdf/2502.14638" title="Download PDF" id="pdf-2502.14638" aria-labelledby="pdf-2502.14638">pdf</a>, <a href="https://arxiv.org/html/2502.14638v1" title="View HTML" id="html-2502.14638" aria-labelledby="html-2502.14638" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14638" title="Other formats" id="oth-2502.14638" aria-labelledby="oth-2502.14638">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NAVIG: Natural Language-guided Analysis with Vision Language Models for Image Geo-localization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zheyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Runze Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kabir,+T">Tasnim Kabir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boyd-Graber,+J">Jordan Boyd-Graber</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Image geo-localization is the task of predicting the specific location of an image and requires complex reasoning across visual, geographical, and cultural contexts. While prior Vision Language Models (VLMs) have the best accuracy at this task, there is a dearth of high-quality datasets and models for analytical reasoning. We first create NaviClues, a high-quality dataset derived from GeoGuessr, a popular geography game, to supply examples of expert reasoning from language. Using this dataset, we present Navig, a comprehensive image geo-localization framework integrating global and fine-grained image information. By reasoning with language, Navig reduces the average distance error by 14% compared to previous state-of-the-art models while requiring fewer than 1000 training samples. Our dataset and code are available at <a href="https://github.com/SparrowZheyuan18/Navig/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
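For context, a geo-localization "distance error" is typically the great-circle distance between predicted and ground-truth coordinates; the helper below is a generic haversine implementation, not the paper's evaluation script, and the example coordinates are invented.
<pre>
# Great-circle (haversine) distance between a predicted and a true coordinate,
# the usual quantity averaged to report a geo-localization distance error.
import math

def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    return 2 * radius_km * math.asin(math.sqrt(a))

# e.g. predicted Paris vs. ground-truth Berlin
print(round(haversine_km(48.8566, 2.3522, 52.52, 13.405), 1), "km")
</pre>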
</p> </div> </dd> <dt> <a name='item85'>[85]</a> <a href ="/abs/2502.14684" title="Abstract" id="2502.14684"> arXiv:2502.14684 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14684" title="Download PDF" id="pdf-2502.14684" aria-labelledby="pdf-2502.14684">pdf</a>, <a href="https://arxiv.org/html/2502.14684v1" title="View HTML" id="html-2502.14684" aria-labelledby="html-2502.14684" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14684" title="Other formats" id="oth-2502.14684" aria-labelledby="oth-2502.14684">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CDGS: Confidence-Aware Depth Regularization for 3D Gaussian Splatting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qilin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wysocki,+O">Olaf Wysocki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Urban,+S">Steffen Urban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jutzi,+B">Boris Jutzi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> 3D Gaussian Splatting (3DGS) has shown significant advantages in novel view synthesis (NVS), particularly in achieving high rendering speeds and high-quality results. However, its geometric accuracy in 3D reconstruction remains limited due to the lack of explicit geometric constraints during optimization. This paper introduces CDGS, a confidence-aware depth regularization approach developed to enhance 3DGS. We leverage multi-cue confidence maps of monocular depth estimation and sparse Structure-from-Motion depth to adaptively adjust depth supervision during the optimization process. Our method demonstrates improved geometric detail preservation in early training stages and achieves competitive performance in both NVS quality and geometric accuracy. Experiments on the publicly available Tanks and Temples benchmark dataset show that our method achieves more stable convergence behavior and more accurate geometric reconstruction results, with improvements of up to 2.31 dB in PSNR for NVS and consistently lower geometric errors in M3C2 distance metrics. Notably, our method reaches comparable F-scores to the original 3DGS with only 50% of the training iterations. We expect this work will facilitate the development of efficient and accurate 3D reconstruction systems for real-world applications such as digital twin creation, heritage preservation, or forestry applications. 
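As a simplified sketch of confidence-aware depth supervision (not the paper's implementation), a per-pixel confidence map can down-weight an L1 depth residual as below; how the multi-cue confidence itself is constructed is omitted, and the tensor shapes are assumptions.
<pre>
# Generic confidence-weighted depth supervision term: unreliable depth-prior
# pixels contribute less to the loss.
import torch

def confidence_weighted_depth_loss(rendered_depth, prior_depth, confidence):
    # all tensors: (H, W); confidence values in [0, 1]
    residual = (rendered_depth - prior_depth).abs()
    return (confidence * residual).sum() / confidence.sum().clamp(min=1e-6)

H, W = 64, 64
loss = confidence_weighted_depth_loss(
    torch.rand(H, W), torch.rand(H, W), torch.rand(H, W)
)
</pre>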
</p> </div> </dd> <dt> <a name='item86'>[86]</a> <a href ="/abs/2502.14753" title="Abstract" id="2502.14753"> arXiv:2502.14753 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14753" title="Download PDF" id="pdf-2502.14753" aria-labelledby="pdf-2502.14753">pdf</a>, <a href="https://arxiv.org/html/2502.14753v1" title="View HTML" id="html-2502.14753" aria-labelledby="html-2502.14753" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14753" title="Other formats" id="oth-2502.14753" aria-labelledby="oth-2502.14753">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedVAE: Efficient Automated Interpretation of Medical Images with Large-Scale Generalizable Autoencoders </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Varma,+M">Maya Varma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kumar,+A">Ashwin Kumar</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+der+Sluijs,+R">Rogier van der Sluijs</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ostmeier,+S">Sophie Ostmeier</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Blankemeier,+L">Louis Blankemeier</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chambon,+P">Pierre Chambon</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bluethgen,+C">Christian Bluethgen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Prince,+J">Jip Prince</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Langlotz,+C">Curtis Langlotz</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chaudhari,+A">Akshay Chaudhari</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Medical images are acquired at high resolutions with large fields of view in order to capture fine-grained features necessary for clinical decision-making. Consequently, training deep learning models on medical images can incur large computational costs. In this work, we address the challenge of downsizing medical images in order to improve downstream computational efficiency while preserving clinically-relevant features. We introduce MedVAE, a family of six large-scale 2D and 3D autoencoders capable of encoding medical images as downsized latent representations and decoding latent representations back to high-resolution images. We train MedVAE autoencoders using a novel two-stage training approach with 1,052,730 medical images. Across diverse tasks obtained from 20 medical image datasets, we demonstrate that (1) utilizing MedVAE latent representations in place of high-resolution images when training downstream models can lead to efficiency benefits (up to 70x improvement in throughput) while simultaneously preserving clinically-relevant features and (2) MedVAE can decode latent representations back to high-resolution images with high fidelity. Our work demonstrates that large-scale, generalizable autoencoders can help address critical efficiency challenges in the medical domain. Our code is available at <a href="https://github.com/StanfordMIMI/MedVAE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
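The latent-based workflow can be illustrated with a small stand-in convolutional autoencoder; the released MedVAE checkpoints and loading code live in the linked repository, and none of their actual APIs, sizes, or training details are used here.
<pre>
# Sketch of "train downstream models on downsized latents": encode to a compact
# latent, train a classifier on it, and decode back when images are needed.
import torch
import torch.nn as nn

encoder = nn.Sequential(                 # 1x256x256 -> 16x32x32 latent
    nn.Conv2d(1, 32, 4, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(32, 32, 4, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(32, 16, 4, stride=2, padding=1),
)
decoder = nn.Sequential(
    nn.ConvTranspose2d(16, 32, 4, stride=2, padding=1), nn.ReLU(),
    nn.ConvTranspose2d(32, 32, 4, stride=2, padding=1), nn.ReLU(),
    nn.ConvTranspose2d(32, 1, 4, stride=2, padding=1),
)
classifier = nn.Sequential(nn.Flatten(), nn.Linear(16 * 32 * 32, 2))

x = torch.randn(4, 1, 256, 256)          # toy high-resolution scans
with torch.no_grad():
    z = encoder(x)                       # downsized latent representation
logits = classifier(z)                   # downstream model operating on latents
recon = decoder(z)                       # latents can be decoded back if needed
</pre>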
</p> </div> </dd> <dt> <a name='item87'>[87]</a> <a href ="/abs/2502.14762" title="Abstract" id="2502.14762"> arXiv:2502.14762 </a> (cross-list from cs.LG) [<a href="/pdf/2502.14762" title="Download PDF" id="pdf-2502.14762" aria-labelledby="pdf-2502.14762">pdf</a>, <a href="https://arxiv.org/html/2502.14762v1" title="View HTML" id="html-2502.14762" aria-labelledby="html-2502.14762" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14762" title="Other formats" id="oth-2502.14762" aria-labelledby="oth-2502.14762">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sculpting [CLS] Features for Pre-Trained Model-Based Class-Incremental Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yildirim,+M+O">Murat Onur Yildirim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yildirim,+E+C+G">Elif Ceren Gok Yildirim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vanschoren,+J">Joaquin Vanschoren</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Class-incremental learning requires models to continually acquire knowledge of new classes without forgetting old ones. Although pre-trained models have demonstrated strong performance in class-incremental learning, they remain susceptible to catastrophic forgetting when learning new concepts. Excessive plasticity in the models breaks generalizability and causes forgetting, while strong stability results in insufficient adaptation to new classes. This necessitates effective adaptation with minimal modifications to preserve the general knowledge of pre-trained models. To address this challenge, we first introduce a new parameter-efficient fine-tuning module 'Learn and Calibrate', or LuCA, designed to acquire knowledge through an adapter-calibrator couple, enabling effective adaptation with well-refined feature representations. Second, for each learning session, we deploy a sparse LuCA module on top of the last token just before the classifier, which we refer to as 'Token-level Sparse Calibration and Adaptation', or TOSCA. This strategic design improves the orthogonality between the modules and significantly reduces both training and inference complexity. By leaving the generalization capabilities of the pre-trained models intact and adapting exclusively via the last token, our approach achieves a harmonious balance between stability and plasticity. Extensive experiments demonstrate TOSCA's state-of-the-art performance while introducing ~8 times fewer parameters compared to prior methods. 
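A rough sketch of an adapter-calibrator couple applied only to the final token is given below; the bottleneck size, sigmoid gating form, and classifier are illustrative assumptions rather than the exact LuCA/TOSCA design.
<pre>
# Adapter-calibrator couple on the last [CLS] token of a frozen backbone,
# followed by a linear classification head.
import torch
import torch.nn as nn

class AdapterCalibrator(nn.Module):
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.adapter = nn.Sequential(            # low-rank residual adapter
            nn.Linear(dim, bottleneck), nn.GELU(), nn.Linear(bottleneck, dim)
        )
        self.calibrator = nn.Sequential(         # gates how much adaptation is applied
            nn.Linear(dim, dim), nn.Sigmoid()
        )

    def forward(self, cls_token):                # (batch, dim) frozen-backbone feature
        delta = self.adapter(cls_token)
        gate = self.calibrator(cls_token)
        return cls_token + gate * delta          # calibrated, adapted [CLS] feature

cls_feat = torch.randn(8, 768)                   # e.g. from a frozen ViT
head = nn.Linear(768, 10)
logits = head(AdapterCalibrator()(cls_feat))
</pre>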
</p> </div> </dd> <dt> <a name='item88'>[88]</a> <a href ="/abs/2502.14778" title="Abstract" id="2502.14778"> arXiv:2502.14778 </a> (cross-list from cs.CL) [<a href="/pdf/2502.14778" title="Download PDF" id="pdf-2502.14778" aria-labelledby="pdf-2502.14778">pdf</a>, <a href="https://arxiv.org/html/2502.14778v1" title="View HTML" id="html-2502.14778" aria-labelledby="html-2502.14778" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14778" title="Other formats" id="oth-2502.14778" aria-labelledby="oth-2502.14778">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Harnessing PDF Data for Improving Japanese Large Multimodal Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+J">Jeonghun Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aizawa,+A">Akiko Aizawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aizawa,+K">Kiyoharu Aizawa</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Multimodal Models (LMMs) have demonstrated strong performance in English, but their effectiveness in Japanese remains limited due to the lack of high-quality training data. Current Japanese LMMs often rely on translated English datasets, restricting their ability to capture Japan-specific cultural knowledge. To address this, we explore the potential of Japanese PDF data as a training resource, an area that remains largely underutilized. We introduce a fully automated pipeline that leverages pretrained models to extract image-text pairs from PDFs through layout analysis, OCR, and vision-language pairing, removing the need for manual annotation. Additionally, we construct instruction data from extracted image-text pairs to enrich the training data. To evaluate the effectiveness of PDF-derived data, we train Japanese LMMs and assess their performance on the Japanese LMM Benchmark. Our results demonstrate substantial improvements, with performance gains ranging from 3.9% to 13.8% on Heron-Bench. Further analysis highlights the impact of PDF-derived data on various factors, such as model size and language models, reinforcing its value as a multimodal resource for Japanese LMMs. We plan to make the source code and data publicly available upon acceptance. 
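A heavily simplified, page-level version of such an extraction pipeline might look like the sketch below, which uses pdf2image and pytesseract as stand-ins for the paper's layout-analysis and OCR components; "sample.pdf" is a placeholder path, and the real pipeline pairs figure regions with surrounding text rather than whole pages.
<pre>
# Skeleton of a PDF -> (image, text) pairing pipeline; requires poppler and
# tesseract with Japanese language data installed.
from pdf2image import convert_from_path
import pytesseract

def extract_page_pairs(pdf_path, lang="jpn"):
    pairs = []
    for page_image in convert_from_path(pdf_path, dpi=200):
        text = pytesseract.image_to_string(page_image, lang=lang)
        if text.strip():                          # keep pages where OCR found text
            pairs.append((page_image, text))
    return pairs

pairs = extract_page_pairs("sample.pdf")          # placeholder PDF path
print(f"extracted {len(pairs)} page-level image-text pairs")
</pre>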
</p> </div> </dd> <dt> <a name='item89'>[89]</a> <a href ="/abs/2502.14780" title="Abstract" id="2502.14780"> arXiv:2502.14780 </a> (cross-list from cs.CL) [<a href="/pdf/2502.14780" title="Download PDF" id="pdf-2502.14780" aria-labelledby="pdf-2502.14780">pdf</a>, <a href="https://arxiv.org/html/2502.14780v1" title="View HTML" id="html-2502.14780" aria-labelledby="html-2502.14780" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14780" title="Other formats" id="oth-2502.14780" aria-labelledby="oth-2502.14780">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ReVision: A Dataset and Baseline VLM for Privacy-Preserving Task-Oriented Visual Instruction Rewriting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mishra,+A">Abhijit Mishra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Noh,+R">Richard Noh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+H">Hsiang Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingda Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Minji Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 7 figures, 3 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Efficient and privacy-preserving multimodal interaction is essential as AR, VR, and modern smartphones with powerful cameras become primary interfaces for human-computer communication. Existing powerful large vision-language models (VLMs) enabling multimodal interaction often rely on cloud-based processing, raising significant concerns about (1) visual privacy by transmitting sensitive vision data to servers, and (2) their limited real-time, on-device usability. This paper explores Visual Instruction Rewriting, a novel approach that transforms multimodal instructions into text-only commands, allowing seamless integration of lightweight on-device instruction rewriter VLMs (250M parameters) with existing conversational AI systems, enhancing vision data privacy. To achieve this, we present a dataset of over 39,000 examples across 14 domains and develop a compact VLM, pretrained on image captioning datasets and fine-tuned for instruction rewriting. Experimental results, evaluated through NLG metrics such as BLEU, METEOR, and ROUGE, along with semantic parsing analysis, demonstrate that even a quantized version of the model (<500MB storage footprint) can achieve effective instruction rewriting, thus enabling privacy-focused, multimodal AI applications. 
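For illustration, a rewritten instruction can be scored against a reference text command with a standard NLG metric such as BLEU, as in the generic snippet below (not the paper's evaluation code; the example strings are invented).
<pre>
# Sentence-level BLEU between a rewriting hypothesis and a reference command,
# using NLTK's implementation with smoothing.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "set a timer for ten minutes".split()
hypothesis = "set a timer for 10 minutes".split()
score = sentence_bleu([reference], hypothesis,
                      smoothing_function=SmoothingFunction().method1)
print(f"BLEU: {score:.3f}")
</pre>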
</p> </div> </dd> <dt> <a name='item90'>[90]</a> <a href ="/abs/2502.14795" title="Abstract" id="2502.14795"> arXiv:2502.14795 </a> (cross-list from cs.RO) [<a href="/pdf/2502.14795" title="Download PDF" id="pdf-2502.14795" aria-labelledby="pdf-2502.14795">pdf</a>, <a href="https://arxiv.org/html/2502.14795v1" title="View HTML" id="html-2502.14795" aria-labelledby="html-2502.14795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14795" title="Other formats" id="oth-2502.14795" aria-labelledby="oth-2502.14795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Humanoid-VLA: Towards Universal Humanoid Control with Visual Integration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+P">Pengxiang Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jianfei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+X">Xinyang Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+B">Binghong Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+X">Xinxin Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Y">Yiguo Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Ting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+H">Hongchao Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mo,+P">Panzhong Mo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jinxin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuefan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Huaicheng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+W">Wenshuo Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiacheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Siteng Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Donglin Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> This paper addresses the limitations of current humanoid robot control frameworks, which primarily rely on reactive mechanisms and lack autonomous interaction capabilities due to data scarcity. We propose Humanoid-VLA, a novel framework that integrates language understanding, egocentric scene perception, and motion control, enabling universal humanoid control. Humanoid-VLA begins with language-motion pre-alignment using non-egocentric human motion datasets paired with textual descriptions, allowing the model to learn universal motion patterns and action semantics. We then incorporate egocentric visual context through a parameter efficient video-conditioned fine-tuning, enabling context-aware motion generation. Furthermore, we introduce a self-supervised data augmentation strategy that automatically generates pseudoannotations directly derived from motion data. This process converts raw motion sequences into informative question-answer pairs, facilitating the effective use of large-scale unlabeled video data. 
Built upon whole-body control architectures, extensive experiments show that Humanoid-VLA achieves object interaction and environment exploration tasks with enhanced contextual awareness, demonstrating a more human-like capacity for adaptive and intelligent engagement. </p> </div> </dd> <dt> <a name='item91'>[91]</a> <a href ="/abs/2502.14807" title="Abstract" id="2502.14807"> arXiv:2502.14807 </a> (cross-list from eess.IV) [<a href="/pdf/2502.14807" title="Download PDF" id="pdf-2502.14807" aria-labelledby="pdf-2502.14807">pdf</a>, <a href="https://arxiv.org/html/2502.14807v1" title="View HTML" id="html-2502.14807" aria-labelledby="html-2502.14807" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14807" title="Other formats" id="oth-2502.14807" aria-labelledby="oth-2502.14807">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FetalCLIP: A Visual-Language Foundation Model for Fetal Ultrasound Image Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Maani,+F">Fadillah Maani</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Saeed,+N">Numan Saeed</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Saleem,+T">Tausifa Saleem</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Farooq,+Z">Zaid Farooq</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Alasmawi,+H">Hussain Alasmawi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Diehl,+W">Werner Diehl</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Mohammad,+A">Ameera Mohammad</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Waring,+G">Gareth Waring</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Valappi,+S">Saudabi Valappi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bricker,+L">Leanne Bricker</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yaqub,+M">Mohammad Yaqub</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Foundation models are becoming increasingly effective in the medical domain, offering pre-trained models on large datasets that can be readily adapted for downstream tasks. Despite progress, fetal ultrasound images remain a challenging domain for foundation models due to their inherent complexity, often requiring substantial additional training and facing limitations due to the scarcity of paired multimodal data. To overcome these challenges, here we introduce FetalCLIP, a vision-language foundation model capable of generating universal representation of fetal ultrasound images. FetalCLIP was pre-trained using a multimodal learning approach on a diverse dataset of 210,035 fetal ultrasound images paired with text. This represents the largest paired dataset of its kind used for foundation model development to date. This unique training approach allows FetalCLIP to effectively learn the intricate anatomical features present in fetal ultrasound images, resulting in robust representations that can be used for a variety of downstream applications. 
In extensive benchmarking across a range of key fetal ultrasound applications, including classification, gestational age estimation, congenital heart defect (CHD) detection, and fetal structure segmentation, FetalCLIP outperformed all baselines while demonstrating remarkable generalizability and strong performance even with limited labeled data. We plan to release the FetalCLIP model publicly for the benefit of the broader scientific community. </p> </div> </dd> <dt> <a name='item92'>[92]</a> <a href ="/abs/2502.14844" title="Abstract" id="2502.14844"> arXiv:2502.14844 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14844" title="Download PDF" id="pdf-2502.14844" aria-labelledby="pdf-2502.14844">pdf</a>, <a href="https://arxiv.org/html/2502.14844v1" title="View HTML" id="html-2502.14844" aria-labelledby="html-2502.14844" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14844" title="Other formats" id="oth-2502.14844" aria-labelledby="oth-2502.14844">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dynamic Concepts Personalization from Single Videos </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Abdal,+R">Rameen Abdal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Patashnik,+O">Or Patashnik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Skorokhodov,+I">Ivan Skorokhodov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Menapace,+W">Willi Menapace</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Siarohin,+A">Aliaksandr Siarohin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tulyakov,+S">Sergey Tulyakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cohen-Or,+D">Daniel Cohen-Or</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aberman,+K">Kfir Aberman</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Webpage: <a href="https://snap-research.github.io/dynamic_concepts/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Personalizing generative text-to-image models has seen remarkable progress, but extending this personalization to text-to-video models presents unique challenges. Unlike static concepts, personalizing text-to-video models has the potential to capture dynamic concepts, i.e., entities defined not only by their appearance but also by their motion. In this paper, we introduce Set-and-Sequence, a novel framework for personalizing Diffusion Transformers (DiTs)-based generative video models with dynamic concepts. Our approach imposes a spatio-temporal weight space within an architecture that does not explicitly separate spatial and temporal features. This is achieved in two key stages. First, we fine-tune Low-Rank Adaptation (LoRA) layers using an unordered set of frames from the video to learn an identity LoRA basis that represents the appearance, free from temporal interference. In the second stage, with the identity LoRAs frozen, we augment their coefficients with Motion Residuals and fine-tune them on the full video sequence, capturing motion dynamics. 
Our Set-and-Sequence framework results in a spatio-temporal weight space that effectively embeds dynamic concepts into the video model's output domain, enabling unprecedented editability and compositionality while setting a new benchmark for personalizing dynamic concepts. </p> </div> </dd> <dt> <a name='item93'>[93]</a> <a href ="/abs/2502.14864" title="Abstract" id="2502.14864"> arXiv:2502.14864 </a> (cross-list from cs.AI) [<a href="/pdf/2502.14864" title="Download PDF" id="pdf-2502.14864" aria-labelledby="pdf-2502.14864">pdf</a>, <a href="https://arxiv.org/html/2502.14864v1" title="View HTML" id="html-2502.14864" aria-labelledby="html-2502.14864" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14864" title="Other formats" id="oth-2502.14864" aria-labelledby="oth-2502.14864">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Benchmarking Multimodal RAG through a Chart-based Document Question-Answering Generation Framework </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yuming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+J">Jiang Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+L">Li Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jingwang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jingpeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yang Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+R">Rui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+K">Kaiwen Wei</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge. However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To semi-automatically generate high-quality evaluation samples, we propose CHARt-based document question-answering GEneration (CHARGE), a framework that produces evaluation data through structured keypoint extraction, cross-modal verification, and keypoint-based generation. By combining CHARGE with expert validation, we construct Chart-MRAG Bench, a comprehensive benchmark for chart-based MRAG evaluation, featuring 4,738 question-answering pairs across 8 domains from real-world documents. Our evaluation reveals three critical limitations in current approaches: (1) unified multimodal embedding retrieval methods struggle in chart-based scenarios, (2) even with ground-truth retrieval, state-of-the-art MLLMs achieve only 58.19% Correctness and 73.87% Coverage scores, and (3) MLLMs demonstrate consistent text-over-visual modality bias during Chart-based MRAG reasoning.
The CHARGE and Chart-MRAG Bench are released at <a href="https://github.com/Nomothings/CHARGE.git" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 51 of 51 entries)</h3> <dt> <a name='item94'>[94]</a> <a href ="/abs/2304.06020" title="Abstract" id="2304.06020"> arXiv:2304.06020 </a> (replaced) [<a href="/pdf/2304.06020" title="Download PDF" id="pdf-2304.06020" aria-labelledby="pdf-2304.06020">pdf</a>, <a href="https://arxiv.org/html/2304.06020v2" title="View HTML" id="html-2304.06020" aria-labelledby="html-2304.06020" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2304.06020" title="Other formats" id="oth-2304.06020" aria-labelledby="oth-2304.06020">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VidStyleODE: Disentangled Video Editing via StyleGAN and NeuralODEs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ali,+M+H">Moayed Haji Ali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bond,+A">Andrew Bond</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Birdal,+T">Tolga Birdal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ceylan,+D">Duygu Ceylan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karacan,+L">Levent Karacan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Erdem,+E">Erkut Erdem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Erdem,+A">Aykut Erdem</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project website: <a href="https://cyberiada.github.io/VidStyleODE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> ICCV 2023 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We propose $\textbf{VidStyleODE}$, a spatiotemporally continuous disentangled $\textbf{Vid}$eo representation based upon $\textbf{Style}$GAN and Neural-$\textbf{ODE}$s. Effective traversal of the latent space learned by Generative Adversarial Networks (GANs) has been the basis for recent breakthroughs in image editing. However, the applicability of such advancements to the video domain has been hindered by the difficulty of representing and controlling videos in the latent space of GANs. In particular, videos are composed of content (i.e., appearance) and complex motion components that require a special mechanism to disentangle and control. To achieve this, VidStyleODE encodes the video content in a pre-trained StyleGAN $\mathcal{W}_+$ space and benefits from a latent ODE component to summarize the spatiotemporal dynamics of the input video. Our novel continuous video generation process then combines the two to generate high-quality and temporally consistent videos with varying frame rates. We show that our proposed method enables a variety of applications on real videos: text-guided appearance manipulation, motion manipulation, image animation, and video interpolation and extrapolation. 
Project website: <a href="https://cyberiada.github.io/VidStyleODE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item95'>[95]</a> <a href ="/abs/2308.05480" title="Abstract" id="2308.05480"> arXiv:2308.05480 </a> (replaced) [<a href="/pdf/2308.05480" title="Download PDF" id="pdf-2308.05480" aria-labelledby="pdf-2308.05480">pdf</a>, <a href="https://arxiv.org/html/2308.05480v2" title="View HTML" id="html-2308.05480" aria-labelledby="html-2308.05480" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2308.05480" title="Other formats" id="oth-2308.05480" aria-labelledby="oth-2308.05480">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> YOLO-MS: Rethinking Multi-Scale Representation Learning for Real-time Object Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuming Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+X">Xinbin Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiabao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+R">Ruiqi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+Q">Qibin Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Ming-Ming Cheng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 8 figures </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE TPAMI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We aim at providing the object detection community with an efficient and performant object detector, termed YOLO-MS. The core design is based on a series of investigations on how multi-branch features of the basic block and convolutions with different kernel sizes affect the detection performance of objects at different scales. The outcome is a new strategy that can significantly enhance multi-scale feature representations of real-time object detectors. To verify the effectiveness of our work, we train our YOLO-MS on the MS COCO dataset from scratch without relying on any other large-scale datasets, like ImageNet or pre-trained weights. Without bells and whistles, our YOLO-MS outperforms the recent state-of-the-art real-time object detectors, including YOLO-v7, RTMDet, and YOLO-v8. Taking the XS version of YOLO-MS as an example, it can achieve an AP score of 42+% on MS COCO, which is about 2% higher than RTMDet with the same model size. Furthermore, our work can also serve as a plug-and-play module for other YOLO models. Typically, our method significantly advances the APs, APl, and AP of YOLOv8-N from 18%+, 52%+, and 37%+ to 20%+, 55%+, and 40%+, respectively, with even fewer parameters and MACs. Code and trained models are publicly available at <a href="https://github.com/FishAndWasabi/YOLO-MS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. We also provide the Jittor version at <a href="https://github.com/NK-JittorCV/nk-yolo" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
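A toy multi-branch block with different kernel sizes, in the spirit of the multi-scale design investigated here (not the actual YOLO-MS block), could be sketched as follows; the channel count, depthwise branches, and 1x1 fusion are assumptions.
<pre>
# Parallel depthwise convolutions with different kernel sizes, fused by a 1x1
# convolution and a residual connection.
import torch
import torch.nn as nn

class MultiKernelBlock(nn.Module):
    def __init__(self, channels=64, kernels=(3, 5, 7)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Conv2d(channels, channels, k, padding=k // 2, groups=channels)
            for k in kernels
        ])                                        # one depthwise conv per kernel size
        self.fuse = nn.Conv2d(channels * len(kernels), channels, 1)

    def forward(self, x):
        feats = [branch(x) for branch in self.branches]
        return self.fuse(torch.cat(feats, dim=1)) + x   # fuse branches, residual add

y = MultiKernelBlock()(torch.randn(1, 64, 80, 80))
</pre>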
</p> </div> </dd> <dt> <a name='item96'>[96]</a> <a href ="/abs/2309.12865" title="Abstract" id="2309.12865"> arXiv:2309.12865 </a> (replaced) [<a href="/pdf/2309.12865" title="Download PDF" id="pdf-2309.12865" aria-labelledby="pdf-2309.12865">pdf</a>, <a href="https://arxiv.org/html/2309.12865v4" title="View HTML" id="html-2309.12865" aria-labelledby="html-2309.12865" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2309.12865" title="Other formats" id="oth-2309.12865" aria-labelledby="oth-2309.12865">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bridging Sensor Gaps via Attention Gated Tuning for Hyperspectral Image Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+X">Xizhe Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haokui Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+H">Haizhao Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+L">Lijie Tao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Z">Zongwen Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Ying Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Data-hungry HSI classification methods require high-quality labeled HSIs, which are often costly to obtain. This characteristic limits the performance potential of data-driven methods when dealing with limited annotated samples. Bridging the domain gap between data acquired from different sensors allows us to utilize abundant labeled data across sensors to break this bottleneck. In this paper, we propose a novel Attention-Gated Tuning (AGT) strategy and a triplet-structured transformer model, Tri-Former, to address this issue. The AGT strategy serves as a bridge, allowing us to leverage existing labeled HSI datasets, and even RGB datasets, to enhance the performance on new HSI datasets with limited samples. Instead of inserting additional parameters inside the basic model, we train a lightweight auxiliary branch that takes intermediate features as input from the basic model and makes predictions. The proposed AGT resolves conflicts between heterogeneous and even cross-modal data by suppressing disturbing information and enhancing useful information through a soft gate. Additionally, we introduce Tri-Former, a triplet-structured transformer with a spectral-spatial separation design that enhances parameter utilization and computational efficiency, enabling easier and more flexible fine-tuning. Comparison experiments conducted on three representative HSI datasets captured by different sensors demonstrate that the proposed Tri-Former achieves better performance compared to several state-of-the-art methods. Homologous, heterologous and cross-modal tuning experiments verify the effectiveness of the proposed AGT. Code has been released at: <a href="https://github.com/Cecilia-xue/AGT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
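As a hedged sketch of the auxiliary-branch idea (not the released AGT code), a lightweight gated head over frozen intermediate features might look like this; the feature dimension, class count, and sigmoid gate are assumptions.
<pre>
# Lightweight auxiliary branch with a soft gate over intermediate features from
# a frozen backbone; only this branch would be trained.
import torch
import torch.nn as nn

class GatedAuxiliaryHead(nn.Module):
    def __init__(self, feat_dim=256, num_classes=9):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(feat_dim, feat_dim), nn.Sigmoid())
        self.head = nn.Sequential(nn.Linear(feat_dim, 128), nn.ReLU(),
                                  nn.Linear(128, num_classes))

    def forward(self, intermediate_feat):        # (batch, feat_dim), backbone frozen
        gated = self.gate(intermediate_feat) * intermediate_feat  # suppress disturbances
        return self.head(gated)

feats = torch.randn(16, 256)                     # intermediate features (detached)
logits = GatedAuxiliaryHead()(feats)
</pre>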
</p> </div> </dd> <dt> <a name='item97'>[97]</a> <a href ="/abs/2309.16850" title="Abstract" id="2309.16850"> arXiv:2309.16850 </a> (replaced) [<a href="/pdf/2309.16850" title="Download PDF" id="pdf-2309.16850" aria-labelledby="pdf-2309.16850">pdf</a>, <a href="https://arxiv.org/html/2309.16850v2" title="View HTML" id="html-2309.16850" aria-labelledby="html-2309.16850" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2309.16850" title="Other formats" id="oth-2309.16850" aria-labelledby="oth-2309.16850">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sketch2CAD: 3D CAD Model Reconstruction from 2D Sketch using Visual Transformer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hong-Bin Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Current 3D reconstruction methods typically generate outputs in the form of voxels, point clouds, or meshes. However, each of these formats has inherent limitations, such as rough surfaces and distorted structures. Additionally, these data types are not ideal for further manual editing and post-processing. In this paper, we present a novel 3D reconstruction method designed to overcome these disadvantages by reconstructing CAD-compatible models. We trained a visual transformer to predict a "scene descriptor" from a single 2D wire-frame image. This descriptor includes essential information, such as object types and parameters like position, rotation, and size. Using the predicted parameters, a 3D scene can be reconstructed with 3D modeling software that has programmable interfaces, such as Rhino Grasshopper, to build highly editable 3D models in the form of B-rep. To evaluate our proposed model, we created two datasets: one consisting of simple scenes and another with more complex scenes. The test results indicate the model's capability to accurately reconstruct simple scenes while highlighting its difficulties with more complex ones. 
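</p> <p class='mathjax'>A hypothetical example of what such a predicted scene descriptor could look like as a data structure (object type plus position, rotation, and size), serialized so a scripted CAD tool could consume it; the exact fields and units used by the paper are not specified here.</p> <pre>
# Illustrative data structure for a predicted "scene descriptor"; the field
# names and conventions are assumptions for illustration only.
import json
from dataclasses import dataclass, asdict

@dataclass
class ObjectDescriptor:
    obj_type: str       # e.g. "box", "cylinder"
    position: tuple     # (x, y, z)
    rotation: tuple     # Euler angles in degrees
    size: tuple         # per-axis dimensions

scene = [
    ObjectDescriptor("box", (0.0, 0.0, 0.0), (0.0, 0.0, 45.0), (2.0, 1.0, 1.0)),
    ObjectDescriptor("cylinder", (3.0, 0.0, 0.0), (0.0, 0.0, 0.0), (1.0, 1.0, 2.5)),
]
print(json.dumps([asdict(o) for o in scene], indent=2))
</pre> <p class='mathjax'>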
</p> </div> </dd> <dt> <a name='item98'>[98]</a> <a href ="/abs/2312.04398" title="Abstract" id="2312.04398"> arXiv:2312.04398 </a> (replaced) [<a href="/pdf/2312.04398" title="Download PDF" id="pdf-2312.04398" aria-labelledby="pdf-2312.04398">pdf</a>, <a href="/format/2312.04398" title="Other formats" id="oth-2312.04398" aria-labelledby="oth-2312.04398">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Intelligent Anomaly Detection for Lane Rendering Using Transformer with Self-Supervised Pre-Training and Customized Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Y">Yongqi Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xingmin Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruohan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+W">Wei Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=van+Arem,+B">Bart van Arem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Farah,+H">Haneen Farah</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, 7 figures, accepted by the 103rd Transportation Research Board (TRB) Annual Meeting, under review by Transportation Research Record: Journal of the Transportation Research Board </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Image and Video Processing (eess.IV); Machine Learning (stat.ML) </div> <p class='mathjax'> The burgeoning navigation services using digital maps provide great convenience to drivers. Nevertheless, the presence of anomalies in lane rendering map images occasionally introduces potential hazards, as such anomalies can be misleading to human drivers and consequently contribute to unsafe driving conditions. In response to this concern and to accurately and effectively detect the anomalies, this paper transforms lane rendering image anomaly detection into a classification problem and proposes a four-phase pipeline consisting of data pre-processing, self-supervised pre-training with the masked image modeling (MiM) method, customized fine-tuning using cross-entropy based loss with label smoothing, and post-processing to tackle it leveraging state-of-the-art deep learning techniques, especially those involving Transformer models. Various experiments verify the effectiveness of the proposed pipeline. Results indicate that the proposed pipeline exhibits superior performance in lane rendering image anomaly detection, and notably, the self-supervised pre-training with MiM can greatly enhance the detection accuracy while significantly reducing the total training time. For instance, employing the Swin Transformer with Uniform Masking as self-supervised pretraining (Swin-Trans-UM) yielded a heightened accuracy at 94.77% and an improved Area Under The Curve (AUC) score of 0.9743 compared with the pure Swin Transformer without pre-training (Swin-Trans) with an accuracy of 94.01% and an AUC of 0.9498. The fine-tuning epochs were dramatically reduced to 41 from the original 280. 
In conclusion, the proposed pipeline, with its incorporation of self-supervised pre-training using MiM and other advanced deep learning techniques, emerges as a robust solution for enhancing the accuracy and efficiency of lane rendering image anomaly detection in digital navigation systems. </p> </div> </dd> <dt> <a name='item99'>[99]</a> <a href ="/abs/2402.12788" title="Abstract" id="2402.12788"> arXiv:2402.12788 </a> (replaced) [<a href="/pdf/2402.12788" title="Download PDF" id="pdf-2402.12788" aria-labelledby="pdf-2402.12788">pdf</a>, <a href="https://arxiv.org/html/2402.12788v3" title="View HTML" id="html-2402.12788" aria-labelledby="html-2402.12788" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.12788" title="Other formats" id="oth-2402.12788" aria-labelledby="oth-2402.12788">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RhythmFormer: Extracting Patterned rPPG Signals based on Periodic Sparse Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+B">Bochao Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zizheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jiansheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuo,+J">Junbao Zhuo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Weiran Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+H">Huimin Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Remote photoplethysmography (rPPG) is a non-contact method for detecting physiological signals based on facial videos, holding high potential in various applications. Due to the periodic nature of rPPG signals, the long-range dependency capturing capacity of the transformer was assumed to be advantageous for such signals. However, existing methods have not conclusively demonstrated the superior performance of transformers over traditional convolutional neural networks. This may be attributed to the quadratic scaling exhibited by transformers with sequence length, resulting in coarse-grained feature extraction, which in turn affects robustness and generalization. To address this, this paper proposes a periodic sparse attention mechanism based on temporal attention sparsity induced by periodicity. A pre-attention stage is introduced before the conventional attention mechanism. This stage learns periodic patterns to filter out a large number of irrelevant attention computations, thus enabling fine-grained feature extraction. Moreover, to address the issue of fine-grained features being more susceptible to noise interference, a fusion stem is proposed to effectively guide self-attention towards rPPG features. It can be easily integrated into existing methods to enhance their performance. Extensive experiments show that the proposed method achieves state-of-the-art performance in both intra-dataset and cross-dataset evaluations. The codes are available at <a href="https://github.com/zizheng-guo/RhythmFormer" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
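</p> <p class='mathjax'>The periodic pre-attention stage itself is specific to the paper, but the general idea of pruning attention computation before the softmax can be sketched with a generic per-query top-k key selection, as below; the selection criterion here is a stand-in for the learned periodic filtering, not the paper's mechanism.</p> <pre>
# Generic top-k sparse attention (PyTorch): each query keeps only its k
# strongest keys and masks out the rest before the softmax.
import torch
import torch.nn.functional as F

def topk_sparse_attention(q, k, v, keep=16):
    # q, k, v: (batch, seq, dim)
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5    # (B, Sq, Sk)
    keep = min(keep, scores.shape[-1])
    thresh = scores.topk(keep, dim=-1).values[..., -1:]      # k-th largest score per query
    mask = scores >= thresh                                  # True for kept keys
    scores = scores.masked_fill(~mask, float("-inf"))
    return F.softmax(scores, dim=-1) @ v

q = k = v = torch.randn(2, 128, 64)
print(topk_sparse_attention(q, k, v, keep=16).shape)  # torch.Size([2, 128, 64])
</pre> <p class='mathjax'>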
</p> </div> </dd> <dt> <a name='item100'>[100]</a> <a href ="/abs/2404.03631" title="Abstract" id="2404.03631"> arXiv:2404.03631 </a> (replaced) [<a href="/pdf/2404.03631" title="Download PDF" id="pdf-2404.03631" aria-labelledby="pdf-2404.03631">pdf</a>, <a href="https://arxiv.org/html/2404.03631v2" title="View HTML" id="html-2404.03631" aria-labelledby="html-2404.03631" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.03631" title="Other formats" id="oth-2404.03631" aria-labelledby="oth-2404.03631">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robust Concept Erasure Using Task Vectors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pham,+M">Minh Pham</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Marshall,+K+O">Kelly O. Marshall</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hegde,+C">Chinmay Hegde</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cohen,+N">Niv Cohen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> With the rapid growth of text-to-image models, a variety of techniques have been suggested to prevent undesirable image generations. Yet, these methods often only protect against specific user prompts and have been shown to allow unsafe generations with other inputs. Here we focus on unconditionally erasing a concept from a text-to-image model rather than conditioning the erasure on the user's prompt. We first show that compared to input-dependent erasure methods, concept erasure that uses Task Vectors (TV) is more robust to unexpected user inputs, not seen during training. However, TV-based erasure can also affect the core performance of the edited model, particularly when the required edit strength is unknown. To this end, we propose a method called Diverse Inversion, which we use to estimate the required strength of the TV edit. Diverse Inversion finds within the model input space a large set of word embeddings, each of which induces the generation of the target concept. We find that encouraging diversity in the set makes our estimation more robust to unexpected prompts. Finally, we show that Diverse Inversion enables us to apply a TV edit only to a subset of the model weights, enhancing the erasure capabilities while better maintaining the core functionality of the model. 
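</p> <p class='mathjax'>A minimal sketch of a task-vector (TV) edit for erasure, assuming access to the base weights and to weights fine-tuned toward the target concept; choosing the edit strength and the layer subset (which the abstract attributes to Diverse Inversion) is not shown.</p> <pre>
# Task-vector edit on PyTorch state dicts: subtract a scaled difference
# between a concept-finetuned model and the base model, optionally only on
# selected layers. Illustrative only.
import torch

def erase_with_task_vector(base_sd, finetuned_sd, strength=1.0, layer_filter=None):
    edited = {}
    for name, w in base_sd.items():
        tv = finetuned_sd[name] - w                  # task vector for this tensor
        if layer_filter is None or layer_filter(name):
            edited[name] = w - strength * tv         # negate the concept direction
        else:
            edited[name] = w.clone()
    return edited

base = {"attn.weight": torch.randn(4, 4), "mlp.weight": torch.randn(4, 4)}
tuned = {k: v + 0.1 * torch.randn_like(v) for k, v in base.items()}
edited = erase_with_task_vector(base, tuned, strength=0.5,
                                layer_filter=lambda n: n.startswith("attn"))
print({k: torch.allclose(v, base[k]) for k, v in edited.items()})
</pre> <p class='mathjax'>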
</p> </div> </dd> <dt> <a name='item101'>[101]</a> <a href ="/abs/2404.08449" title="Abstract" id="2404.08449"> arXiv:2404.08449 </a> (replaced) [<a href="/pdf/2404.08449" title="Download PDF" id="pdf-2404.08449" aria-labelledby="pdf-2404.08449">pdf</a>, <a href="/format/2404.08449" title="Other formats" id="oth-2404.08449" aria-labelledby="oth-2404.08449">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingrui Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zongkai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Y">Yujiao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+Q">Qingmin Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+W">Wenming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Z">Zongqing Lu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We have decided to withdraw this paper because the results require further verification or additional experimental data. We plan to resubmit an updated version once the necessary work is completed </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Rendering dynamic 3D humans from monocular videos is crucial for various applications such as virtual reality and digital entertainment. Most methods assume the person is in an unobstructed scene, whereas in real-life scenarios various objects may occlude body parts. A previous method utilizes NeRF for surface rendering to recover the occluded areas, but it requires more than one day to train and several seconds to render, failing to meet the requirements of real-time interactive applications. To address these issues, we propose OccGaussian, based on 3D Gaussian Splatting, which can be trained within 6 minutes and produces high-quality human renderings at up to 160 FPS with occluded input. OccGaussian initializes 3D Gaussian distributions in the canonical space, and we perform occlusion feature queries at occluded regions; the aggregated pixel-aligned features are extracted to compensate for the missing information. We then use a Gaussian Feature MLP to further process the features, together with occlusion-aware loss functions, to better perceive the occluded area. Extensive experiments in both simulated and real-world occlusions demonstrate that our method achieves comparable or even superior performance compared to the state-of-the-art method, while improving training and inference speeds by 250x and 800x, respectively. Our code will be available for research purposes. 
</p> </div> </dd> <dt> <a name='item102'>[102]</a> <a href ="/abs/2404.13425" title="Abstract" id="2404.13425"> arXiv:2404.13425 </a> (replaced) [<a href="/pdf/2404.13425" title="Download PDF" id="pdf-2404.13425" aria-labelledby="pdf-2404.13425">pdf</a>, <a href="https://arxiv.org/html/2404.13425v3" title="View HTML" id="html-2404.13425" aria-labelledby="html-2404.13425" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.13425" title="Other formats" id="oth-2404.13425" aria-labelledby="oth-2404.13425">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Adversarial Robustness of Vision-Language Models through Low-Rank Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+Y">Yuheng Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yue Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhicheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yuting Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+X">Xiaoshuai Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+G">Gang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xingwei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xiaolong Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Vision-Language Models (VLMs) play a crucial role in the advancement of Artificial General Intelligence (AGI). As AGI rapidly evolves, addressing security concerns has emerged as one of the most significant challenges for VLMs. In this paper, we present extensive experiments that expose the vulnerabilities of conventional adaptation methods for VLMs, highlighting significant security risks. Moreover, as VLMs grow in size, the application of traditional adversarial adaptation techniques incurs substantial computational costs. To address these issues, we propose a parameter-efficient adversarial adaptation method called \textbf{\textit{AdvLoRA}} based on Low-Rank Adaptation. We investigate and reveal the inherent low-rank properties involved in adversarial adaptation for VLMs. Different from LoRA, we enhance the efficiency and robustness of adversarial adaptation by introducing a novel reparameterization method that leverages parameter clustering and alignment. Additionally, we propose an adaptive parameter update strategy to further bolster robustness. These innovations enable our AdvLoRA to mitigate issues related to model security and resource wastage. Extensive experiments confirm the effectiveness and efficiency of AdvLoRA. 
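</p> <p class='mathjax'>For reference, the plain LoRA layer that such parameter-efficient adaptation builds on can be sketched as below; the paper's clustering-based reparameterization and adaptive parameter update strategy are not reproduced here.</p> <pre>
# Minimal LoRA layer (PyTorch): a frozen base linear layer plus a trainable
# low-rank update. Illustrative of LoRA in general, not of AdvLoRA itself.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=8, alpha=16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                               # frozen pretrained weight
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())

layer = LoRALinear(nn.Linear(512, 512), rank=8)
print(layer(torch.randn(2, 512)).shape)  # torch.Size([2, 512])
</pre> <p class='mathjax'>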
</p> </div> </dd> <dt> <a name='item103'>[103]</a> <a href ="/abs/2405.14974" title="Abstract" id="2405.14974"> arXiv:2405.14974 </a> (replaced) [<a href="/pdf/2405.14974" title="Download PDF" id="pdf-2405.14974" aria-labelledby="pdf-2405.14974">pdf</a>, <a href="https://arxiv.org/html/2405.14974v3" title="View HTML" id="html-2405.14974" aria-labelledby="html-2405.14974" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.14974" title="Other formats" id="oth-2405.14974" aria-labelledby="oth-2405.14974">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LOVA3: Learning to Visual Question Answering, Asking and Assessment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H+H">Henry Hengyuan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Pan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+D">Difei Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Z">Zechen Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shou,+M+Z">Mike Zheng Shou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024. The code is available at <a href="https://github.com/showlab/LOVA3" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Question answering, asking, and assessment are three innate human traits crucial for understanding the world and acquiring knowledge. By enhancing these capabilities, humans can more effectively utilize data, leading to better comprehension and learning outcomes. Current Multimodal Large Language Models (MLLMs) primarily focus on question answering, often neglecting the full potential of questioning and assessment skills. Inspired by the human learning mechanism, we introduce LOVA3, an innovative framework named "Learning tO Visual question Answering, Asking and Assessment," designed to equip MLLMs with these additional capabilities. Our approach involves the creation of two supplementary training tasks GenQA and EvalQA, aiming at fostering the skills of asking and assessing questions in the context of images. To develop the questioning ability, we compile a comprehensive set of multimodal foundational tasks. For assessment, we introduce a new benchmark called EvalQABench, comprising 64,000 training samples (split evenly between positive and negative samples) and 5,000 validation and testing samples. We posit that enhancing MLLMs with the capabilities to answer, ask, and assess questions will enhance their multimodal comprehension, ultimately improving overall performance. To validate this hypothesis, we train MLLMs using the LOVA3 framework and evaluate them on a range of multimodal datasets and benchmarks. Our results demonstrate consistent performance gains, underscoring the critical role of these additional tasks in fostering comprehensive intelligence in MLLMs. The code is available at <a href="https://github.com/showlab/LOVA3" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item104'>[104]</a> <a href ="/abs/2405.17719" title="Abstract" id="2405.17719"> arXiv:2405.17719 </a> (replaced) [<a href="/pdf/2405.17719" title="Download PDF" id="pdf-2405.17719" aria-labelledby="pdf-2405.17719">pdf</a>, <a href="https://arxiv.org/html/2405.17719v3" title="View HTML" id="html-2405.17719" aria-labelledby="html-2405.17719" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.17719" title="Other formats" id="oth-2405.17719" aria-labelledby="oth-2405.17719">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do Egocentric Video-Language Models Truly Understand Hand-Object Interactions? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Boshen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yang Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Z">Zhinan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+S">Sipeng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Q">Qin Jin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR 2025. Code: <a href="https://github.com/xuboshen/EgoNCEpp" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Egocentric video-language pretraining is a crucial step in advancing the understanding of hand-object interactions in first-person scenarios. Despite successes on existing testbeds, we find that current EgoVLMs can be easily misled by simple modifications, such as changing the verbs or nouns in interaction descriptions, with models struggling to distinguish between these changes. This raises the question: Do EgoVLMs truly understand hand-object interactions? To address this question, we introduce a benchmark called EgoHOIBench, revealing the performance limitation of current egocentric models when confronted with such challenges. We attribute this performance gap to insufficient fine-grained supervision and the greater difficulty EgoVLMs experience in recognizing verbs compared to nouns. To tackle these issues, we propose a novel asymmetric contrastive objective named EgoNCE++. For the video-to-text objective, we enhance text supervision by generating negative captions using large language models or leveraging pretrained vocabulary for HOI-related word substitutions. For the text-to-video objective, we focus on preserving an object-centric feature space that clusters video representations based on shared nouns. Extensive experiments demonstrate that EgoNCE++ significantly enhances EgoHOI understanding, leading to improved performance across various EgoVLMs in tasks such as multi-instance retrieval, action recognition, and temporal understanding. Our code is available at <a href="https://github.com/xuboshen/EgoNCEpp" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
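</p> <p class='mathjax'>A hedged sketch of the video-to-text side of such an objective: negative captions obtained by substituting the interaction verb, folded into an InfoNCE-style loss as extra negatives. The verb vocabulary, encoders, and loss weighting below are assumptions, not the paper's EgoNCE++.</p> <pre>
# Build HOI hard-negative captions by verb substitution, then a video-to-text
# contrastive loss whose negatives include their embeddings. Illustrative only.
import torch
import torch.nn.functional as F

VERBS = ["cut", "open", "pour", "hold"]          # tiny stand-in vocabulary

def negative_captions(caption, verb):
    return [caption.replace(verb, v) for v in VERBS if v != verb]

def video_to_text_nce(video_emb, pos_text_emb, neg_text_embs, temp=0.07):
    # video_emb: (d,), pos_text_emb: (d,), neg_text_embs: (n, d)
    texts = torch.cat([pos_text_emb.unsqueeze(0), neg_text_embs], dim=0)
    logits = F.normalize(video_emb, dim=0) @ F.normalize(texts, dim=1).t() / temp
    # the positive caption sits at index 0
    return F.cross_entropy(logits.unsqueeze(0), torch.zeros(1, dtype=torch.long))

print(negative_captions("cut the onion", "cut"))
v, p, n = torch.randn(256), torch.randn(256), torch.randn(3, 256)
print(video_to_text_nce(v, p, n).item())
</pre> <p class='mathjax'>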
</p> </div> </dd> <dt> <a name='item105'>[105]</a> <a href ="/abs/2406.02506" title="Abstract" id="2406.02506"> arXiv:2406.02506 </a> (replaced) [<a href="/pdf/2406.02506" title="Download PDF" id="pdf-2406.02506" aria-labelledby="pdf-2406.02506">pdf</a>, <a href="https://arxiv.org/html/2406.02506v3" title="View HTML" id="html-2406.02506" aria-labelledby="html-2406.02506" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.02506" title="Other formats" id="oth-2406.02506" aria-labelledby="oth-2406.02506">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Open-Source Tool for Mapping War Destruction at Scale in Ukraine using Sentinel-1 Time Series </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dietrich,+O">Olivier Dietrich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peters,+T">Torben Peters</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garnot,+V+S+F">Vivien Sainte Fare Garnot</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sticher,+V">Valerie Sticher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Whelan,+T+T">Thao Ton-That Whelan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schindler,+K">Konrad Schindler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wegner,+J+D">Jan Dirk Wegner</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Access to detailed war impact assessments is crucial for humanitarian organizations to assist affected populations effectively. However, maintaining a comprehensive understanding of the situation on the ground is challenging, especially in widespread and prolonged conflicts. Here we present a scalable method for estimating building damage resulting from armed conflicts. By training a machine learning model on Synthetic Aperture Radar image time series, we generate probabilistic damage estimates at the building level, leveraging existing damage assessments and open building footprints. To allow large-scale inference and ensure accessibility, we tie our method to run on Google Earth Engine. Users can adjust confidence intervals to suit their needs, enabling rapid and flexible assessments of war-related damage across large areas. We provide two publicly accessible dashboards: a Ukraine Damage Explorer to dynamically view our precomputed estimates, and a Rapid Damage Mapping Tool to run our method and generate custom maps. 
</p> </div> </dd> <dt> <a name='item106'>[106]</a> <a href ="/abs/2406.03293" title="Abstract" id="2406.03293"> arXiv:2406.03293 </a> (replaced) [<a href="/pdf/2406.03293" title="Download PDF" id="pdf-2406.03293" aria-labelledby="pdf-2406.03293">pdf</a>, <a href="/format/2406.03293" title="Other formats" id="oth-2406.03293" aria-labelledby="oth-2406.03293">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Text-to-Image Rectified Flow as Plug-and-Play Priors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xiaofeng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Cheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xulei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fayao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+G">Guosheng Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 Camera Ready. Code: <a href="https://github.com/yangxiaofeng/rectified_flow_prior" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Large-scale diffusion models have achieved remarkable performance in generative tasks. Beyond their initial training applications, these models have proven their ability to function as versatile plug-and-play priors. For instance, 2D diffusion models can serve as loss functions to optimize 3D implicit models. Rectified flow, a novel class of generative models, enforces a linear progression from the source to the target distribution and has demonstrated superior performance across various domains. Compared to diffusion-based methods, rectified flow approaches surpass in terms of generation quality and efficiency, requiring fewer inference steps. In this work, we present theoretical and experimental evidence demonstrating that rectified flow based methods offer similar functionalities to diffusion models - they can also serve as effective priors. Besides the generative capabilities of diffusion priors, motivated by the unique time-symmetry properties of rectified flow models, a variant of our method can additionally perform image inversion. Experimentally, our rectified flow-based priors outperform their diffusion counterparts - the SDS and VSD losses - in text-to-3D generation. Our method also displays competitive performance in image inversion and editing. 
</p> </div> </dd> <dt> <a name='item107'>[107]</a> <a href ="/abs/2406.09408" title="Abstract" id="2406.09408"> arXiv:2406.09408 </a> (replaced) [<a href="/pdf/2406.09408" title="Download PDF" id="pdf-2406.09408" aria-labelledby="pdf-2406.09408">pdf</a>, <a href="https://arxiv.org/html/2406.09408v3" title="View HTML" id="html-2406.09408" aria-labelledby="html-2406.09408" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.09408" title="Other formats" id="oth-2406.09408" aria-labelledby="oth-2406.09408">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Data Attribution for Text-to-Image Models by Unlearning Synthesized Images </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sheng-Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hertzmann,+A">Aaron Hertzmann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Efros,+A+A">Alexei A. Efros</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jun-Yan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Richard Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 camera ready version. Project page: <a href="https://peterwang512.github.io/AttributeByUnlearning" rel="external noopener nofollow" class="link-external link-https">this https URL</a> Code: <a href="https://github.com/PeterWang512/AttributeByUnlearning" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The goal of data attribution for text-to-image models is to identify the training images that most influence the generation of a new image. Influence is defined such that, for a given output, if a model is retrained from scratch without the most influential images, the model would fail to reproduce the same output. Unfortunately, directly searching for these influential images is computationally infeasible, since it would require repeatedly retraining models from scratch. In our work, we propose an efficient data attribution method by simulating unlearning the synthesized image. We achieve this by increasing the training loss on the output image, without catastrophic forgetting of other, unrelated concepts. We then identify training images with significant loss deviations after the unlearning process and label these as influential. We evaluate our method with a computationally intensive but "gold-standard" retraining from scratch and demonstrate our method's advantages over previous methods. 
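</p> <p class='mathjax'>The core loop can be sketched generically: take a few gradient-ascent steps on the synthesized sample's loss, then rank training samples by how much their loss changed. The model and loss below are placeholders, and the paper's safeguards against catastrophic forgetting are not shown.</p> <pre>
# Generic attribution-by-unlearning sketch: raise the loss on a synthesized
# sample, then rank training samples by their loss deviation.
import copy
import torch

def attribute_by_unlearning(model, loss_fn, synth_batch, train_batches, lr=1e-4, steps=5):
    before = [loss_fn(model, b).item() for b in train_batches]
    unlearned = copy.deepcopy(model)
    opt = torch.optim.SGD(unlearned.parameters(), lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        (-loss_fn(unlearned, synth_batch)).backward()   # gradient ascent on the synth loss
        opt.step()
    after = [loss_fn(unlearned, b).item() for b in train_batches]
    deltas = [a - b for a, b in zip(after, before)]
    # largest loss increase = most influential training samples
    return sorted(range(len(deltas)), key=lambda i: -deltas[i])

torch.manual_seed(0)
model = torch.nn.Linear(8, 1)
mse = lambda m, b: ((m(b[0]) - b[1]) ** 2).mean()
train = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(3)]
synth = (torch.randn(4, 8), torch.randn(4, 1))
print(attribute_by_unlearning(model, mse, synth, train))
</pre> <p class='mathjax'>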
</p> </div> </dd> <dt> <a name='item108'>[108]</a> <a href ="/abs/2406.14482" title="Abstract" id="2406.14482"> arXiv:2406.14482 </a> (replaced) [<a href="/pdf/2406.14482" title="Download PDF" id="pdf-2406.14482" aria-labelledby="pdf-2406.14482">pdf</a>, <a href="https://arxiv.org/html/2406.14482v2" title="View HTML" id="html-2406.14482" aria-labelledby="html-2406.14482" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.14482" title="Other formats" id="oth-2406.14482" aria-labelledby="oth-2406.14482">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Visible-Thermal Tiny Object Detection: A Benchmark Dataset and Baselines </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+X">Xinyi Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+C">Chao Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruojing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+X">Xu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Boyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xu Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhaoxu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingqian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Mingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qingyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zaiping Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Miao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Shilin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+W">Wei An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sheng,+W">Weidong Sheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Li Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Small object detection (SOD) has been a longstanding yet challenging task for decades, with numerous datasets and algorithms being developed. However, they mainly focus on either visible or thermal modality, while visible-thermal (RGBT) bimodality is rarely explored. Although some RGBT datasets have been developed recently, the insufficient quantity, limited category, misaligned images and large target size cannot provide an impartial benchmark to evaluate multi-category visible-thermal small object detection (RGBT SOD) algorithms. In this paper, we build the first large-scale benchmark with high diversity for RGBT SOD (namely RGBT-Tiny), including 115 paired sequences, 93K frames and 1.2M manual annotations. RGBT-Tiny contains abundant targets (7 categories) and high-diversity scenes (8 types that cover different illumination and density variations). Note that, over 81% of targets are smaller than 16x16, and we provide paired bounding box annotations with tracking ID to offer an extremely challenging benchmark with wide-range applications, such as RGBT fusion, detection and tracking. In addition, we propose a scale adaptive fitness (SAFit) measure that exhibits high robustness on both small and large targets. 
The proposed SAFit can provide reasonable performance evaluation and promote detection performance. Based on the proposed RGBT-Tiny dataset and SAFit measure, extensive evaluations have been conducted, including 23 recent state-of-the-art algorithms that cover four different types (i.e., visible generic detection, visible SOD, thermal SOD and RGBT object detection). Project is available at <a href="https://github.com/XinyiYing/RGBT-Tiny" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item109'>[109]</a> <a href ="/abs/2407.01230" title="Abstract" id="2407.01230"> arXiv:2407.01230 </a> (replaced) [<a href="/pdf/2407.01230" title="Download PDF" id="pdf-2407.01230" aria-labelledby="pdf-2407.01230">pdf</a>, <a href="https://arxiv.org/html/2407.01230v3" title="View HTML" id="html-2407.01230" aria-labelledby="html-2407.01230" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.01230" title="Other formats" id="oth-2407.01230" aria-labelledby="oth-2407.01230">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DaBiT: Depth and Blur informed Transformer for Video Focal Deblurring </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Morris,+C">Crispian Morris</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anantrasirichai,+N">Nantheera Anantrasirichai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+F">Fan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bull,+D">David Bull</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In many real-world scenarios, recorded videos suffer from accidental focus blur, and while video deblurring methods exist, most specifically target motion blur or spatial-invariant blur. This paper introduces a framework optimized for the as yet unattempted task of video focal deblurring (refocusing). The proposed method employs novel map-guided transformers, in addition to image propagation, to effectively leverage the continuous spatial variance of focal blur and restore the footage. We also introduce a flow re-focusing module designed to efficiently align relevant features between blurry and sharp domains. Additionally, we propose a novel technique for generating synthetic focal blur data, broadening the model's learning capabilities and robustness to include a wider array of content. We have made a new benchmark dataset, DAVIS-Blur, available. This dataset, a modified extension of the popular DAVIS video segmentation set, provides realistic focal blur degradations as well as the corresponding blur maps. Comprehensive experiments demonstrate the superiority of our approach. We achieve state-of-the-art results with an average PSNR performance over 1.9dB greater than comparable existing video restoration methods. 
Our source code and the developed databases will be made available at <a href="https://github.com/crispianm/DaBiT" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item110'>[110]</a> <a href ="/abs/2408.08822" title="Abstract" id="2408.08822"> arXiv:2408.08822 </a> (replaced) [<a href="/pdf/2408.08822" title="Download PDF" id="pdf-2408.08822" aria-labelledby="pdf-2408.08822">pdf</a>, <a href="https://arxiv.org/html/2408.08822v3" title="View HTML" id="html-2408.08822" aria-labelledby="html-2408.08822" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.08822" title="Other formats" id="oth-2408.08822" aria-labelledby="oth-2408.08822">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PFDiff: Training-Free Acceleration of Diffusion Models Combining Past and Future Scores </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+G">Guangyi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+Y">Yuren Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lijiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+W">Wei Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+S">Songzhi Su</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Diffusion Probabilistic Models (DPMs) have shown remarkable potential in image generation, but their sampling efficiency is hindered by the need for numerous denoising steps. Most existing solutions accelerate the sampling process by proposing fast ODE solvers. However, the inevitable discretization errors of the ODE solvers are significantly magnified when the number of function evaluations (NFE) is fewer. In this work, we propose PFDiff, a novel training-free and orthogonal timestep-skipping strategy, which enables existing fast ODE solvers to operate with fewer NFE. Specifically, PFDiff initially utilizes score replacement from past time steps to predict a ``springboard". Subsequently, it employs this ``springboard" along with foresight updates inspired by Nesterov momentum to rapidly update current intermediate states. This approach effectively reduces unnecessary NFE while correcting for discretization errors inherent in first-order ODE solvers. Experimental results demonstrate that PFDiff exhibits flexible applicability across various pre-trained DPMs, particularly excelling in conditional DPMs and surpassing previous state-of-the-art training-free methods. For instance, using DDIM as a baseline, we achieved 16.46 FID (4 NFE) compared to 138.81 FID with DDIM on ImageNet 64x64 with classifier guidance, and 13.06 FID (10 NFE) on Stable Diffusion with 7.5 guidance scale. Code is available at \url{<a href="https://github.com/onefly123/PFDiff" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
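</p> <p class='mathjax'>A heavily hedged gist of a past/future-score timestep skip built around deterministic DDIM-style updates: reuse a cached past score to reach a "springboard" state without a new network call, evaluate the network once there, and use that foresight prediction for a larger jump. The noise-prediction network and schedule are placeholders, and the exact PFDiff update rules are in the paper, not reproduced here.</p> <pre>
# Illustrative timestep-skipping sketch around DDIM updates; not PFDiff itself.
import torch

def ddim_step(x, eps, a_t, a_s):
    # deterministic DDIM move from noise level a_t to a_s given eps predicted at a_t
    x0 = (x - (1 - a_t).sqrt() * eps) / a_t.sqrt()
    return a_s.sqrt() * x0 + (1 - a_s).sqrt() * eps

def skip_step(x_t, eps_past, eps_model, a_t, a_mid, a_next):
    # 1) "springboard": reuse the past score, no extra network evaluation
    x_mid = ddim_step(x_t, eps_past, a_t, a_mid)
    # 2) foresight: one evaluation at the springboard ...
    eps_fore = eps_model(x_mid, a_mid)
    # 3) ... drives a larger jump from the current state
    return ddim_step(x_t, eps_fore, a_t, a_next), eps_fore

eps_model = lambda x, a: torch.zeros_like(x)     # placeholder noise predictor
a = torch.tensor([0.30, 0.55, 0.80])             # alpha_bar increasing toward t=0
x = torch.randn(1, 3, 8, 8)
x_next, _ = skip_step(x, torch.zeros_like(x), eps_model, a[0], a[1], a[2])
print(x_next.shape)
</pre> <p class='mathjax'>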
</p> </div> </dd> <dt> <a name='item111'>[111]</a> <a href ="/abs/2408.15740" title="Abstract" id="2408.15740"> arXiv:2408.15740 </a> (replaced) [<a href="/pdf/2408.15740" title="Download PDF" id="pdf-2408.15740" aria-labelledby="pdf-2408.15740">pdf</a>, <a href="/format/2408.15740" title="Other formats" id="oth-2408.15740" aria-labelledby="oth-2408.15740">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MambaPlace:Text-to-Point-Cloud Cross-Modal Place Recognition with Attention Mamba Mechanisms </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+T">Tianyi Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Pengjie Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+J">Jinwei Qiao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Vision Language Place Recognition (VLVPR) enhances robot localization performance by incorporating natural language descriptions from images. By utilizing language information, VLVPR directs robot place matching, overcoming the constraint of solely depending on vision. The essence of multimodal fusion lies in mining the complementary information between different modalities. However, general fusion methods rely on traditional neural architectures and are not well equipped to capture the dynamics of cross modal interactions, especially in the presence of complex intra modal and inter modal correlations. To this end, this paper proposes a novel coarse to fine and end to end connected cross modal place recognition framework, called MambaPlace. In the coarse localization stage, the text description and 3D point cloud are encoded by the pretrained T5 and instance encoder, respectively. They are then processed using Text Attention Mamba (TAM) and Point Clouds Mamba (PCM) for data enhancement and alignment. In the subsequent fine localization stage, the features of the text description and 3D point cloud are cross modally fused and further enhanced through cascaded Cross Attention Mamba (CCAM). Finally, we predict the positional offset from the fused text point cloud features, achieving the most accurate localization. Extensive experiments show that MambaPlace achieves improved localization accuracy on the KITTI360Pose dataset compared to the state of the art methods. 
</p> </div> </dd> <dt> <a name='item112'>[112]</a> <a href ="/abs/2409.06490" title="Abstract" id="2409.06490"> arXiv:2409.06490 </a> (replaced) [<a href="/pdf/2409.06490" title="Download PDF" id="pdf-2409.06490" aria-labelledby="pdf-2409.06490">pdf</a>, <a href="https://arxiv.org/html/2409.06490v4" title="View HTML" id="html-2409.06490" aria-labelledby="html-2409.06490" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06490" title="Other formats" id="oth-2409.06490" aria-labelledby="oth-2409.06490">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UAVDB: Trajectory-Guided Adaptable Bounding Boxes for UAV Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yu-Hsi Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 5 figures, 4 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Applications (stat.AP) </div> <p class='mathjax'> The widespread deployment of Unmanned Aerial Vehicles (UAVs) in surveillance, security, and airspace management has created an urgent demand for precise, scalable, and efficient UAV detection. However, existing datasets often suffer from limited scale diversity and inaccurate annotations, hindering robust model development. This paper introduces UAVDB, a high-resolution UAV detection dataset constructed using Patch Intensity Convergence (PIC). This novel technique automatically generates high-fidelity bounding box annotations from UAV trajectory data~\cite{li2020reconstruction}, eliminating the need for manual labeling. UAVDB features single-class annotations with a fixed-camera setup and consists of RGB frames capturing UAVs across various scales, from large-scale UAVs to near-single-pixel representations, along with challenging backgrounds that pose difficulties for modern detectors. We first validate the accuracy and efficiency of PIC-generated bounding boxes by comparing Intersection over Union (IoU) performance and runtime against alternative annotation methods, demonstrating that PIC achieves higher annotation accuracy while being more efficient. Subsequently, we benchmark UAVDB using state-of-the-art (SOTA) YOLO-series detectors, establishing UAVDB as a valuable resource for advancing long-range and high-resolution UAV detection. 
</p> </div> </dd> <dt> <a name='item113'>[113]</a> <a href ="/abs/2409.12448" title="Abstract" id="2409.12448"> arXiv:2409.12448 </a> (replaced) [<a href="/pdf/2409.12448" title="Download PDF" id="pdf-2409.12448" aria-labelledby="pdf-2409.12448">pdf</a>, <a href="https://arxiv.org/html/2409.12448v3" title="View HTML" id="html-2409.12448" aria-labelledby="html-2409.12448" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.12448" title="Other formats" id="oth-2409.12448" aria-labelledby="oth-2409.12448">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Infrared Small Target Detection in Satellite Videos: A New Dataset and A Novel Recurrent Feature Refinement Framework </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+X">Xinyi Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Li Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zaipin Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yangsi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingqian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ruojing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xu Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Boyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+S">Shilin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+W">Wei An</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Multi-frame infrared small target (MIRST) detection in satellite videos is a long-standing, fundamental yet challenging task for decades, and the challenges can be summarized as: First, extremely small target size, highly complex clutters & noises, various satellite motions result in limited feature representation, high false alarms, and difficult motion analyses. Second, the lack of large-scale public available MIRST dataset in satellite videos greatly hinders the algorithm development. To address the aforementioned challenges, in this paper, we first build a large-scale dataset for MIRST detection in satellite videos (namely IRSatVideo-LEO), and then develop a recurrent feature refinement (RFR) framework as the baseline method. Specifically, IRSatVideo-LEO is a semi-simulated dataset with synthesized satellite motion, target appearance, trajectory and intensity, which can provide a standard toolbox for satellite video generation and a reliable evaluation platform to facilitate the algorithm development. For baseline method, RFR is proposed to be equipped with existing powerful CNN-based methods for long-term temporal dependency exploitation and integrated motion compensation & MIRST detection. Specifically, a pyramid deformable alignment (PDA) module and a temporal-spatial-frequency modulation (TSFM) module are proposed to achieve effective and efficient feature alignment, propagation, aggregation and refinement. Extensive experiments have been conducted to demonstrate the effectiveness and superiority of our scheme. The comparative results show that ResUNet equipped with RFR outperforms the state-of-the-art MIRST detection methods. 
Dataset and code are released at <a href="https://github.com/XinyiYing/RFR" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item114'>[114]</a> <a href ="/abs/2409.15529" title="Abstract" id="2409.15529"> arXiv:2409.15529 </a> (replaced) [<a href="/pdf/2409.15529" title="Download PDF" id="pdf-2409.15529" aria-labelledby="pdf-2409.15529">pdf</a>, <a href="https://arxiv.org/html/2409.15529v3" title="View HTML" id="html-2409.15529" aria-labelledby="html-2409.15529" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.15529" title="Other formats" id="oth-2409.15529" aria-labelledby="oth-2409.15529">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VaLID: Verification as Late Integration of Detections for LiDAR-Camera Fusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Vats,+V">Vanshika Vats</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nizam,+M+B">Marzia Binta Nizam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Davis,+J">James Davis</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Vehicle object detection benefits from both LiDAR and camera data, with LiDAR offering superior performance in many scenarios. Fusion of these modalities further enhances accuracy, but existing methods often introduce complexity or dataset-specific dependencies. In our study, we propose a model-adaptive late-fusion method, VaLID, which validates whether each predicted bounding box is acceptable or not. Our method verifies the higher-performing, yet overly optimistic LiDAR model detections using camera detections that are obtained from either specially trained, general, or open-vocabulary models. VaLID uses a lightweight neural verification network trained with a high recall bias to reduce the false predictions made by the LiDAR detector, while still preserving the true ones. Evaluating with multiple combinations of LiDAR and camera detectors on the KITTI dataset, we reduce false positives by an average of 63.9%, thus outperforming the individual detectors on 3D average precision (3DAP). Our approach is model-adaptive and demonstrates state-of-the-art competitive performance even when using generic camera detectors that were not trained specifically for this dataset. 
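</p> <p class='mathjax'>A sketch of a lightweight accept/reject verifier trained with a recall-biased loss, assuming simple per-box features such as the LiDAR score and the best-matching camera detection's IoU and score; the actual VaLID inputs and training setup may differ.</p> <pre>
# Lightweight box verifier (PyTorch) with a recall-biased weighted BCE loss.
# Features and weighting are illustrative.
import torch
import torch.nn as nn

class BoxVerifier(nn.Module):
    def __init__(self, in_dim=3, hidden=32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))

    def forward(self, feats):
        return self.net(feats).squeeze(-1)            # accept logit per LiDAR box

# per-box features: [lidar_score, best_camera_iou, camera_score]
feats = torch.rand(16, 3)
labels = torch.randint(0, 2, (16,)).float()           # 1 = true detection
verifier = BoxVerifier()
# pos_weight above 1 penalizes rejecting true boxes more strongly, i.e. a recall bias
loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))
loss = loss_fn(verifier(feats), labels)
loss.backward()
print(float(loss))
</pre> <p class='mathjax'>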
</p> </div> </dd> <dt> <a name='item115'>[115]</a> <a href ="/abs/2410.00486" title="Abstract" id="2410.00486"> arXiv:2410.00486 </a> (replaced) [<a href="/pdf/2410.00486" title="Download PDF" id="pdf-2410.00486" aria-labelledby="pdf-2410.00486">pdf</a>, <a href="https://arxiv.org/html/2410.00486v3" title="View HTML" id="html-2410.00486" aria-labelledby="html-2410.00486" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.00486" title="Other formats" id="oth-2410.00486" aria-labelledby="oth-2410.00486">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CaRtGS: Computational Alignment for Real-Time Gaussian Splatting SLAM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+D">Dapeng Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhiqiang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yizhen Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+S">Shipeng Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Y">Yuhua Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hongbo Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by IEEE Robotics and Automation Letters (RA-L) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Simultaneous Localization and Mapping (SLAM) is pivotal in robotics, with photorealistic scene reconstruction emerging as a key challenge. To address this, we introduce Computational Alignment for Real-Time Gaussian Splatting SLAM (CaRtGS), a novel method enhancing the efficiency and quality of photorealistic scene reconstruction in real-time environments. Leveraging 3D Gaussian Splatting (3DGS), CaRtGS achieves superior rendering quality and processing speed, which is crucial for scene photorealistic reconstruction. Our approach tackles computational misalignment in Gaussian Splatting SLAM (GS-SLAM) through an adaptive strategy that enhances optimization iterations, addresses long-tail optimization, and refines densification. Experiments on Replica, TUM-RGBD, and VECtor datasets demonstrate CaRtGS's effectiveness in achieving high-fidelity rendering with fewer Gaussian primitives. This work propels SLAM towards real-time, photorealistic dense rendering, significantly advancing photorealistic scene representation. For the benefit of the research community, we release the code and accompanying videos on our project website: <a href="https://dapengfeng.github.io/cartgs" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item116'>[116]</a> <a href ="/abs/2410.03051" title="Abstract" id="2410.03051"> arXiv:2410.03051 </a> (replaced) [<a href="/pdf/2410.03051" title="Download PDF" id="pdf-2410.03051" aria-labelledby="pdf-2410.03051">pdf</a>, <a href="https://arxiv.org/html/2410.03051v2" title="View HTML" id="html-2410.03051" aria-labelledby="html-2410.03051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.03051" title="Other formats" id="oth-2410.03051" aria-labelledby="oth-2410.03051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AuroraCap: Efficient, Performant Video Detailed Captioning and a New Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chai,+W">Wenhao Chai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+E">Enxin Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yilun Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+C">Chenlin Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Madhavan,+V">Vashisht Madhavan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bar-Tal,+O">Omer Bar-Tal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+J">Jenq-Neng Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+S">Saining Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Manning,+C+D">Christopher D. Manning</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICLR 2025. Code, docs, weight, benchmark and training data are all avaliable at <a href="https://rese1f.github.io/aurora-web/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Video detailed captioning is a key task which aims to generate comprehensive and coherent textual descriptions of video content, benefiting both video understanding and generation. In this paper, we propose AuroraCap, a video captioner based on a large multimodal model. We follow the simplest architecture design without additional parameters for temporal modeling. To address the overhead caused by lengthy video sequences, we implement the token merging strategy, reducing the number of input visual tokens. Surprisingly, we found that this strategy results in little performance loss. AuroraCap shows superior performance on various video and image captioning benchmarks, for example, obtaining a CIDEr of 88.9 on Flickr30k, beating GPT-4V (55.3) and Gemini-1.5 Pro (82.2). However, existing video caption benchmarks only include simple descriptions, consisting of a few dozen words, which limits research in this field. Therefore, we develop VDC, a video detailed captioning benchmark with over one thousand carefully annotated structured captions. In addition, we propose a new LLM-assisted metric VDCscore for bettering evaluation, which adopts a divide-and-conquer strategy to transform long caption evaluation into multiple short question-answer pairs. With the help of human Elo ranking, our experiments show that this benchmark better correlates with human judgments of video detailed captioning quality. 
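</p> <p class='mathjax'>Token merging in general can be illustrated with a simplified similarity-based merge that folds the most similar token pairs together, as below; this is a stand-in for the bipartite merging used in token-merging methods, not AuroraCap's exact procedure.</p> <pre>
# Simplified similarity-based token merging (PyTorch): fold the r most similar
# (even, odd) token pairs together by averaging. Illustrative only.
import torch
import torch.nn.functional as F

def merge_tokens(tokens, r):
    # tokens: (seq, dim); split into two alternating sets
    a, b = tokens[0::2], tokens[1::2]
    sim = F.normalize(a, dim=-1) @ F.normalize(b, dim=-1).t()   # (Na, Nb)
    best_sim, best_idx = sim.max(dim=-1)                        # match each a-token to one b-token
    merge_a = best_sim.topk(min(r, a.shape[0])).indices         # a-tokens to merge away
    keep_mask = torch.ones(a.shape[0], dtype=torch.bool)
    keep_mask[merge_a] = False
    b = b.clone()
    # fold merged a-tokens into their matched b-tokens (duplicate matches resolved arbitrarily)
    b[best_idx[merge_a]] = 0.5 * (b[best_idx[merge_a]] + a[merge_a])
    return torch.cat([a[keep_mask], b], dim=0)

toks = torch.randn(256, 768)
print(merge_tokens(toks, r=64).shape)   # torch.Size([192, 768])
</pre> <p class='mathjax'>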
</p> </div> </dd> <dt> <a name='item117'>[117]</a> <a href ="/abs/2410.03858" title="Abstract" id="2410.03858"> arXiv:2410.03858 </a> (replaced) [<a href="/pdf/2410.03858" title="Download PDF" id="pdf-2410.03858" aria-labelledby="pdf-2410.03858">pdf</a>, <a href="https://arxiv.org/html/2410.03858v2" title="View HTML" id="html-2410.03858" aria-labelledby="html-2410.03858" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.03858" title="Other formats" id="oth-2410.03858" aria-labelledby="oth-2410.03858">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pose Prior Learner: Unsupervised Categorical Prior Learning for Pose Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+S">Shuangpeng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mengmi Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> A prior represents a set of beliefs or assumptions about a system, aiding inference and decision-making. In this paper, we introduce the challenge of unsupervised categorical prior learning in pose estimation, where AI models learn a general pose prior for an object category from images in a self-supervised manner. Although priors are effective in estimating pose, acquiring them can be difficult. We propose a novel method, named Pose Prior Learner (PPL), to learn a general pose prior for any object category. PPL uses a hierarchical memory to store compositional parts of prototypical poses, from which we distill a general pose prior. This prior improves pose estimation accuracy through template transformation and image reconstruction. PPL learns meaningful pose priors without any additional human annotations or interventions, outperforming competitive baselines on both human and animal pose estimation datasets. Notably, our experimental results reveal the effectiveness of PPL using learned prototypical poses for pose estimation on occluded images. Through iterative inference, PPL leverages the pose prior to refine estimated poses, regressing them to any prototypical poses stored in memory. Our code, model, and data will be publicly available. 
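A toy sketch of the iterative refinement described above: an estimated pose is repeatedly regressed toward its nearest prototypical pose in a memory bank. The memory contents, keypoint layout, step size, and nearest-prototype rule are illustrative assumptions rather than PPL's actual hierarchical memory or prior-distillation procedure.
<pre><code>
import torch

def refine_pose(pose: torch.Tensor, memory: torch.Tensor,
                steps: int = 5, lam: float = 0.3) -> torch.Tensor:
    """Regress an estimated pose (K keypoints x 2) toward the closest
    prototypical pose in a (M, K, 2) memory bank. Illustrative only."""
    for _ in range(steps):
        dists = (memory - pose).flatten(1).norm(dim=1)   # distance to each prototype
        proto = memory[dists.argmin()]                   # nearest stored prototype
        pose = pose + lam * (proto - pose)               # move part-way toward it
    return pose

memory = torch.randn(10, 17, 2)                 # 10 prototypical poses, 17 keypoints each
noisy = memory[3] + 0.2 * torch.randn(17, 2)    # corrupted estimate, e.g. from occlusion
print(refine_pose(noisy, memory).shape)         # torch.Size([17, 2])
</code></pre>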
</p> </div> </dd> <dt> <a name='item118'>[118]</a> <a href ="/abs/2410.18974" title="Abstract" id="2410.18974"> arXiv:2410.18974 </a> (replaced) [<a href="/pdf/2410.18974" title="Download PDF" id="pdf-2410.18974" aria-labelledby="pdf-2410.18974">pdf</a>, <a href="https://arxiv.org/html/2410.18974v2" title="View HTML" id="html-2410.18974" aria-labelledby="html-2410.18974" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.18974" title="Other formats" id="oth-2410.18974" aria-labelledby="oth-2410.18974">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 3D-Adapter: Geometry-Consistent Multi-View Diffusion for High-Quality 3D Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hansheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+B">Bokui Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yulin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+R">Ruoxi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Linqi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+C+Z">Connor Z. Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jiayuan Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+H">Hao Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wetzstein,+G">Gordon Wetzstein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guibas,+L">Leonidas Guibas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project page: <a href="https://lakonik.github.io/3d-adapter/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Multi-view image diffusion models have significantly advanced open-domain 3D object generation. However, most existing models rely on 2D network architectures that lack inherent 3D biases, resulting in compromised geometric consistency. To address this challenge, we introduce 3D-Adapter, a plug-in module designed to infuse 3D geometry awareness into pretrained image diffusion models. Central to our approach is the idea of 3D feedback augmentation: for each denoising step in the sampling loop, 3D-Adapter decodes intermediate multi-view features into a coherent 3D representation, then re-encodes the rendered RGBD views to augment the pretrained base model through feature addition. We study two variants of 3D-Adapter: a fast feed-forward version based on Gaussian splatting and a versatile training-free version utilizing neural fields and meshes. Our extensive experiments demonstrate that 3D-Adapter not only greatly enhances the geometry quality of text-to-multi-view models such as Instant3D and Zero123++, but also enables high-quality 3D generation using the plain text-to-image Stable Diffusion. Furthermore, we showcase the broad application potential of 3D-Adapter by presenting high quality results in text-to-3D, image-to-3D, text-to-texture, and text-to-avatar tasks. 
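The 3D feedback augmentation loop can be summarized as follows; every callable below (features, decode_to_3d, render_views, encode_guidance, denoise_step) is a hypothetical stand-in for a component named in the abstract, and the toy lambdas exist only to exercise the control flow.
<pre><code>
import torch

def sample_with_3d_feedback(x_t, timesteps, features, decode_to_3d, render_views,
                            encode_guidance, denoise_step):
    """Schematic reverse-diffusion loop with 3D feedback augmentation: at every step,
    intermediate multi-view features are lifted to a 3D representation, re-rendered,
    re-encoded, and added back as guidance. All callables are hypothetical stand-ins."""
    for t in timesteps:
        feats = features(x_t, t)            # intermediate multi-view features
        scene = decode_to_3d(feats)         # coherent 3D representation
        rendered = render_views(scene)      # re-rendered RGBD views
        guidance = encode_guidance(rendered, t)
        x_t = denoise_step(x_t, t, guidance)  # feature-addition guidance
    return x_t

# toy stand-ins just to show the data flow
x = torch.randn(4, 3, 64, 64)               # 4 views
out = sample_with_3d_feedback(
    x, timesteps=range(10, 0, -1),
    features=lambda x, t: x.mean(dim=1, keepdim=True),
    decode_to_3d=lambda f: f,
    render_views=lambda s: s.repeat(1, 3, 1, 1),
    encode_guidance=lambda r, t: 0.1 * r,
    denoise_step=lambda x, t, g: x - 0.05 * x + g,
)
print(out.shape)                            # torch.Size([4, 3, 64, 64])
</code></pre>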
</p> </div> </dd> <dt> <a name='item119'>[119]</a> <a href ="/abs/2410.22454" title="Abstract" id="2410.22454"> arXiv:2410.22454 </a> (replaced) [<a href="/pdf/2410.22454" title="Download PDF" id="pdf-2410.22454" aria-labelledby="pdf-2410.22454">pdf</a>, <a href="/format/2410.22454" title="Other formats" id="oth-2410.22454" aria-labelledby="oth-2410.22454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Brain age identification from diffusion MRI synergistically predicts neurodegenerative disease </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chenyu Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M+E">Michael E. Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramadass,+K">Karthik Ramadass</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanakaraj,+P">Praitayini Kanakaraj</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishnan,+A+R">Aravind R. Krishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saunders,+A+M">Adam M. Saunders</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Newlin,+N+R">Nancy R. Newlin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H+H">Ho Hin Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Q">Qi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taylor,+W+D">Warren D. Taylor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boyd,+B+D">Brian D. Boyd</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beason-Held,+L+L">Lori L. Beason-Held</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Resnick,+S+M">Susan M. Resnick</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barnes,+L+L">Lisa L. Barnes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bennett,+D+A">David A. Bennett</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Van+Schaik,+K+D">Katherine D. Van Schaik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Archer,+D+B">Derek B. Archer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hohman,+T+J">Timothy J. Hohman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jefferson,+A+L">Angela L. Jefferson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=I%C5%A1gum,+I">Ivana Išgum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Moyer,+D">Daniel Moyer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+Y">Yuankai Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schilling,+K+G">Kurt G. Schilling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zuo,+L">Lianrui Zuo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+S">Shunxing Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khairi,+N+M">Nazirah Mohd Khairi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhiyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Davatzikos,+C">Christos Davatzikos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Landman,+B+A">Bennett A. 
Landman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Estimated brain age from magnetic resonance image (MRI) and its deviation from chronological age can provide early insights into potential neurodegenerative diseases, supporting early detection and implementation of prevention strategies. Diffusion MRI (dMRI) presents an opportunity to build an earlier biomarker for neurodegenerative disease prediction because it captures subtle microstructural changes that precede more perceptible macrostructural changes. However, the coexistence of macro- and micro-structural information in dMRI raises the question of whether current dMRI-based brain age estimation models are leveraging the intended microstructural information or if they inadvertently rely on the macrostructural information. To develop a microstructure-specific brain age, we propose a method for brain age identification from dMRI that mitigates the model's use of macrostructural information by non-rigidly registering all images to a standard template. Imaging data from 13,398 participants across 12 datasets were used for the training and evaluation. We compare our brain age models, trained with and without macrostructural information mitigated, with an architecturally similar T1-weighted (T1w) MRI-based brain age model and two recent, popular, openly available T1w MRI-based brain age models that primarily use macrostructural information. We observe difference between our dMRI-based brain age and T1w MRI-based brain age across stages of neurodegeneration, with dMRI-based brain age being older than T1w MRI-based brain age in participants transitioning from cognitively normal (CN) to mild cognitive impairment (MCI), but younger in participants already diagnosed with Alzheimer's disease (AD). Furthermore, dMRI-based brain age may offer advantages over T1w MRI-based brain age in predicting the transition from CN to MCI up to five years before diagnosis. 
</p> </div> </dd> <dt> <a name='item120'>[120]</a> <a href ="/abs/2411.07742" title="Abstract" id="2411.07742"> arXiv:2411.07742 </a> (replaced) [<a href="/pdf/2411.07742" title="Download PDF" id="pdf-2411.07742" aria-labelledby="pdf-2411.07742">pdf</a>, <a href="https://arxiv.org/html/2411.07742v3" title="View HTML" id="html-2411.07742" aria-labelledby="html-2411.07742" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07742" title="Other formats" id="oth-2411.07742" aria-labelledby="oth-2411.07742">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient 3D Perception on Multi-Sweep Point Cloud with Gumbel Spatial Pruning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianyu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jianhao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xueqian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhongdao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+B">Bailan Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Hengshuang Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> This paper studies point cloud perception within outdoor environments. Existing methods face limitations in recognizing objects located at a distance or occluded, due to the sparse nature of outdoor point clouds. In this work, we observe a significant mitigation of this problem by accumulating multiple temporally consecutive point cloud sweeps, resulting in a remarkable improvement in perception accuracy. However, the computation cost also increases, hindering previous approaches from utilizing a large number of point cloud sweeps. To tackle this challenge, we find that a considerable portion of points in the accumulated point cloud is redundant, and discarding these points has minimal impact on perception accuracy. We introduce a simple yet effective Gumbel Spatial Pruning (GSP) layer that dynamically prunes points based on a learned end-to-end sampling. The GSP layer is decoupled from other network components and thus can be seamlessly integrated into existing point cloud network architectures. Without incurring additional computational overhead, we increase the number of point cloud sweeps from 10, a common practice, to as many as 40. Consequently, there is a significant enhancement in perception performance. For instance, in nuScenes 3D object detection and BEV map segmentation tasks, our pruning strategy improves several 3D perception baseline methods. 
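A minimal sketch of the learned pruning idea: a small per-point scoring network plus a hard (straight-through) Gumbel-softmax yields a differentiable keep/drop mask over points from accumulated sweeps. The layer sizes, two-way logits, and masking scheme are illustrative assumptions, not the exact GSP layer.
<pre><code>
import torch
import torch.nn as nn
import torch.nn.functional as F

class GumbelPointPruning(nn.Module):
    """Illustrative point-pruning layer: per-point keep/drop logits are sampled with a
    hard (straight-through) Gumbel-softmax, so the pruning is learned end to end."""
    def __init__(self, feat_dim: int, tau: float = 1.0):
        super().__init__()
        self.score = nn.Sequential(nn.Linear(feat_dim, 64), nn.ReLU(), nn.Linear(64, 2))
        self.tau = tau

    def forward(self, feats: torch.Tensor):
        # feats: (B, N, C) per-point features from accumulated sweeps
        logits = self.score(feats)                                        # (B, N, 2)
        keep = F.gumbel_softmax(logits, tau=self.tau, hard=True)[..., 1]  # (B, N) in {0, 1}
        return feats * keep.unsqueeze(-1), keep                           # masked feats, mask

layer = GumbelPointPruning(feat_dim=32)
pts = torch.randn(2, 4096, 32)
pruned, mask = layer(pts)
print(pruned.shape, mask.sum(dim=1))   # kept-point counts per sample
</code></pre>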
</p> </div> </dd> <dt> <a name='item121'>[121]</a> <a href ="/abs/2501.14679" title="Abstract" id="2501.14679"> arXiv:2501.14679 </a> (replaced) [<a href="/pdf/2501.14679" title="Download PDF" id="pdf-2501.14679" aria-labelledby="pdf-2501.14679">pdf</a>, <a href="https://arxiv.org/html/2501.14679v5" title="View HTML" id="html-2501.14679" aria-labelledby="html-2501.14679" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.14679" title="Other formats" id="oth-2501.14679" aria-labelledby="oth-2501.14679">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Surface Vision Mamba: Leveraging Bidirectional State Space Model for Efficient Spherical Manifold Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+R">Rongzhao He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+W">Weihao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Leilei Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Ying Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+D">Dalin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Dan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+B">Bin Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Attention-based methods have demonstrated exceptional performance in modelling long-range dependencies on spherical cortical surfaces, surpassing traditional Geometric Deep Learning (GDL) models. However, their extensive inference time and high memory demands pose challenges for application to large datasets with limited computing resources. Inspired by the state space model in computer vision, we introduce the attention-free Vision Mamba (Vim) to spherical surfaces, presenting a domain-agnostic architecture for analyzing data on spherical manifolds. Our method achieves surface patching by representing spherical data as a sequence of triangular patches derived from a subdivided icosphere. The proposed Surface Vision Mamba (SiM) is evaluated on multiple neurodevelopmental phenotype regression tasks using cortical surface metrics from neonatal brains. Experimental results demonstrate that SiM outperforms both attention- and GDL-based methods, delivering 4.8 times faster inference and achieving 91.7% lower memory consumption compared to the Surface Vision Transformer (SiT) under the Ico-4 grid partitioning. Sensitivity analysis further underscores the potential of SiM to identify subtle cognitive developmental patterns. The code is available at <a href="https://github.com/Rongzhao-He/surface-vision-mamba" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
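A rough sketch of the surface-patching step described above: per-vertex cortical metrics are gathered into triangular patches via an index table and linearly embedded into a token sequence that a state-space (or attention) sequence model can consume. The vertex and patch counts and the randomly generated index table below are placeholders for a real subdivided-icosphere patching.
<pre><code>
import torch
import torch.nn as nn

# Illustrative shapes (roughly Ico-4-like) and a fake patch index table.
num_vertices, num_patches, verts_per_patch, channels = 2562, 320, 45, 4
patch_index = torch.randint(0, num_vertices, (num_patches, verts_per_patch))  # stand-in

metrics = torch.randn(num_vertices, channels)     # per-vertex cortical surface metrics
patches = metrics[patch_index]                    # (320, 45, 4) triangular patches
embed = nn.Linear(verts_per_patch * channels, 192)
tokens = embed(patches.flatten(1))                # token sequence for a SiM/SiT-style model
print(tokens.shape)                               # torch.Size([320, 192])
</code></pre>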
</p> </div> </dd> <dt> <a name='item122'>[122]</a> <a href ="/abs/2502.01401" title="Abstract" id="2502.01401"> arXiv:2502.01401 </a> (replaced) [<a href="/pdf/2502.01401" title="Download PDF" id="pdf-2502.01401" aria-labelledby="pdf-2502.01401">pdf</a>, <a href="https://arxiv.org/html/2502.01401v3" title="View HTML" id="html-2502.01401" aria-labelledby="html-2502.01401" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.01401" title="Other formats" id="oth-2502.01401" aria-labelledby="oth-2502.01401">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evolving Symbolic 3D Visual Grounder with Weakly Supervised Reflection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+B">Boyu Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hanqing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yilun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+J">Jiangmiao Pang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> 3D visual grounding (3DVG) is challenging because of the requirement of understanding on visual information, language and spatial relationships. While supervised approaches have achieved superior performance, they are constrained by the scarcity and high cost of 3D vision-language datasets. On the other hand, LLM/VLM based agents are proposed for 3DVG, eliminating the need for training data. However, these methods incur prohibitive time and token costs during inference. To address the challenges, we introduce a novel training-free symbolic framework for 3D visual grounding, namely Evolvable Symbolic Visual Grounder, that offers significantly reduced inference costs compared to previous agent-based methods while maintaining comparable performance. EaSe uses LLM generated codes to compute on spatial relationships. EaSe also implements an automatic pipeline to evaluate and optimize the quality of these codes and integrate VLMs to assist in the grounding process. Experimental results demonstrate that EaSe achieves 52.9% accuracy on Nr3D dataset and 49.2% Acc@0.25 on ScanRefer, which is top-tier among training-free methods. Moreover, it substantially reduces the inference time and cost, offering a balanced trade-off between performance and efficiency. Codes are available at <a href="https://github.com/OpenRobotLab/EaSe" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item123'>[123]</a> <a href ="/abs/2502.06527" title="Abstract" id="2502.06527"> arXiv:2502.06527 </a> (replaced) [<a href="/pdf/2502.06527" title="Download PDF" id="pdf-2502.06527" aria-labelledby="pdf-2502.06527">pdf</a>, <a href="/format/2502.06527" title="Other formats" id="oth-2502.06527" aria-labelledby="oth-2502.06527">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CustomVideoX: 3D Reference Attention Driven Dynamic Adaptation for Zero-Shot Customized Video Diffusion Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=She,+D">D. 
She</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Mushui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+J">Jingxuan Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zhen Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wanggui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+G">Guanghao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Q">Qihan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Haobin Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yunlong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+S">Siming Fu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Section 4 in CustomVideoX Entity Region-Aware Enhancement has description errors. The compared methods data of Table I lacks other metrics </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Customized generation has achieved significant progress in image synthesis, yet personalized video generation remains challenging due to temporal inconsistencies and quality degradation. In this paper, we introduce CustomVideoX, an innovative framework leveraging the video diffusion transformer for personalized video generation from a reference image. CustomVideoX capitalizes on pre-trained video networks by exclusively training the LoRA parameters to extract reference features, ensuring both efficiency and adaptability. To facilitate seamless interaction between the reference image and video content, we propose 3D Reference Attention, which enables direct and simultaneous engagement of reference image features with all video frames across spatial and temporal dimensions. To mitigate the excessive influence of reference image features and textual guidance on generated video content during inference, we implement the Time-Aware Reference Attention Bias (TAB) strategy, dynamically modulating reference bias over different time steps. Additionally, we introduce the Entity Region-Aware Enhancement (ERAE) module, aligning highly activated regions of key entity tokens with reference feature injection by adjusting attention bias. To thoroughly evaluate personalized video generation, we establish a new benchmark, VideoBench, comprising over 50 objects and 100 prompts for extensive assessment. Experimental results show that CustomVideoX significantly outperforms existing methods in terms of video consistency and quality. 
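A schematic of the time-aware reference bias idea: video tokens attend jointly to video and reference-image tokens, and a time-dependent bias is added only to the reference scores so that reference influence is modulated across denoising steps. The linear bias schedule and tensor shapes are assumptions for illustration, not the paper's exact TAB formulation.
<pre><code>
import torch

def attention_with_time_aware_reference_bias(q, k_vid, v_vid, k_ref, v_ref,
                                             t, num_steps, max_bias=2.0):
    """Joint attention over video and reference tokens; a time-dependent bias is added
    only to the reference-token scores, so reference influence changes over denoising
    steps. Illustrative sketch only."""
    scale = q.shape[-1] ** -0.5
    s_vid = (q @ k_vid.transpose(-2, -1)) * scale            # (B, Nq, Nvid)
    s_ref = (q @ k_ref.transpose(-2, -1)) * scale            # (B, Nq, Nref)
    bias = max_bias * (1.0 - t / num_steps)                  # e.g. weaker reference early
    scores = torch.cat([s_vid, s_ref + bias], dim=-1)
    attn = torch.softmax(scores, dim=-1)
    return attn @ torch.cat([v_vid, v_ref], dim=1)

q = torch.randn(1, 1024, 64)          # tokens of all video frames (queries)
k_vid, v_vid = torch.randn(1, 1024, 64), torch.randn(1, 1024, 64)
k_ref, v_ref = torch.randn(1, 256, 64), torch.randn(1, 256, 64)
print(attention_with_time_aware_reference_bias(q, k_vid, v_vid, k_ref, v_ref,
                                               t=800, num_steps=1000).shape)
</code></pre>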
</p> </div> </dd> <dt> <a name='item124'>[124]</a> <a href ="/abs/2502.07172" title="Abstract" id="2502.07172"> arXiv:2502.07172 </a> (replaced) [<a href="/pdf/2502.07172" title="Download PDF" id="pdf-2502.07172" aria-labelledby="pdf-2502.07172">pdf</a>, <a href="https://arxiv.org/html/2502.07172v3" title="View HTML" id="html-2502.07172" aria-labelledby="html-2502.07172" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07172" title="Other formats" id="oth-2502.07172" aria-labelledby="oth-2502.07172">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SemiHMER: Semi-supervised Handwritten Mathematical Expression Recognition using pseudo-labels </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehua Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Haoyang Shen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages,3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> In this paper, we study semi-supervised Handwritten Mathematical Expression Recognition (HMER) via exploring both labeled data and extra unlabeled data. We propose a novel consistency regularization framework, termed SemiHMER, which introduces dual-branch semi-supervised learning. Specifically, we enforce consistency between the two networks for the same input image. The pseudo-label, generated by one perturbed recognition network, is utilized to supervise the other network using the standard cross-entropy loss. The SemiHMER consistency encourages high similarity between the predictions of the two perturbed networks for the same input image and expands the training data by leveraging unlabeled data with pseudo-labels. We further introduce a weak-to-strong strategy by applying different levels of augmentation to each branch, effectively expanding the training data and enhancing the quality of network training. Additionally, we propose a novel module, the Global Dynamic Counting Module (GDCM), to enhance the performance of the HMER decoder by alleviating recognition inaccuracies in long-distance formula recognition and reducing the occurrence of repeated characters. The experimental results demonstrate that our work achieves significant performance improvements, with an average accuracy increase of 5.47% on CROHME14, 4.87% on CROHME16, and 5.25% on CROHME19, compared to our baselines. 
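A minimal sketch of the dual-branch consistency term, written for a generic classifier rather than the full handwritten-expression recognizer: each perturbed branch is supervised with cross-entropy against pseudo-labels produced by the other branch on a weakly augmented view, while training on a strongly augmented view. Network sizes, augmentations, and loss weighting are illustrative assumptions.
<pre><code>
import torch
import torch.nn as nn
import torch.nn.functional as F

def cross_pseudo_label_loss(model_a, model_b, weak_x, strong_x):
    """Each branch learns from the other's pseudo-labels: predictions on the weakly
    augmented view give hard labels for the strongly augmented view. Generic
    classification sketch of the consistency term, not the full SemiHMER loss."""
    with torch.no_grad():
        pl_a = model_a(weak_x).argmax(dim=-1)      # pseudo-labels from branch A
        pl_b = model_b(weak_x).argmax(dim=-1)      # pseudo-labels from branch B
    loss_a = F.cross_entropy(model_a(strong_x), pl_b)   # A supervised by B
    loss_b = F.cross_entropy(model_b(strong_x), pl_a)   # B supervised by A
    return loss_a + loss_b

# toy usage with two small perturbed "recognition networks"
net_a = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 101))
net_b = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 101))
x = torch.randn(8, 128)
weak, strong = x + 0.01 * torch.randn_like(x), x + 0.1 * torch.randn_like(x)
print(cross_pseudo_label_loss(net_a, net_b, weak, strong))
</code></pre>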
</p> </div> </dd> <dt> <a name='item125'>[125]</a> <a href ="/abs/2502.09873" title="Abstract" id="2502.09873"> arXiv:2502.09873 </a> (replaced) [<a href="/pdf/2502.09873" title="Download PDF" id="pdf-2502.09873" aria-labelledby="pdf-2502.09873">pdf</a>, <a href="https://arxiv.org/html/2502.09873v2" title="View HTML" id="html-2502.09873" aria-labelledby="html-2502.09873" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09873" title="Other formats" id="oth-2502.09873" aria-labelledby="oth-2502.09873">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Compression-Aware One-Step Diffusion Model for JPEG Artifact Removal </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jinpei Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenbo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Y">Yong Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yulun Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Diffusion models have demonstrated remarkable success in image restoration tasks. However, their multi-step denoising process introduces significant computational overhead, limiting their practical deployment. Furthermore, existing methods struggle to effectively remove severe JPEG artifact, especially in highly compressed images. To address these challenges, we propose CODiff, a compression-aware one-step diffusion model for JPEG artifact removal. The core of CODiff is the compression-aware visual embedder (CaVE), which extracts and leverages JPEG compression priors to guide the diffusion model. We propose a dual learning strategy that combines explicit and implicit learning. Specifically, explicit learning enforces a quality prediction objective to differentiate low-quality images with different compression levels. Implicit learning employs a reconstruction objective that enhances the model's generalization. This dual learning allows for a deeper and more comprehensive understanding of JPEG compression. Experimental results demonstrate that CODiff surpasses recent leading methods in both quantitative and visual quality metrics. The code and models will be released at <a href="https://github.com/jp-guo/CODiff" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
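The dual learning strategy can be sketched as a single combined objective on a toy embedder: an explicit quality-regression head distinguishes compression levels, while an implicit reconstruction branch encourages generalizable features. The architecture, heads, and equal loss weighting below are hypothetical, not CaVE itself.
<pre><code>
import torch
import torch.nn as nn
import torch.nn.functional as F

class CompressionAwareEmbedder(nn.Module):
    """Toy stand-in for a compression-aware visual embedder trained with a dual
    objective: predict the JPEG quality factor (explicit) and help reconstruct
    the clean image (implicit). Architecture is illustrative."""
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
                                      nn.AdaptiveAvgPool2d(8), nn.Flatten(),
                                      nn.Linear(32 * 64, 256))
        self.quality_head = nn.Linear(256, 1)            # explicit: quality regression
        self.decoder = nn.Linear(256, 3 * 32 * 32)       # implicit: crude reconstruction

    def forward(self, jpeg_img):
        z = self.backbone(jpeg_img)
        return self.quality_head(z).squeeze(-1), self.decoder(z).view(-1, 3, 32, 32)

model = CompressionAwareEmbedder()
jpeg, clean = torch.randn(4, 3, 32, 32), torch.randn(4, 3, 32, 32)
quality = torch.tensor([10.0, 30.0, 50.0, 90.0])          # ground-truth quality factors
q_pred, recon = model(jpeg)
loss = F.mse_loss(q_pred, quality) + F.l1_loss(recon, clean)   # explicit + implicit terms
print(loss)
</code></pre>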
</p> </div> </dd> <dt> <a name='item126'>[126]</a> <a href ="/abs/2502.11859" title="Abstract" id="2502.11859"> arXiv:2502.11859 </a> (replaced) [<a href="/pdf/2502.11859" title="Download PDF" id="pdf-2502.11859" aria-labelledby="pdf-2502.11859">pdf</a>, <a href="https://arxiv.org/html/2502.11859v2" title="View HTML" id="html-2502.11859" aria-labelledby="html-2502.11859" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11859" title="Other formats" id="oth-2502.11859" aria-labelledby="oth-2502.11859">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Defining and Evaluating Visual Language Models' Basic Spatial Abilities: A Perspective from Psychometrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wenrui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+D">Dalin Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weihang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chen Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yong Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The Theory of Multiple Intelligences underscores the hierarchical nature of cognitive capabilities. To advance Spatial Artificial Intelligence, we pioneer a psychometric framework defining five Basic Spatial Abilities (BSAs) in Visual Language Models (VLMs): Spatial Perception, Spatial Relation, Spatial Orientation, Mental Rotation, and Spatial Visualization. Benchmarking 13 mainstream VLMs through nine validated psychometric experiments reveals significant gaps versus humans (average score 24.95 vs. 68.38), with three key findings: 1) VLMs mirror human hierarchies (strongest in 2D orientation, weakest in 3D rotation) with independent BSAs (Pearson's r<0.4); 2) Smaller models such as Qwen2-VL-7B surpass larger counterparts, with Qwen leading (30.82) and InternVL2 lagging (19.6); 3) Interventions like chain-of-thought (0.100 accuracy gain) and 5-shot training (0.259 improvement) show limits from architectural constraints. Identified barriers include weak geometry encoding and missing dynamic simulation. By linking psychometric BSAs to VLM capabilities, we provide a diagnostic toolkit for spatial intelligence evaluation, methodological foundations for embodied AI development, and a cognitive science-informed roadmap for achieving human-like spatial intelligence. 
</p> </div> </dd> <dt> <a name='item127'>[127]</a> <a href ="/abs/2502.12138" title="Abstract" id="2502.12138"> arXiv:2502.12138 </a> (replaced) [<a href="/pdf/2502.12138" title="Download PDF" id="pdf-2502.12138" aria-labelledby="pdf-2502.12138">pdf</a>, <a href="https://arxiv.org/html/2502.12138v2" title="View HTML" id="html-2502.12138" aria-labelledby="html-2502.12138" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12138" title="Other formats" id="oth-2502.12138" aria-labelledby="oth-2502.12138">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FLARE: Feed-forward Geometry, Appearance and Camera Estimation from Uncalibrated Sparse Views </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shangzhan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jianyuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yinghao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+N">Nan Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rupprecht,+C">Christian Rupprecht</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaowei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yujun Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wetzstein,+G">Gordon Wetzstein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages. Website: <a href="https://zhanghe3z.github.io/FLARE/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We present FLARE, a feed-forward model designed to infer high-quality camera poses and 3D geometry from uncalibrated sparse-view images (i.e., as few as 2-8 inputs), which is a challenging yet practical setting in real-world applications. Our solution features a cascaded learning paradigm with camera pose serving as the critical bridge, recognizing its essential role in mapping 3D structures onto 2D image planes. Concretely, FLARE starts with camera pose estimation, whose results condition the subsequent learning of geometric structure and appearance, optimized through the objectives of geometry reconstruction and novel-view synthesis. Utilizing large-scale public datasets for training, our method delivers state-of-the-art performance in the tasks of pose estimation, geometry reconstruction, and novel view synthesis, while maintaining the inference efficiency (i.e., less than 0.5 seconds). 
The project page and code can be found at: <a href="https://zhanghe3z.github.io/FLARE/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item128'>[128]</a> <a href ="/abs/2210.06330" title="Abstract" id="2210.06330"> arXiv:2210.06330 </a> (replaced) [<a href="/pdf/2210.06330" title="Download PDF" id="pdf-2210.06330" aria-labelledby="pdf-2210.06330">pdf</a>, <a href="https://arxiv.org/html/2210.06330v2" title="View HTML" id="html-2210.06330" aria-labelledby="html-2210.06330" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2210.06330" title="Other formats" id="oth-2210.06330" aria-labelledby="oth-2210.06330">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CoRRECT: A Deep Unfolding Framework for Motion-Corrected Quantitative R2* Mapping </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Xu,+X">Xiaojian Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Gan,+W">Weijie Gan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kothapalli,+S+V">Satya V.V.N. Kothapalli</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yablonskiy,+D+A">Dmitriy A. Yablonskiy</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kamilov,+U+S">Ulugbek S. Kamilov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Quantitative MRI (qMRI) refers to a class of MRI methods for quantifying the spatial distribution of biological tissue parameters. Traditional qMRI methods usually deal separately with artifacts arising from accelerated data acquisition, involuntary physical motion, and magnetic-field inhomogeneities, leading to suboptimal end-to-end performance. This paper presents CoRRECT, a unified deep unfolding (DU) framework for qMRI consisting of a model-based end-to-end neural network, a method for motion-artifact reduction, and a self-supervised learning scheme. The network is trained to produce R2* maps whose k-space data matches the real data by also accounting for motion and field inhomogeneities. When deployed, CoRRECT only uses the k-space data without any pre-computed parameters for motion or inhomogeneity correction. Our results on experimentally collected multi-Gradient-Recalled Echo (mGRE) MRI data show that CoRRECT recovers motion and inhomogeneity artifact-free R2* maps in highly accelerated acquisition settings. This work opens the door to DU methods that can integrate physical measurement models, biophysical signal models, and learned prior models for high-quality qMRI. 
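For readers unfamiliar with deep unfolding, the generic pattern is a fixed number of unrolled iterations alternating a physics-based data-consistency gradient step with a small learned refinement network. The sketch below uses a simple linear forward operator A as a stand-in, not CoRRECT's motion- and field-inhomogeneity-aware measurement model.
<pre><code>
import torch
import torch.nn as nn

class UnfoldedRecon(nn.Module):
    """Generic deep-unfolding sketch: K unrolled iterations of a gradient step on the
    data-fidelity term followed by a small learned denoiser. The forward operator A
    and network sizes are illustrative stand-ins."""
    def __init__(self, dim: int, iters: int = 6):
        super().__init__()
        self.step = nn.Parameter(torch.full((iters,), 0.1))   # learned step sizes
        self.denoisers = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
            for _ in range(iters))

    def forward(self, y, A):
        x = A.T @ y                                           # simple initialization
        for k, denoise in enumerate(self.denoisers):
            grad = A.T @ (A @ x - y)                          # data-consistency gradient
            x = x - self.step[k] * grad
            x = x + denoise(x)                                # learned prior / refinement
        return x

dim = 64
A = torch.randn(32, dim) / dim ** 0.5
y = A @ torch.randn(dim)                                      # simulated measurements
print(UnfoldedRecon(dim)(y, A).shape)                         # torch.Size([64])
</code></pre>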
</p> </div> </dd> <dt> <a name='item129'>[129]</a> <a href ="/abs/2212.12322" title="Abstract" id="2212.12322"> arXiv:2212.12322 </a> (replaced) [<a href="/pdf/2212.12322" title="Download PDF" id="pdf-2212.12322" aria-labelledby="pdf-2212.12322">pdf</a>, <a href="https://arxiv.org/html/2212.12322v4" title="View HTML" id="html-2212.12322" aria-labelledby="html-2212.12322" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2212.12322" title="Other formats" id="oth-2212.12322" aria-labelledby="oth-2212.12322">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Infrared Image Super-Resolution: Systematic Review, and Future Trends </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Huang,+Y">Yongsong Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Miyazaki,+T">Tomo Miyazaki</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liu,+X">Xiaofeng Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Omachi,+S">Shinichiro Omachi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This work has been submitted to the Pattern Recognition for possible publication </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Image Super-Resolution (SR) is essential for a wide range of computer vision and image processing tasks. Investigating infrared (IR) image (or thermal images) super-resolution is a continuing concern within the development of deep learning. This survey aims to provide a comprehensive perspective of IR image super-resolution, including its applications, hardware imaging system dilemmas, and taxonomy of image processing methodologies. In addition, the datasets and evaluation metrics in IR image super-resolution tasks are also discussed. Furthermore, the deficiencies in current technologies and possible promising directions for the community to explore are highlighted. 
To cope with the rapid development in this field, we intend to regularly update the relevant excellent work at \url{<a href="https://github.com/yongsongH/Infrared_Image_SR_Survey" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item130'>[130]</a> <a href ="/abs/2310.02664" title="Abstract" id="2310.02664"> arXiv:2310.02664 </a> (replaced) [<a href="/pdf/2310.02664" title="Download PDF" id="pdf-2310.02664" aria-labelledby="pdf-2310.02664">pdf</a>, <a href="https://arxiv.org/html/2310.02664v2" title="View HTML" id="html-2310.02664" aria-labelledby="html-2310.02664" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.02664" title="Other formats" id="oth-2310.02664" aria-labelledby="oth-2310.02664">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On Memorization in Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+X">Xiangming Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+C">Chao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+T">Tianyu Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chongxuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+M">Min Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Ye Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> TMLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Due to their capacity to generate novel and high-quality samples, diffusion models have attracted significant research interest in recent years. Notably, the typical training objective of diffusion models, i.e., denoising score matching, has a closed-form optimal solution that can only generate training data replicating samples. This indicates that a memorization behavior is theoretically expected, which contradicts the common generalization ability of state-of-the-art diffusion models, and thus calls for a deeper understanding. Looking into this, we first observe that memorization behaviors tend to occur on smaller-sized datasets, which motivates our definition of effective model memorization (EMM), a metric measuring the maximum size of training data at which a learned diffusion model approximates its theoretical optimum. Then, we quantify the impact of the influential factors on these memorization behaviors in terms of EMM, focusing primarily on data distribution, model configuration, and training procedure. Besides comprehensive empirical results identifying the influential factors, we surprisingly find that conditioning training data on uninformative random labels can significantly trigger the memorization in diffusion models. Our study holds practical significance for diffusion model users and offers clues to theoretical research in deep generative models. Code is available at <a href="https://github.com/sail-sg/DiffMemorize" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
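For context on the closed-form optimum mentioned above (a standard result, stated here in generic notation rather than this paper's): for an empirical training set $\{x_i\}_{i=1}^N$ and Gaussian noise level $\sigma$, the minimizer of the denoising objective is the posterior mean $$D^\star(x,\sigma)=\sum_{i=1}^{N}\frac{\exp\!\big(-\|x-x_i\|^2/2\sigma^2\big)}{\sum_{j=1}^{N}\exp\!\big(-\|x-x_j\|^2/2\sigma^2\big)}\,x_i,$$ a softmax-weighted average of training samples; as $\sigma \to 0$ this collapses onto the nearest training point, which is why sampling from the exact optimum can only replicate training data, and why the EMM metric asks how large the training set must be before a learned model stops approximating this optimum.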
</p> </div> </dd> <dt> <a name='item131'>[131]</a> <a href ="/abs/2310.13344" title="Abstract" id="2310.13344"> arXiv:2310.13344 </a> (replaced) [<a href="/pdf/2310.13344" title="Download PDF" id="pdf-2310.13344" aria-labelledby="pdf-2310.13344">pdf</a>, <a href="https://arxiv.org/html/2310.13344v2" title="View HTML" id="html-2310.13344" aria-labelledby="html-2310.13344" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.13344" title="Other formats" id="oth-2310.13344" aria-labelledby="oth-2310.13344">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DeepFracture: A Generative Approach for Predicting Brittle Fractures with Neural Discrete Representation Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yuhang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanai,+T">Takashi Kanai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is a preprint of an article published in the Computer Graphics Forum. The final authenticated version is available at (<a href="https://doi.org/10.1111/cgf.70002" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). Please also check the project page: <a href="https://nikoloside.github.io/deepfracture/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Computer Graphics Forum, 15 pages, e70002, 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> In the field of brittle fracture animation, generating realistic destruction animations using physics-based simulation methods is computationally expensive. While techniques based on Voronoi diagrams or pre-fractured patterns are effective for real-time applications, they fail to incorporate collision conditions when determining fractured shapes during runtime. This paper introduces a novel learning-based approach for predicting fractured shapes based on collision dynamics at runtime. Our approach seamlessly integrates realistic brittle fracture animations with rigid body simulations, utilising boundary element method (BEM) brittle fracture simulations to generate training data. To integrate collision scenarios and fractured shapes into a deep learning framework, we introduce generative geometric segmentation, distinct from both instance and semantic segmentation, to represent 3D fragment shapes. We propose an eight-dimensional latent code to address the challenge of optimising multiple discrete fracture pattern targets that share similar continuous collision latent codes. This code will follow a discrete normal distribution corresponding to a specific fracture pattern within our latent impulse representation design. This adaptation enables the prediction of fractured shapes using neural discrete representation learning. Our experimental results show that our approach generates considerably more detailed brittle fractures than existing techniques, while the computational time is typically reduced compared to traditional simulation methods at comparable resolutions. 
</p> </div> </dd> <dt> <a name='item132'>[132]</a> <a href ="/abs/2311.08816" title="Abstract" id="2311.08816"> arXiv:2311.08816 </a> (replaced) [<a href="/pdf/2311.08816" title="Download PDF" id="pdf-2311.08816" aria-labelledby="pdf-2311.08816">pdf</a>, <a href="https://arxiv.org/html/2311.08816v2" title="View HTML" id="html-2311.08816" aria-labelledby="html-2311.08816" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.08816" title="Other formats" id="oth-2311.08816" aria-labelledby="oth-2311.08816">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Texture and Noise Dual Adaptation for Infrared Image Super-Resolution </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Huang,+Y">Yongsong Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Miyazaki,+T">Tomo Miyazaki</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liu,+X">Xiaofeng Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dong,+Y">Yafei Dong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Omachi,+S">Shinichiro Omachi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by Pattern Recognition </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Recent efforts have explored leveraging visible light images to enrich texture details in infrared (IR) super-resolution. However, this direct adaptation approach often becomes a double-edged sword, as it improves texture at the cost of introducing noise and blurring artifacts. To address these challenges, we propose the Target-oriented Domain Adaptation SRGAN (DASRGAN), an innovative framework specifically engineered for robust IR super-resolution model adaptation. DASRGAN operates on the synergy of two key components: 1) Texture-Oriented Adaptation (TOA) to refine texture details meticulously, and 2) Noise-Oriented Adaptation (NOA), dedicated to minimizing noise transfer. Specifically, TOA uniquely integrates a specialized discriminator, incorporating a prior extraction branch, and employs a Sobel-guided adversarial loss to align texture distributions effectively. Concurrently, NOA utilizes a noise adversarial loss to distinctly separate the generative and Gaussian noise pattern distributions during adversarial training. Our extensive experiments confirm DASRGAN's superiority. Comparative analyses against leading methods across multiple benchmarks and upsampling factors reveal that DASRGAN sets new state-of-the-art performance standards. Code are available at \url{<a href="https://github.com/yongsongH/DASRGAN" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
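As a small illustration of Sobel guidance for texture alignment, the snippet below compares Sobel gradient magnitudes of two images with an L1 loss; DASRGAN's actual Sobel-guided loss is adversarial and uses a specialized discriminator with a prior-extraction branch, which is not reproduced here.
<pre><code>
import torch
import torch.nn.functional as F

def sobel_magnitude(img: torch.Tensor) -> torch.Tensor:
    """Per-pixel Sobel gradient magnitude for a (B, 1, H, W) image."""
    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]).view(1, 1, 3, 3)
    ky = kx.transpose(-2, -1)
    gx = F.conv2d(img, kx, padding=1)
    gy = F.conv2d(img, ky, padding=1)
    return torch.sqrt(gx ** 2 + gy ** 2 + 1e-6)

def sobel_texture_loss(sr: torch.Tensor, ref: torch.Tensor) -> torch.Tensor:
    """L1 distance between edge maps; a non-adversarial stand-in for
    Sobel-guided texture alignment."""
    return F.l1_loss(sobel_magnitude(sr), sobel_magnitude(ref))

sr, visible_ref = torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64)
print(sobel_texture_loss(sr, visible_ref))
</code></pre>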
</p> </div> </dd> <dt> <a name='item133'>[133]</a> <a href ="/abs/2311.11782" title="Abstract" id="2311.11782"> arXiv:2311.11782 </a> (replaced) [<a href="/pdf/2311.11782" title="Download PDF" id="pdf-2311.11782" aria-labelledby="pdf-2311.11782">pdf</a>, <a href="https://arxiv.org/html/2311.11782v2" title="View HTML" id="html-2311.11782" aria-labelledby="html-2311.11782" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.11782" title="Other formats" id="oth-2311.11782" aria-labelledby="oth-2311.11782">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robust Tumor Segmentation with Hyperspectral Imaging and Graph Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Mostafa,+M+L">Mayar Lotfy Mostafa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Alperovich,+A">Anna Alperovich</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Giannantonio,+T">Tommaso Giannantonio</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Barz,+B">Bjorn Barz</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+X">Xiaohan Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Holm,+F">Felix Holm</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Navab,+N">Nassir Navab</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Boehm,+F">Felix Boehm</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Schwamborn,+C">Carolin Schwamborn</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Hoffmann,+T+K">Thomas K. Hoffmann</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Schuler,+P+J">Patrick J. Schuler</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 5 figures, The German Conference on Pattern Recognition (GCPR) 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Segmenting the boundary between tumor and healthy tissue during surgical cancer resection poses a significant challenge. In recent years, Hyperspectral Imaging (HSI) combined with Machine Learning (ML) has emerged as a promising solution. However, due to the extensive information contained within the spectral domain, most ML approaches primarily classify individual HSI (super-)pixels, or tiles, without taking into account their spatial context. In this paper, we propose an improved methodology that leverages the spatial context of tiles for more robust and smoother segmentation. To address the irregular shapes of tiles, we utilize Graph Neural Networks (GNNs) to propagate context information across neighboring regions. The features for each tile within the graph are extracted using a Convolutional Neural Network (CNN), which is trained simultaneously with the subsequent GNN. Moreover, we incorporate local image quality metrics into the loss function to enhance the training procedure's robustness against low-quality regions in the training images. We demonstrate the superiority of our proposed method using a clinical ex vivo dataset consisting of 51 HSI images from 30 patients. 
Despite the limited dataset, the GNN-based model significantly outperforms context-agnostic approaches, accurately distinguishing between healthy and tumor tissues, even in images from previously unseen patients. Furthermore, we show that our carefully designed loss function, accounting for local image quality, results in additional improvements. Our findings demonstrate that context-aware GNN algorithms can robustly find tumor demarcations on HSI images, ultimately contributing to better surgery success and patient outcome. </p> </div> </dd> <dt> <a name='item134'>[134]</a> <a href ="/abs/2403.14715" title="Abstract" id="2403.14715"> arXiv:2403.14715 </a> (replaced) [<a href="/pdf/2403.14715" title="Download PDF" id="pdf-2403.14715" aria-labelledby="pdf-2403.14715">pdf</a>, <a href="https://arxiv.org/html/2403.14715v3" title="View HTML" id="html-2403.14715" aria-labelledby="html-2403.14715" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.14715" title="Other formats" id="oth-2403.14715" aria-labelledby="oth-2403.14715">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Understanding Why Label Smoothing Degrades Selective Classification and How to Fix It </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+G">Guoxuan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laurent,+O">Olivier Laurent</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Franchi,+G">Gianni Franchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouganis,+C">Christos-Savvas Bouganis</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published as a conference paper at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Label smoothing (LS) is a popular regularisation method for training neural networks as it is effective in improving test accuracy and is simple to implement. ``Hard'' one-hot labels are ``smoothed'' by uniformly distributing probability mass to other classes, reducing overfitting. Prior work has suggested that in some cases LS can degrade selective classification (SC) -- where the aim is to reject misclassifications using a model's uncertainty. In this work, we first demonstrate empirically across an extended range of large-scale tasks and architectures that LS consistently degrades SC. We then address a gap in existing knowledge, providing an explanation for this behaviour by analysing logit-level gradients: LS degrades the uncertainty rank ordering of correct vs incorrect predictions by suppressing the max logit more when a prediction is likely to be correct, and less when it is likely to be wrong. This elucidates previously reported experimental results where strong classifiers underperform in SC. We then demonstrate the empirical effectiveness of post-hoc logit normalisation for recovering lost SC performance caused by LS. Furthermore, linking back to our gradient analysis, we again provide an explanation for why such normalisation is effective. 
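A minimal sketch of post-hoc logit normalisation for selective classification: confidence is taken as the max softmax probability of temperature-scaled, L2-normalised logits, and low-confidence predictions are rejected. The choice of norm, temperature value, and threshold below are illustrative assumptions; consult the paper for the recommended recipe.
<pre><code>
import torch
import torch.nn.functional as F

def selective_predict(logits: torch.Tensor, threshold: float, tau: float = 0.01):
    """Selective classification with post-hoc logit normalisation: confidence is the
    max softmax probability of L2-normalised, temperature-scaled logits; samples whose
    confidence falls below the threshold are rejected. Illustrative sketch."""
    normed = logits / (tau * logits.norm(dim=-1, keepdim=True))
    conf, pred = F.softmax(normed, dim=-1).max(dim=-1)
    accept = conf >= threshold                      # True where the model answers
    return pred, conf, accept

logits = torch.randn(5, 1000) * 3.0
pred, conf, accept = selective_predict(logits, threshold=0.5)
print(pred[accept], conf)
</code></pre>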
</p> </div> </dd> <dt> <a name='item135'>[135]</a> <a href ="/abs/2405.16406" title="Abstract" id="2405.16406"> arXiv:2405.16406 </a> (replaced) [<a href="/pdf/2405.16406" title="Download PDF" id="pdf-2405.16406" aria-labelledby="pdf-2405.16406">pdf</a>, <a href="/format/2405.16406" title="Other formats" id="oth-2405.16406" aria-labelledby="oth-2405.16406">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SpinQuant: LLM quantization with learned rotations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zechun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+C">Changsheng Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fedorov,+I">Igor Fedorov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soran,+B">Bilge Soran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choudhary,+D">Dhruv Choudhary</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krishnamoorthi,+R">Raghuraman Krishnamoorthi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chandra,+V">Vikas Chandra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yuandong Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Blankevoort,+T">Tijmen Blankevoort</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Post-training quantization (PTQ) techniques applied to weights, activations, and the KV cache greatly reduce memory usage, latency, and power consumption of Large Language Models (LLMs), but may lead to large quantization errors when outliers are present. Rotating activation or weight matrices helps remove outliers and benefits quantization. In this work, we identify a collection of applicable rotation parameterizations that lead to identical outputs in full-precision Transformer architectures while enhancing quantization accuracy. In addition, we find that some random rotations lead to much better quantization than others, with an up to 13 points difference in downstream zero-shot reasoning performance. As a result, we propose SpinQuant, a novel approach that incorporates learned rotation matrices for optimal quantized network accuracy. With 4-bit quantization of weight, activation, and KV-cache, SpinQuant narrows the accuracy gap on zero-shot reasoning tasks with full precision to merely 2.9 points on the LLaMA-2 7B model, surpassing LLM-QAT by 19.1 points and SmoothQuant by 25.0 points. Furthermore, SpinQuant also outperforms concurrent work QuaRot, which applies random rotations to remove outliers. In particular, for LLaMA-3 8B models that are hard to quantize, SpinQuant reduces the gap to full precision by up to 45.1% relative to QuaRot. Code is available at <a href="https://github.com/facebookresearch/SpinQuant" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
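The underlying rotation trick can be illustrated directly: for an orthogonal matrix $R$, $Wx = (WR^\top)(Rx)$ in full precision, yet rotating spreads activation outliers so that a crude quantizer loses less information. The sketch below uses a random orthogonal rotation and a toy symmetric int4 fake-quantizer; SpinQuant instead learns the rotations, and real deployments quantize per-channel or per-group.
<pre><code>
import torch

def quantize_int4(x: torch.Tensor) -> torch.Tensor:
    """Crude symmetric per-tensor fake-quantization to 4 bits (a stand-in)."""
    scale = x.abs().max() / 7.0
    return torch.clamp(torch.round(x / scale), -8, 7) * scale

torch.manual_seed(0)
d = 256
W = torch.randn(d, d)
x = torch.randn(d)
x[3] = 40.0                                            # activation with an outlier
Q, _ = torch.linalg.qr(torch.randn(d, d))              # random orthogonal rotation

y_fp = W @ x                                           # full-precision reference
y_plain = quantize_int4(W) @ quantize_int4(x)          # quantize without rotation
y_rot = quantize_int4(W @ Q.T) @ quantize_int4(Q @ x)  # rotate, then quantize

print((y_plain - y_fp).norm() / y_fp.norm())           # larger relative error
print((y_rot - y_fp).norm() / y_fp.norm())             # typically much smaller error
</code></pre>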
</p> </div> </dd> <dt> <a name='item136'>[136]</a> <a href ="/abs/2406.00341" title="Abstract" id="2406.00341"> arXiv:2406.00341 </a> (replaced) [<a href="/pdf/2406.00341" title="Download PDF" id="pdf-2406.00341" aria-labelledby="pdf-2406.00341">pdf</a>, <a href="https://arxiv.org/html/2406.00341v2" title="View HTML" id="html-2406.00341" aria-labelledby="html-2406.00341" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.00341" title="Other formats" id="oth-2406.00341" aria-labelledby="oth-2406.00341">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DSCA: A Digital Subtraction Angiography Sequence Dataset and Spatio-Temporal Model for Cerebral Artery Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+J">Jiong Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Xie,+Q">Qihang Xie</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Mou,+L">Lei Mou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+D">Dan Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+D">Da Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shan,+C">Caifeng Shan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhao,+Y">Yitian Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Su,+R">Ruisheng Su</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Guo,+M">Mengguo Guo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published by TMI </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Cerebrovascular diseases (CVDs) remain a leading cause of global disability and mortality. Digital Subtraction Angiography (DSA) sequences, recognized as the gold standard for diagnosing CVDs, can clearly visualize the dynamic flow and reveal pathological conditions within the cerebrovasculature. Therefore, precise segmentation of cerebral arteries (CAs) and classification between their main trunks and branches are crucial for physicians to accurately quantify diseases. However, achieving accurate CA segmentation in DSA sequences remains a challenging task due to small vessels with low contrast and ambiguity between vessels and residual skull structures. Moreover, the lack of publicly available datasets limits exploration in the field. In this paper, we introduce a DSA Sequence-based Cerebral Artery segmentation dataset (DSCA), a publicly accessible dataset designed specifically for pixel-level semantic segmentation of CAs. Additionally, we propose DSANet, a spatio-temporal network for CA segmentation in DSA sequences. Unlike existing DSA segmentation methods that focus only on a single frame, the proposed DSANet introduces a separate temporal encoding branch to capture dynamic vessel details across multiple frames. To enhance small vessel segmentation and improve vessel connectivity, we design a novel TemporalFormer module to capture global context and correlations among sequential frames. Furthermore, we develop a Spatio-Temporal Fusion (STF) module to effectively integrate spatial and temporal features from the encoder. 
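A toy sketch of the general idea of attending over frames at each spatial location and fusing the result with a single-frame feature map; the module names, shapes, and mean-pooling choice here are illustrative assumptions, not the released DSANet design: <pre>
import torch
import torch.nn as nn

class TemporalAttention(nn.Module):
    """Minimal stand-in for a TemporalFormer-style block: self-attention over the
    frame axis at every spatial location (sizes are assumptions)."""
    def __init__(self, channels, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(channels, heads, batch_first=True)
        self.norm = nn.LayerNorm(channels)

    def forward(self, feats):                       # feats: (B, T, C, H, W)
        b, t, c, h, w = feats.shape
        tokens = feats.permute(0, 3, 4, 1, 2).reshape(b * h * w, t, c)
        out, _ = self.attn(tokens, tokens, tokens)  # correlations among frames
        out = self.norm(out + tokens)
        return out.reshape(b, h, w, t, c).permute(0, 3, 4, 1, 2)

class SpatioTemporalFusion(nn.Module):
    """Fuse a single-frame spatial feature map with temporally pooled features."""
    def __init__(self, channels):
        super().__init__()
        self.proj = nn.Conv2d(2 * channels, channels, kernel_size=1)

    def forward(self, spatial, temporal):           # (B, C, H, W), (B, T, C, H, W)
        pooled = temporal.mean(dim=1)               # simple average over frames
        return self.proj(torch.cat([spatial, pooled], dim=1))

feats = torch.randn(1, 8, 32, 64, 64)               # 8 DSA frames of 32-channel features
temporal = TemporalAttention(32)(feats)
fused = SpatioTemporalFusion(32)(feats[:, 0], temporal)
print(fused.shape)                                  # torch.Size([1, 32, 64, 64])
</pre>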
Extensive experiments demonstrate that DSANet outperforms other state-of-the-art methods in CA segmentation, achieving a Dice of 0.9033. </p> </div> </dd> <dt> <a name='item137'>[137]</a> <a href ="/abs/2407.04903" title="Abstract" id="2407.04903"> arXiv:2407.04903 </a> (replaced) [<a href="/pdf/2407.04903" title="Download PDF" id="pdf-2407.04903" aria-labelledby="pdf-2407.04903">pdf</a>, <a href="https://arxiv.org/html/2407.04903v3" title="View HTML" id="html-2407.04903" aria-labelledby="html-2407.04903" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.04903" title="Other formats" id="oth-2407.04903" aria-labelledby="oth-2407.04903">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMSci: A Dataset for Graduate-Level Multi-Discipline Multimodal Scientific Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zekun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xianjun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+K">Kyuri Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+W">Wanrong Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsieh,+R">Ryan Hsieh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">HyeonJung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+J+H">Jin Hyuk Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+S">Sungyoung Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+B">Byungju Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+X">Xifeng Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Petzold,+L+R">Linda Ruth Petzold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wilson,+S+D">Stephen D. Wilson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+W">Woosang Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W+Y">William Yang Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code and data are available at <a href="https://github.com/Leezekun/MMSci" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Scientific figure interpretation is a crucial capability for AI-driven scientific assistants built on advanced Large Vision Language Models. However, current datasets and benchmarks primarily focus on simple charts or other relatively straightforward figures from limited science domains. To address this gap, we present a comprehensive dataset compiled from peer-reviewed Nature Communications articles covering 72 scientific fields, encompassing complex visualizations such as schematic diagrams, microscopic images, and experimental data which require graduate-level expertise to interpret. We evaluated 19 proprietary and open-source models on two benchmark tasks, figure captioning and multiple-choice, and conducted human expert annotation. Our analysis revealed significant task challenges and performance gaps among models. 
Beyond serving as a benchmark, this dataset serves as a valuable resource for large-scale training. Fine-tuning Qwen2-VL-7B with our task-specific data achieved better performance than GPT-4o and even human experts in multiple-choice evaluations. Furthermore, continuous pre-training on our interleaved article and figure data substantially enhanced the model's downstream task performance in materials science. We have released our dataset to support further research. </p> </div> </dd> <dt> <a name='item138'>[138]</a> <a href ="/abs/2408.16340" title="Abstract" id="2408.16340"> arXiv:2408.16340 </a> (replaced) [<a href="/pdf/2408.16340" title="Download PDF" id="pdf-2408.16340" aria-labelledby="pdf-2408.16340">pdf</a>, <a href="https://arxiv.org/html/2408.16340v4" title="View HTML" id="html-2408.16340" aria-labelledby="html-2408.16340" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.16340" title="Other formats" id="oth-2408.16340" aria-labelledby="oth-2408.16340">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learned Image Transmission with Hierarchical Variational Autoencoder </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+G">Guangyi Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+H">Hanlei Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cai,+Y">Yunlong Cai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Hu,+Q">Qiyu Hu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yu,+G">Guanding Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+R">Runmin Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> In this paper, we introduce an innovative hierarchical joint source-channel coding (HJSCC) framework for image transmission, utilizing a hierarchical variational autoencoder (VAE). Our approach leverages a combination of bottom-up and top-down paths at the transmitter to autoregressively generate multiple hierarchical representations of the original image. These representations are then directly mapped to channel symbols for transmission by the JSCC encoder. We extend this framework to scenarios with a feedback link, modeling transmission over a noisy channel as a probabilistic sampling process and deriving a novel generative formulation for JSCC with feedback. Compared with existing approaches, our proposed HJSCC provides enhanced adaptability by dynamically adjusting transmission bandwidth, encoding these representations into varying amounts of channel symbols. Extensive experiments on images of varying resolutions demonstrate that our proposed model outperforms existing baselines in rate-distortion performance and maintains robustness against channel noise. The source code will be made available upon acceptance. 
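For readers unfamiliar with JSCC, the skeleton below shows the basic map-to-channel-symbols-then-add-noise pattern with power-normalised symbols over an AWGN channel; it is deliberately non-hierarchical and omits the VAE and feedback components, so it only approximates the setting described above: <pre>
import torch
import torch.nn as nn

class TinyJSCC(nn.Module):
    """Minimal deep-JSCC skeleton over an AWGN channel (illustration only; the
    hierarchical VAE structure and feedback link of the paper are not modeled)."""
    def __init__(self, channel_symbols=256):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, 4, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2, padding=1), nn.ReLU(),
            nn.Flatten(), nn.Linear(64 * 8 * 8, channel_symbols))
        self.decoder = nn.Sequential(
            nn.Linear(channel_symbols, 64 * 8 * 8), nn.ReLU(),
            nn.Unflatten(1, (64, 8, 8)),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1), nn.Sigmoid())

    def forward(self, x, snr_db=10.0):
        z = self.encoder(x)
        z = z / z.pow(2).mean(dim=1, keepdim=True).sqrt()   # unit average symbol power
        noise_std = (10 ** (-snr_db / 10)) ** 0.5
        z_noisy = z + noise_std * torch.randn_like(z)        # AWGN channel
        return self.decoder(z_noisy)

x = torch.rand(2, 3, 32, 32)
x_hat = TinyJSCC()(x, snr_db=10.0)
loss = nn.functional.mse_loss(x_hat, x)   # bandwidth is fixed here by channel_symbols
</pre>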
</p> </div> </dd> <dt> <a name='item139'>[139]</a> <a href ="/abs/2409.03685" title="Abstract" id="2409.03685"> arXiv:2409.03685 </a> (replaced) [<a href="/pdf/2409.03685" title="Download PDF" id="pdf-2409.03685" aria-labelledby="pdf-2409.03685">pdf</a>, <a href="https://arxiv.org/html/2409.03685v2" title="View HTML" id="html-2409.03685" aria-labelledby="html-2409.03685" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03685" title="Other formats" id="oth-2409.03685" aria-labelledby="oth-2409.03685">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> View-Invariant Policy Learning via Zero-Shot Novel View Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+S">Stephen Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wulfe,+B">Blake Wulfe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sargent,+K">Kyle Sargent</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Katherine Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zakharov,+S">Sergey Zakharov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guizilini,+V">Vitor Guizilini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiajun Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to CoRL 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Large-scale visuomotor policy learning is a promising approach toward developing generalizable manipulation systems. Yet, policies that can be deployed on diverse embodiments, environments, and observational modalities remain elusive. In this work, we investigate how knowledge from large-scale visual data of the world may be used to address one axis of variation for generalizable manipulation: observational viewpoint. Specifically, we study single-image novel view synthesis models, which learn 3D-aware scene-level priors by rendering images of the same scene from alternate camera viewpoints given a single input image. For practical application to diverse robotic data, these models must operate zero-shot, performing view synthesis on unseen tasks and environments. We empirically analyze view synthesis models within a simple data-augmentation scheme that we call View Synthesis Augmentation (VISTA) to understand their capabilities for learning viewpoint-invariant policies from single-viewpoint demonstration data. Upon evaluating the robustness of policies trained with our method to out-of-distribution camera viewpoints, we find that they outperform baselines in both simulated and real-world manipulation tasks. Videos and additional visualizations are available at <a href="https://s-tian.github.io/projects/vista" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
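A hypothetical sketch of the augmentation pattern described above: with some probability an observation is replaced by a zero-shot novel-view render while the action label is kept, which pushes the policy toward viewpoint invariance. The `nvs_model.render(...)` interface, the probability, and the angle range are assumptions, not the authors' API: <pre>
import random
import torch
import torch.nn.functional as F

def augment_with_novel_view(image, nvs_model, p=0.5, max_yaw=30.0):
    """With probability p, replace an observation with a zero-shot novel-view render.
    `nvs_model.render(image, yaw_degrees=...)` is a hypothetical interface standing in
    for a single-image NVS model."""
    if random.random() > p:
        return image
    return nvs_model.render(image, yaw_degrees=random.uniform(-max_yaw, max_yaw))

def bc_train_step(policy, optimizer, batch, nvs_model):
    # Action labels stay untouched: only the viewpoint of the observation changes.
    images = torch.stack([augment_with_novel_view(im, nvs_model) for im in batch["image"]])
    loss = F.mse_loss(policy(images), batch["action"])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
</pre>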
</p> </div> </dd> <dt> <a name='item140'>[140]</a> <a href ="/abs/2410.00255" title="Abstract" id="2410.00255"> arXiv:2410.00255 </a> (replaced) [<a href="/pdf/2410.00255" title="Download PDF" id="pdf-2410.00255" aria-labelledby="pdf-2410.00255">pdf</a>, <a href="https://arxiv.org/html/2410.00255v2" title="View HTML" id="html-2410.00255" aria-labelledby="html-2410.00255" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.00255" title="Other formats" id="oth-2410.00255" aria-labelledby="oth-2410.00255">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robin3D: Improving 3D Large Language Model via Robust Instruction Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+W">Weitai Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haifeng Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+Y">Yuzhang Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+M">Mubarak Shah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yan Yan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Recent advancements in 3D Large Language Models (3DLLMs) have highlighted their potential in building general-purpose agents in the 3D real world, yet challenges remain due to the lack of high-quality robust instruction-following data, leading to limited discriminative power and generalization of 3DLLMs. In this paper, we introduce Robin3D, a powerful 3DLLM trained on large-scale instruction-following data generated by our novel data engine, the Robust Instruction Generation (RIG) engine. RIG generates two key types of instruction data: 1) Adversarial Instruction-following data, which mixes negative and positive samples to enhance the model's discriminative understanding; 2) Diverse Instruction-following data, which contains various instruction styles to enhance the model's generalization. As a result, we construct 1 million instruction-following samples, consisting of 344K Adversarial samples, 508K Diverse samples, and 165K benchmark training set samples. To better handle these complex instructions, Robin3D first incorporates a Relation-Augmented Projector to enhance spatial understanding, and then strengthens the object referring and grounding ability through ID-Feature Bonding. Robin3D consistently outperforms previous methods across five widely-used 3D multimodal learning benchmarks, without the need for task-specific fine-tuning. Notably, we achieve a 7.8% improvement in the grounding task (Multi3DRefer) and a 6.9% improvement in the captioning task (Scan2Cap). 
</p> </div> </dd> <dt> <a name='item141'>[141]</a> <a href ="/abs/2411.05195" title="Abstract" id="2411.05195"> arXiv:2411.05195 </a> (replaced) [<a href="/pdf/2411.05195" title="Download PDF" id="pdf-2411.05195" aria-labelledby="pdf-2411.05195">pdf</a>, <a href="https://arxiv.org/html/2411.05195v2" title="View HTML" id="html-2411.05195" aria-labelledby="html-2411.05195" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.05195" title="Other formats" id="oth-2411.05195" aria-labelledby="oth-2411.05195">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring How Generative MLLMs Perceive More Than CLIP with the Same Vision Encoder </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Siting Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koh,+P+W">Pang Wei Koh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+S+S">Simon Shaolei Du</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Recent research has shown that CLIP models struggle with visual reasoning tasks that require grounding compositionality, understanding spatial relationships, or capturing fine-grained details. One natural hypothesis is that the CLIP vision encoder does not embed essential information for these tasks. However, we find that this is not always the case: The encoder gathers query-relevant visual information, while CLIP fails to extract it. In particular, we show that another branch of Vision-Language Models (VLMs), Generative Multimodal Large Language Models (MLLMs), achieve significantly higher accuracy than CLIP in many of these tasks using the same vision encoder and weights, indicating that these Generative MLLMs perceive more -- as they extract and utilize visual information more effectively. We conduct a series of controlled experiments and reveal that their success is attributed to multiple key design choices, including patch tokens, position embeddings, and prompt-based weighting. On the other hand, enhancing the training data alone or applying a stronger text encoder does not suffice to solve the task, and additional text tokens offer little benefit. Interestingly, we find that fine-grained visual reasoning is not exclusive to generative models trained by an autoregressive loss: When converted into CLIP-like encoders by contrastive finetuning, these MLLMs still outperform CLIP under the same cosine similarity-based evaluation protocol. Our study highlights the importance of VLM architectural choices and suggests directions for improving the performance of CLIP-like contrastive VLMs. 
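The cosine-similarity evaluation protocol mentioned above is, in generic form, the following (the encoder interfaces are placeholders for any CLIP-like image/text embedding pair, and the multiple-choice framing is an assumed example task): <pre>
import torch
import torch.nn.functional as F

@torch.no_grad()
def cosine_choice_accuracy(image_encoder, text_encoder, images, candidate_texts, answers):
    """Generic cosine-similarity protocol for multiple-choice visual tasks: the
    prediction is the candidate text whose embedding is closest to the image embedding.
    images -> (N, ...), candidate_texts -> embeddings of shape (N, K, D), answers -> (N,)."""
    img = F.normalize(image_encoder(images), dim=-1)            # (N, D)
    txt = F.normalize(text_encoder(candidate_texts), dim=-1)    # (N, K, D)
    sims = torch.einsum("nd,nkd->nk", img, txt)                 # cosine similarities
    return (sims.argmax(dim=-1) == answers).float().mean().item()
</pre>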
</p> </div> </dd> <dt> <a name='item142'>[142]</a> <a href ="/abs/2412.07175" title="Abstract" id="2412.07175"> arXiv:2412.07175 </a> (replaced) [<a href="/pdf/2412.07175" title="Download PDF" id="pdf-2412.07175" aria-labelledby="pdf-2412.07175">pdf</a>, <a href="https://arxiv.org/html/2412.07175v2" title="View HTML" id="html-2412.07175" aria-labelledby="html-2412.07175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.07175" title="Other formats" id="oth-2412.07175" aria-labelledby="oth-2412.07175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robust Feature Engineering Techniques for Designing Efficient Motor Imagery-Based BCI-Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Gardezi,+S+S">Syed Saim Gardezi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jawed,+S">Soyiba Jawed</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Khan,+M">Mahnoor Khan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bukhari,+M">Muneeba Bukhari</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Khan,+R+A">Rizwan Ahmed Khan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Signal Processing (eess.SP)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> A multitude of individuals across the globe grapple with motor disabilities. Neural prosthetics utilizing Brain-Computer Interface (BCI) technology exhibit promise for improving motor rehabilitation outcomes. The intricate nature of EEG data poses a significant hurdle for current BCI systems. Recently, a qualitative repository of EEG signals tied to both upper and lower limb execution of motor and motor imagery tasks has been unveiled. Despite this, the performance of the Machine Learning (ML) models that were trained on this dataset was alarmingly deficient, and the evaluation framework seemed insufficient. To enhance outcomes, robust feature engineering (signal processing) methodologies are implemented. A collection of time domain, frequency domain, and wavelet-derived features was obtained from 16-channel EEG signals, and the Maximum Relevance Minimum Redundancy (MRMR) approach was employed to identify the four most significant features. For classification, K Nearest Neighbors (KNN), Support Vector Machine (SVM), Decision Tree (DT), and Naïve Bayes (NB) models were implemented with these selected features, evaluating their effectiveness through metrics such as testing accuracy, precision, recall, and F1 Score. By leveraging SVM with a Gaussian Kernel, a remarkable maximum testing accuracy of 92.50% for motor activities and 95.48% for imagery activities is achieved. These results are notably more dependable than those of the previous study, where the peak accuracy was recorded at 74.36%. This research work provides an in-depth analysis of the MI Limb EEG dataset and will help in designing and developing simple, cost-effective and reliable BCI systems for neuro-rehabilitation. 
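A compressed, sklearn-style sketch of the pipeline shape described above, with random arrays standing in for the extracted EEG features and a plain mutual-information ranking standing in for MRMR (it keeps the relevance term but drops the redundancy term), so numbers will not match the paper: <pre>
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# X: precomputed time/frequency/wavelet features per trial, y: motor-imagery labels.
# Random data stands in for the real dataset here.
rng = np.random.default_rng(0)
X = rng.normal(size=(400, 48))
y = rng.integers(0, 2, size=400)

mi = mutual_info_classif(X, y, random_state=0)
top4 = np.argsort(mi)[-4:]                        # four most informative features

X_tr, X_te, y_tr, y_te = train_test_split(X[:, top4], y, test_size=0.2, random_state=0)
clf = make_pipeline(StandardScaler(), SVC(kernel="rbf"))    # Gaussian-kernel SVM
clf.fit(X_tr, y_tr)
print(classification_report(y_te, clf.predict(X_te)))       # accuracy/precision/recall/F1
</pre>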
</p> </div> </dd> <dt> <a name='item143'>[143]</a> <a href ="/abs/2501.18362" title="Abstract" id="2501.18362"> arXiv:2501.18362 </a> (replaced) [<a href="/pdf/2501.18362" title="Download PDF" id="pdf-2501.18362" aria-labelledby="pdf-2501.18362">pdf</a>, <a href="/format/2501.18362" title="Other formats" id="oth-2501.18362" aria-labelledby="oth-2501.18362">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zuo,+Y">Yuxin Zuo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+S">Shang Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yifei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhangren Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xuekai Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+E">Ermo Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kaiyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+N">Ning Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+B">Bowen Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> We introduce MedXpertQA, a highly challenging and comprehensive benchmark to evaluate expert-level medical knowledge and advanced reasoning. MedXpertQA includes 4,460 questions spanning 17 specialties and 11 body systems. It includes two subsets, Text for text evaluation and MM for multimodal evaluation. Notably, MM introduces expert-level exam questions with diverse images and rich clinical information, including patient records and examination results, setting it apart from traditional medical multimodal benchmarks with simple QA pairs generated from image captions. MedXpertQA applies rigorous filtering and augmentation to address the insufficient difficulty of existing benchmarks like MedQA, and incorporates specialty board questions to improve clinical relevance and comprehensiveness. We perform data synthesis to mitigate data leakage risk and conduct multiple rounds of expert reviews to ensure accuracy and reliability. We evaluate 16 leading models on MedXpertQA. Moreover, medicine is deeply connected to real-world decision-making, providing a rich and representative setting for assessing reasoning abilities beyond mathematics and code. To this end, we develop a reasoning-oriented subset to facilitate the assessment of o1-like models. 
</p> </div> </dd> <dt> <a name='item144'>[144]</a> <a href ="/abs/2502.06997" title="Abstract" id="2502.06997"> arXiv:2502.06997 </a> (replaced) [<a href="/pdf/2502.06997" title="Download PDF" id="pdf-2502.06997" aria-labelledby="pdf-2502.06997">pdf</a>, <a href="https://arxiv.org/html/2502.06997v2" title="View HTML" id="html-2502.06997" aria-labelledby="html-2502.06997" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06997" title="Other formats" id="oth-2502.06997" aria-labelledby="oth-2502.06997">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Conditional diffusion model with spatial attention and latent embedding for medical image segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Hejrati,+B">Behzad Hejrati</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Banerjee,+S">Soumyanil Banerjee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Glide-Hurst,+C">Carri Glide-Hurst</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Dong,+M">Ming Dong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 5 figures, 3 tables, Accepted in MICCAI 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Diffusion models have been used extensively for high quality image and video generation tasks. In this paper, we propose a novel conditional diffusion model with spatial attention and latent embedding (cDAL) for medical image segmentation. In cDAL, a convolutional neural network (CNN) based discriminator is used at every time-step of the diffusion process to distinguish between the generated labels and the real ones. A spatial attention map is computed based on the features learned by the discriminator to help cDAL generate more accurate segmentation of discriminative regions in an input image. Additionally, we incorporated a random latent embedding into each layer of our model to significantly reduce the number of training and sampling time-steps, thereby making it much faster than other diffusion models for image segmentation. We applied cDAL on 3 publicly available medical image segmentation datasets (MoNuSeg, Chest X-ray and Hippocampus) and observed significant qualitative and quantitative improvements with higher Dice scores and mIoU over the state-of-the-art algorithms. The source code is publicly available at <a href="https://github.com/Hejrati/cDAL/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
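One plausible reading of the discriminator-driven spatial attention, sketched below; the discriminator architecture, the (1 + sigmoid) gating, and all shapes are assumptions rather than the released cDAL code: <pre>
import torch
import torch.nn as nn

class PatchDiscriminator(nn.Module):
    """Tiny CNN that scores image/label pairs per spatial location (illustrative only)."""
    def __init__(self, in_ch=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch, 32, 3, padding=1), nn.LeakyReLU(0.2),
            nn.Conv2d(32, 1, 3, padding=1))

    def forward(self, image, label):
        return self.net(torch.cat([image, label], dim=1))   # (B, 1, H, W) realism logits

def apply_spatial_attention(denoiser_features, disc_logits):
    # Regions the discriminator scores highly get emphasised, nudging the denoiser
    # toward discriminative regions (a simplified reading of the idea).
    attn = torch.sigmoid(disc_logits)
    return denoiser_features * (1.0 + attn)

disc = PatchDiscriminator()
image = torch.randn(1, 1, 64, 64)
pred_label = torch.randn(1, 1, 64, 64)           # label generated at some diffusion step
feats = torch.randn(1, 32, 64, 64)               # intermediate features of the denoiser
feats = apply_spatial_attention(feats, disc(image, pred_label))
print(feats.shape)                               # torch.Size([1, 32, 64, 64])
</pre>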
</p> </div> </dd> </dl> <div class='paging'>Total of 144 entries </div> </div> </div> </div> </main> </div> </body> </html>