Computer Vision and Pattern Recognition

aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Computer Vision and Pattern Recognition</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item83">Cross-lists</a></li> <li><a href="#item111">Replacements</a></li> </ul> <p>See <a id="recent-cs.CV" aria-labelledby="recent-cs.CV" href="/list/cs.CV/recent">recent</a> articles</p> <h3>Showing new listings for Friday, 22 November 2024</h3> <div class='paging'>Total of 180 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CV/new?skip=0&amp;show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 82 of 82 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2411.13572" title="Abstract" id="2411.13572"> arXiv:2411.13572 </a> [<a href="/pdf/2411.13572" title="Download PDF" id="pdf-2411.13572" aria-labelledby="pdf-2411.13572">pdf</a>, <a href="https://arxiv.org/html/2411.13572v1" title="View HTML" id="html-2411.13572" aria-labelledby="html-2411.13572" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13572" title="Other formats" id="oth-2411.13572" aria-labelledby="oth-2411.13572">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Public Health Advocacy Dataset: A Dataset of Tobacco Usage Videos from Social Media </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chappa,+N+V+R">Naga VS Raviteja Chappa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=McCormick,+C">Charlotte McCormick</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gongora,+S+R">Susana Rodriguez Gongora</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dobbs,+P+D">Page Daniel Dobbs</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luu,+K">Khoa Luu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review at International Journal of Computer Vision (IJCV); 29 figures, 5 figures; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The Public Health Advocacy Dataset (PHAD) is a 
comprehensive collection of 5,730 videos related to tobacco products sourced from social media platforms such as TikTok and YouTube. The dataset encompasses 4.3 million frames and includes detailed metadata such as user engagement metrics, video descriptions, and search keywords. It is the first dataset with these features, providing a valuable resource for analyzing tobacco-related content and its impact. Our research employs a two-stage classification approach incorporating a Vision-Language (VL) Encoder, which demonstrates superior performance in accurately categorizing various types of tobacco products and usage scenarios. The analysis reveals significant user engagement trends, particularly with vaping and e-cigarette content, highlighting areas for targeted public health interventions. PHAD addresses the need for multi-modal data in public health research, offering insights that can inform regulatory policies and public health strategies. This dataset is a crucial step towards understanding and mitigating the impact of tobacco usage, ensuring that public health efforts are more inclusive and effective.

[2] arXiv:2411.13578 [pdf, html, other]
Title: COOD: Concept-based Zero-shot OOD Detection
Authors: Zhendong Liu, Yi Nian, Henry Peng Zou, Li Li, Xiyang Hu, Yue Zhao
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)

How can models effectively detect out-of-distribution (OOD) samples in complex, multi-label settings without extensive retraining? Existing OOD detection methods struggle to capture the intricate semantic relationships and label co-occurrences inherent in multi-label settings, often requiring large amounts of training data and failing to generalize to unseen label combinations. While large language models have revolutionized zero-shot OOD detection, they primarily focus on single-label scenarios, leaving a critical gap in handling real-world tasks where samples can be associated with multiple interdependent labels. To address these challenges, we introduce COOD, a novel zero-shot multi-label OOD detection framework. COOD leverages pre-trained vision-language models, enhancing them with a concept-based label expansion strategy and a new scoring function.
By enriching the semantic space with both positive and negative concepts for each label, our approach models complex label dependencies, precisely differentiating OOD samples without the need for additional training. Extensive experiments demonstrate that our method significantly outperforms existing approaches, achieving approximately 95% average AUROC on both the VOC and COCO datasets, while maintaining robust performance across varying numbers of labels and different types of OOD samples.

[3] arXiv:2411.13582 [pdf, html, other]
Title: Deep Feature Response Discriminative Calibration
Authors: Wenxiang Xu, Tian Qiu, Linyun Zhou, Zunlei Feng, Mingli Song, Huiqiong Wang
Journal-ref: Neurocomputing 2025
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Deep neural networks (DNNs) have numerous applications across various domains. Several optimization techniques, such as ResNet and SENet, have been proposed to improve model accuracy. These techniques improve model performance by adjusting or calibrating feature responses according to a uniform standard. However, they lack discriminative calibration for different features, which limits the model output. We therefore propose a method that discriminatively calibrates feature responses. Preliminary experimental results indicate that the neural feature response follows a Gaussian distribution. Consequently, we compute confidence values using the Gaussian probability density function and then integrate these values with the original response values, with the goal of improving the feature discriminability of the neural response. Based on the calibration values, we propose a plug-in calibration module incorporated into a modified ResNet architecture, termed Response Calibration Networks (ResCNet). Extensive experiments on datasets including CIFAR-10, CIFAR-100, SVHN, and ImageNet demonstrate the effectiveness of the proposed approach. The code is publicly available at https://github.com/tcmyxc/ResCNet.
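
The calibration step described in this abstract (fit a Gaussian to the feature responses, use the probability density as a confidence value, and fold that confidence back into the response) can be pictured with a small sketch. The per-channel statistics, the normalisation of the density, and the simple reweighting used to combine confidence with the response are illustrative assumptions, not the exact ResCNet formulation:

```python
import numpy as np

def gaussian_confidence_calibrate(responses: np.ndarray) -> np.ndarray:
    """Toy feature-response calibration.

    responses: array of shape (N, C), N samples by C feature channels.
    For each channel we fit a Gaussian (mean, std), score every response
    with the Gaussian PDF as a "confidence", and fold that confidence
    back into the original response by simple reweighting.
    """
    mu = responses.mean(axis=0, keepdims=True)            # (1, C) channel means
    sigma = responses.std(axis=0, keepdims=True) + 1e-6   # (1, C) channel stds

    # Gaussian probability density of each response under its channel's fit.
    pdf = np.exp(-0.5 * ((responses - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

    # Normalise the density to [0, 1] per channel so it acts as a confidence weight.
    conf = pdf / pdf.max(axis=0, keepdims=True)

    # Integrate confidence with the original response (illustrative choice).
    return responses * (1.0 + conf)

if __name__ == "__main__":
    feats = np.random.randn(128, 16).astype(np.float32)
    print(gaussian_confidence_calibrate(feats).shape)  # (128, 16)
```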

[4] arXiv:2411.13588 [pdf, html, other]
Title: Unveiling Redundancy in Diffusion Transformers (DiTs): A Systematic Study
Authors: Xibo Sun, Jiarui Fang, Aoyu Li, Jinzhe Pan
Comments: 9 pages including references
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

The increased model capacity of Diffusion Transformers (DiTs) and the demand for generating higher-resolution images and videos have led to a significant rise in inference latency, adversely impacting real-time performance. While prior research has highlighted the high similarity of activation values between adjacent diffusion steps (referred to as redundancy) and proposed various caching mechanisms to mitigate the computational overhead, the exploration of redundancy in the existing literature remains limited, with findings that often do not generalize across different DiT models. This study addresses that gap with a comprehensive investigation of redundancy across a broad spectrum of mainstream DiT models. Our experimental analysis reveals substantial variation in the distribution of redundancy across diffusion steps among different DiT models. Interestingly, within a single model, the redundancy distribution remains stable regardless of variations in input prompts, step counts, or scheduling strategies. Given the lack of a consistent pattern across diverse models, caching strategies designed for a specific group of models may not transfer easily to others. To overcome this challenge, we introduce a tool for analyzing the redundancy of individual models, enabling subsequent research to develop tailored caching strategies for specific model architectures. The project is publicly available at https://github.com/xdit-project/DiTCacheAnalysis.
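
As a rough illustration of the redundancy notion used above, i.e. the similarity of activation values between adjacent diffusion steps, the sketch below scores a trace of per-step activations with step-to-step cosine similarity. The trace format and the choice of cosine similarity are assumptions for illustration, not the paper's analysis tool:

```python
import numpy as np

def step_redundancy(activations: np.ndarray) -> np.ndarray:
    """Cosine similarity between activations of adjacent diffusion steps.

    activations: array of shape (T, D), one flattened activation vector per
    diffusion step. Returns an array of length T-1 where entry t is the
    similarity between step t and step t+1 (higher = more redundant).
    """
    a = activations[:-1]
    b = activations[1:]
    num = (a * b).sum(axis=1)
    den = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1) + 1e-8
    return num / den

if __name__ == "__main__":
    # Fake trace: 50 steps of 1024-dim activations that drift slowly.
    trace = np.cumsum(0.05 * np.random.randn(50, 1024), axis=0) + np.random.randn(1, 1024)
    sims = step_redundancy(trace)
    print(sims.shape, float(sims.mean()))
```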

[5] arXiv:2411.13590 [pdf, html, other]
Title: Deep learning waterways for rural infrastructure development
Authors: Matthew Pierson, Zia Mehrabi
Comments: 18 pages, 6 figures
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)

Surprisingly, a number of Earth's waterways remain unmapped, with a significant number in low- and middle-income countries. Here we build a computer vision model (WaterNet) to learn the location of waterways in the United States, based on high-resolution satellite imagery and digital elevation models, and then deploy it in novel environments on the African continent. Our outputs provide detail of waterway structures hitherto unmapped. When assessed against community requests for rural bridge building related to access to schools, health care facilities, and agricultural markets, we find that these newly generated waterways capture on average 93% (country range: 88-96%) of the requests, whereas OpenStreetMap and the state-of-the-art data from TDX-Hydro capture only 36% (5-72%) and 62% (37-85%), respectively. Because these new machine-learning-enabled maps are built on public and operational data acquisition, this approach offers promise for capturing humanitarian needs and planning for social development in places where cartographic efforts have so far failed to deliver. The improved performance in identifying community needs missed by existing data suggests significant value for rural infrastructure development and better targeting of development interventions.
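
The headline comparison above (93% vs. 36% and 62%) is a coverage rate: the fraction of community bridge-building requests that fall near a mapped waterway. A minimal version of such a metric is sketched below; the buffer distance and the point-sampled representation of waterways are illustrative assumptions, not the paper's evaluation protocol:

```python
import numpy as np

def capture_rate(requests_xy: np.ndarray, waterway_xy: np.ndarray, buffer_m: float = 250.0) -> float:
    """Fraction of request sites lying within `buffer_m` of any mapped waterway point.

    requests_xy: (N, 2) projected coordinates (metres) of bridge-need requests.
    waterway_xy: (M, 2) projected coordinates (metres) sampled along mapped waterways.
    """
    captured = 0
    for rx, ry in requests_xy:
        d2 = (waterway_xy[:, 0] - rx) ** 2 + (waterway_xy[:, 1] - ry) ** 2
        if d2.min() <= buffer_m ** 2:
            captured += 1
    return captured / len(requests_xy)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # A synthetic meandering river sampled as points, plus random request sites.
    river = np.stack([np.linspace(0, 10_000, 500), 200 * np.sin(np.linspace(0, 20, 500))], axis=1)
    requests = rng.uniform([0, -2_000], [10_000, 2_000], size=(100, 2))
    print(f"capture rate: {capture_rate(requests, river):.2%}")
```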

[6] arXiv:2411.13591 [pdf, html, other]
Title: Improved GUI Grounding via Iterative Narrowing
Authors: Anthony Nguyen
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Computation and Language (cs.CL)

GUI grounding, the task of identifying a precise location on an interface image from a natural language query, plays a crucial role in enhancing the capabilities of Vision-Language Model (VLM) agents. While general VLMs, such as GPT-4V, demonstrate strong performance across various tasks, their proficiency in GUI grounding remains suboptimal. Recent studies have focused on fine-tuning these models specifically for one-shot GUI grounding, yielding significant improvements over baseline performance. We introduce a visual prompting framework called Iterative Narrowing (IN) to further enhance the performance of both general and fine-tuned models in GUI grounding. For evaluation, we tested our method on a comprehensive benchmark comprising different UI platforms.

[7] arXiv:2411.13595 [pdf, other]
Title: Towards Accessible Learning: Deep Learning-Based Potential Dysgraphia Detection and OCR for Potentially Dysgraphic Handwriting
Authors: Vydeki D, Divyansh Bhandari, Pranav Pratap Patil, Aarush Anand Kulkarni
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)

Dysgraphia is a learning disorder that affects handwriting abilities, making it challenging for children to write legibly and consistently. Early detection and monitoring are crucial for providing timely support and interventions. This study applies deep learning techniques to address the dual tasks of dysgraphia detection and optical character recognition (OCR) on handwriting samples from children with potential dysgraphic symptoms.
Using a dataset of handwritten samples from Malaysian schoolchildren, we developed a custom Convolutional Neural Network (CNN) model, alongside VGG16 and ResNet50, to classify handwriting as dysgraphic or non-dysgraphic. The custom CNN model outperformed the pre-trained models, achieving a test accuracy of 91.8% with high precision, recall, and AUC, demonstrating its robustness in identifying dysgraphic handwriting features. Additionally, an OCR pipeline was created to segment and recognize individual characters in dysgraphic handwriting, achieving a character recognition accuracy of approximately 43.5%. This research highlights the potential of deep learning in supporting dysgraphia assessment, laying a foundation for tools that could assist educators and clinicians in identifying dysgraphia and tracking handwriting progress over time. The findings contribute to advancements in assistive technologies for learning disabilities, offering hope for more accessible and accurate diagnostic tools in educational and clinical settings.

[8] arXiv:2411.13597 [pdf, html, other]
Title: Enhancing Bidirectional Sign Language Communication: Integrating YOLOv8 and NLP for Real-Time Gesture Recognition & Translation
Authors: Hasnat Jamil Bhuiyan, Mubtasim Fuad Mozumder, Md. Rabiul Islam Khan, Md. Sabbir Ahmed, Nabuat Zaman Nahim
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)

The primary concern of this research is to recognise American Sign Language (ASL) from real-time camera footage and convert it into text. In addition, we focus on creating a framework that can convert text into sign language in real time, which can help break the language barrier for people in need. For ASL recognition, we use the You Only Look Once (YOLO) model and a Convolutional Neural Network (CNN) model. The YOLO model runs in real time and automatically extracts discriminative spatio-temporal features from the raw video stream without the need for any prior knowledge, eliminating design flaws. The CNN model also runs in real time for sign language detection.
We also introduce a novel method for converting text-based input to sign language: the framework takes a sentence as input, identifies keywords in that sentence, and then plays, in real time, a video in which the corresponding sign language is performed. To the best of our knowledge, this is one of the few studies to demonstrate bidirectional sign language communication in real time for American Sign Language (ASL).

[9] arXiv:2411.13604 [pdf, html, other]
Title: RadPhi-3: Small Language Models for Radiology
Authors: Mercy Ranjit, Shaury Srivastav, Tanuja Ganu
Subjects: Computer Vision and Pattern Recognition (cs.CV); Computation and Language (cs.CL); Machine Learning (cs.LG)

LLM-based copilot assistants are useful in everyday tasks, and there is a proliferation of AI assistant use cases intended to support radiology workflows in a reliable manner. In this work, we present RadPhi-3, a Small Language Model with 3.8B parameters, instruction-tuned from Phi-3-mini-4k-instruct to assist with various tasks in radiology workflows. While impression summary generation has been the primary task explored in prior work on chest X-ray radiology reports, we also explore other useful tasks such as change summary generation comparing a current radiology report with its prior report, section extraction from radiology reports, and tagging reports with the pathologies and the tubes, lines, or devices present in them. In addition, instruction tuning of RadPhi-3 involved learning from a credible knowledge source used by radiologists, Radiopaedia.org. RadPhi-3 can both give reliable answers to radiology-related queries and perform useful tasks related to radiology reports. RadPhi-3 achieves SOTA results on the RaLEs radiology report generation benchmark.

[10] arXiv:2411.13607 [pdf, html, other]
Title: VioPose: Violin Performance 4D Pose Estimation by Hierarchical Audiovisual Inference
Authors: Seong Jong Yoo, Snehesh Shrestha, Irina Muresanu, Cornelia Fermüller
Comments: Accepted by WACV 2025 in Round 1. First two authors contributed equally
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Musicians delicately control their bodies to generate music. Sometimes, their motions are too subtle to be captured by the human eye. To analyze how they move to produce the music, we need to estimate precise 4D human pose (3D pose over time). However, current state-of-the-art (SoTA) visual pose estimation algorithms struggle to produce accurate monocular 4D poses because of occlusions, partial views, and human-object interactions. They are limited by the viewing angle, pixel density, and sampling rate of the cameras, and they fail to estimate fast and subtle movements such as the musical effect of vibrato. We leverage the direct causal relationship between the music produced and the human motions creating it to address these challenges. We propose VioPose, a novel multimodal network that hierarchically estimates dynamics: high-level features are cascaded to low-level features and integrated through Bayesian updates. Our architecture is shown to produce accurate pose sequences, facilitating precise motion analysis, and outperforms the SoTA. As part of this work, we collected the largest and most diverse calibrated violin-playing dataset, including video, sound, and 3D motion-capture poses. Project page: https://sj-yoo.info/viopose/

[11] arXiv:2411.13609 [pdf, html, other]
Title: What You See Is What Matters: A Novel Visual and Physics-Based Metric for Evaluating Video Generation Quality
Authors: Zihan Wang, Songlin Li, Lingyan Hao, Bowen Song, Xinyu Hu
Subjects: Computer Vision and Pattern Recognition (cs.CV)

As video generation models advance rapidly, assessing the quality of generated videos has become increasingly critical. Existing metrics, such as Fréchet Video Distance (FVD), Inception Score (IS), and ClipSim, measure quality primarily in latent space rather than from a human visual perspective, often overlooking key aspects such as appearance consistency and the consistency of motion with physical laws. In this paper, we propose a novel metric, VAMP (Visual Appearance and Motion Plausibility), that evaluates both the visual appearance and the physical plausibility of generated videos. VAMP is composed of two main components: an appearance score, which assesses color, shape, and texture consistency across frames, and a motion score, which evaluates the realism of object movements. We validate VAMP through two experiments: corrupted video evaluation and generated video evaluation. In the corrupted video evaluation, we introduce various types of corruption into real videos and measure the correlation between corruption severity and VAMP scores. In the generated video evaluation, we use state-of-the-art models to generate videos from carefully designed prompts and compare VAMP's performance to human evaluators' rankings. Our results demonstrate that VAMP effectively captures both visual fidelity and temporal consistency, offering a more comprehensive evaluation of video quality than traditional methods.
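
To make the two-component structure concrete, here is a minimal sketch of a VAMP-like score: an appearance term from frame-to-frame colour-histogram consistency and a motion term that penalises implausibly jerky object trajectories, combined with equal weights. The specific statistics and the weighting are assumptions for illustration; the paper's appearance and motion scores are more elaborate:

```python
import numpy as np

def appearance_score(frames: np.ndarray, bins: int = 32) -> float:
    """Mean histogram intersection between consecutive frames (higher = more consistent).

    frames: (T, H, W, 3) uint8 video clip.
    """
    sims = []
    for a, b in zip(frames[:-1], frames[1:]):
        ha, _ = np.histogram(a, bins=bins, range=(0, 255))
        hb, _ = np.histogram(b, bins=bins, range=(0, 255))
        ha = ha / ha.sum()
        hb = hb / hb.sum()
        sims.append(float(np.minimum(ha, hb).sum()))
    return float(np.mean(sims))

def motion_score(trajectories: np.ndarray) -> float:
    """Crude physical-plausibility proxy: penalise large accelerations of tracked objects.

    trajectories: (T, N, 2) object centres over T frames.
    """
    accel = np.diff(trajectories, n=2, axis=0)          # (T-2, N, 2) finite-difference acceleration
    jerkiness = np.linalg.norm(accel, axis=-1).mean()   # average acceleration magnitude
    return float(1.0 / (1.0 + jerkiness))               # map to (0, 1]

def vamp_like_score(frames: np.ndarray, trajectories: np.ndarray, w: float = 0.5) -> float:
    """Weighted combination of the appearance and motion terms (equal weights by default)."""
    return w * appearance_score(frames) + (1.0 - w) * motion_score(trajectories)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    clip = rng.integers(0, 256, size=(16, 64, 64, 3), dtype=np.uint8)
    tracks = np.cumsum(rng.normal(0, 1, size=(16, 3, 2)), axis=0)
    print(round(vamp_like_score(clip, tracks), 3))
```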

[12] arXiv:2411.13610 [pdf, html, other]
Title: Video2BEV: Transforming Drone Videos to BEVs for Video-based Geo-localization
Authors: Hao Ju, Zhedong Zheng
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Existing approaches to drone visual geo-localization predominantly adopt an image-based setting, where a single drone-view snapshot is matched with images from other platforms. Such a task formulation, however, underutilizes the inherent video output of the drone and is sensitive to occlusions and environmental constraints. To address these limitations, we formulate a new video-based drone geo-localization task and propose the Video2BEV paradigm. This paradigm transforms the video into a Bird's Eye View (BEV), simplifying the subsequent matching process. In particular, we employ Gaussian Splatting to reconstruct a 3D scene and obtain the BEV projection. Unlike existing transform methods, e.g., the polar transform, our BEVs preserve more fine-grained details without significant distortion. To further improve model scalability toward diverse BEVs and satellite images, our Video2BEV paradigm also incorporates a diffusion-based module for generating hard negative samples, which facilitates discriminative feature learning. To validate our approach, we introduce UniV, a new video-based geo-localization dataset that extends the image-based University-1652 dataset. UniV features flight paths at 30° and 45° elevation angles with increased frame rates of up to 10 frames per second (FPS). Extensive experiments on the UniV dataset show that our Video2BEV paradigm achieves competitive recall rates and outperforms conventional video-based methods. Compared to other methods, our proposed approach exhibits robustness at lower elevations with more occlusions.
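
The central transform, flattening a reconstructed 3D scene into a bird's-eye-view image, can be illustrated by rasterising a coloured point cloud onto a top-down grid. The grid extent, the resolution, and the mean-colour aggregation are illustrative choices and stand in for the paper's Gaussian Splatting pipeline:

```python
import numpy as np

def bev_projection(points: np.ndarray, colors: np.ndarray,
                   extent: float = 50.0, resolution: int = 256) -> np.ndarray:
    """Rasterise a z-up point cloud into a top-down (BEV) RGB image.

    points: (N, 3) xyz coordinates in metres; colors: (N, 3) RGB values in [0, 1].
    Each occupied grid cell receives the mean colour of the points that fall into it.
    """
    bev = np.zeros((resolution, resolution, 3), dtype=np.float64)
    counts = np.zeros((resolution, resolution, 1), dtype=np.float64)

    # Map x/y in [-extent, extent] metres to pixel indices.
    ij = ((points[:, :2] + extent) / (2 * extent) * (resolution - 1)).astype(int)
    valid = (ij >= 0).all(axis=1) & (ij < resolution).all(axis=1)

    for (i, j), c in zip(ij[valid], colors[valid]):
        bev[j, i] += c          # row = y index, column = x index
        counts[j, i] += 1.0
    return bev / np.maximum(counts, 1.0)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    pts = rng.uniform(-50, 50, size=(10_000, 3))
    cols = rng.uniform(0, 1, size=(10_000, 3))
    print(bev_projection(pts, cols).shape)  # (256, 256, 3)
```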

[13] arXiv:2411.13619 [pdf, html, other]
Title: Non-Linear Outlier Synthesis for Out-of-Distribution Detection
Authors: Lars Doorenbos, Raphael Sznitman, Pablo Márquez-Neila
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)

The reliability of supervised classifiers is severely hampered by their limitations in dealing with unexpected inputs, leading to great interest in out-of-distribution (OOD) detection. Recently, OOD detectors trained on synthetic outliers, especially those generated by large diffusion models, have shown promising results in defining robust OOD decision boundaries. Building on this progress, we present NCIS, which enhances the quality of synthetic outliers by operating directly in the diffusion model's embedding space, rather than combining disjoint models as in previous work, and by modeling class-conditional manifolds with a conditional volume-preserving network for a more expressive characterization of the training distribution. We demonstrate that these improvements yield new state-of-the-art OOD detection results on the standard ImageNet100 and CIFAR100 benchmarks and provide insights into the importance of data pre-processing and other key design choices. Our code is available at https://github.com/LarsDoorenbos/NCIS.

[14] arXiv:2411.13620 [pdf, html, other]
Title: Robust SG-NeRF: Robust Scene Graph Aided Neural Surface Reconstruction
Authors: Yi Gu, Dongjun Ye, Zhaorui Wang, Jiaxu Wang, Jiahang Cao, Renjing Xu
Comments: Project page: https://rsg-nerf.github.io/RSG-NeRF/
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Neural surface reconstruction relies heavily on accurate camera poses as input. Despite the use of advanced pose estimators such as COLMAP or ARKit, camera poses can still be noisy. Existing pose-NeRF joint optimization methods handle poses with small noise (inliers) effectively but struggle with large noise (outliers), such as mirrored poses. In this work, we focus on mitigating the impact of outlier poses. Our method integrates an inlier-outlier confidence estimation scheme, leveraging scene graph information gathered during the data preparation phase. Unlike previous works that directly use rendering metrics as the reference, we employ a detached color network that omits the viewing direction as input, to minimize the impact of shape-radiance ambiguities. This enhanced confidence-updating strategy effectively differentiates between inlier and outlier poses, allowing us to sample more rays from inlier poses to construct more reliable radiance fields. Additionally, we introduce a re-projection loss based on the current Signed Distance Function (SDF) and pose estimations, strengthening the constraints between matching image pairs. For outlier poses, we adopt a Monte Carlo re-localization method to find better solutions. We also devise a scene-graph updating strategy to provide more accurate information throughout the training process. We validate our approach on the SG-NeRF and DTU datasets. Experimental results demonstrate that our method consistently improves reconstruction quality and pose accuracy.
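
A re-projection loss of the kind mentioned above ties matched pixels in two images together through the current geometry and pose estimates. The sketch below assumes the depth of each match is already available (for example, rendered from the current SDF) and uses a plain L1 error; both choices are illustrative rather than the paper's exact formulation:

```python
import numpy as np

def reprojection_loss(uv1, depth1, uv2, K, T1_c2w, T2_c2w):
    """Mean re-projection error for pixel matches between two posed images.

    uv1, uv2: (N, 2) matched pixel coordinates in image 1 and image 2.
    depth1:   (N,) depth of each match in camera 1 (e.g. rendered from the current SDF).
    K:        (3, 3) shared camera intrinsics.
    T1_c2w, T2_c2w: (4, 4) camera-to-world poses of the two views.
    """
    # Back-project matches from image 1 into world space.
    uv1_h = np.concatenate([uv1, np.ones((len(uv1), 1))], axis=1)      # (N, 3) homogeneous pixels
    cam1_pts = (np.linalg.inv(K) @ uv1_h.T).T * depth1[:, None]        # (N, 3) camera-1 points
    cam1_h = np.concatenate([cam1_pts, np.ones((len(uv1), 1))], axis=1)
    world = (T1_c2w @ cam1_h.T).T                                      # (N, 4) world points

    # Project the world points into image 2 and compare against the matches.
    cam2 = (np.linalg.inv(T2_c2w) @ world.T).T[:, :3]
    proj = (K @ cam2.T).T
    proj_uv = proj[:, :2] / proj[:, 2:3]
    return float(np.abs(proj_uv - uv2).mean())

if __name__ == "__main__":
    K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
    I = np.eye(4)
    uv = np.array([[100.0, 120.0], [300.0, 200.0]])
    # Identical poses and consistent depths give zero error.
    print(reprojection_loss(uv, np.array([2.0, 3.0]), uv, K, I, I))
```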

[15] arXiv:2411.13623 [pdf, html, other]
Title: Unsupervised Foundation Model-Agnostic Slide-Level Representation Learning
Authors: Tim Lenz, Peter Neidlinger, Marta Ligero, Georg Wölflein, Marko van Treeck, Jakob Nikolas Kather
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Representation learning for pathology whole-slide images (WSIs) has primarily relied on weak supervision with Multiple Instance Learning (MIL). This approach yields slide representations that are highly tailored to a specific clinical task. Self-supervised learning (SSL) has been successfully applied to train histopathology foundation models (FMs) for patch embedding generation; however, generating patient- or slide-level embeddings remains challenging. Existing approaches to slide representation learning extend the principles of SSL from patch-level learning to entire slides by aligning different augmentations of the slide or by utilizing multimodal data. By integrating tile embeddings from multiple FMs, we propose a new single-modality SSL method in feature space that generates useful slide representations. Our contrastive pretraining strategy, called COBRA, employs multiple FMs and an architecture based on Mamba-2. COBRA exceeds the performance of state-of-the-art slide encoders on four different public CPTAC cohorts by at least +3.8% AUC on average, despite being pretrained on only 3048 WSIs from TCGA. Additionally, COBRA is readily compatible at inference time with previously unseen feature extractors.
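
As an illustration of contrastive pretraining in feature space, the sketch below computes a symmetric InfoNCE loss between two views of a batch of slide embeddings, here obtained by mean-pooling tile embeddings from two different feature extractors. The pooling operator and the temperature are simplified stand-ins for the paper's Mamba-2-based aggregator and training setup:

```python
import numpy as np

def info_nce(z1: np.ndarray, z2: np.ndarray, temperature: float = 0.2) -> float:
    """Symmetric InfoNCE loss between two aligned batches of slide embeddings.

    z1, z2: (B, D) embeddings of the same B slides under two views
    (e.g. tile embeddings from two different foundation models, pooled per slide).
    """
    def normalize(z):
        return z / (np.linalg.norm(z, axis=1, keepdims=True) + 1e-8)

    z1, z2 = normalize(z1), normalize(z2)
    logits = z1 @ z2.T / temperature                     # (B, B) similarity matrix
    labels = np.arange(len(z1))                          # positives sit on the diagonal

    def cross_entropy(lg):
        lg = lg - lg.max(axis=1, keepdims=True)          # numerical stability
        logp = lg - np.log(np.exp(lg).sum(axis=1, keepdims=True))
        return -logp[np.arange(len(lg)), labels].mean()

    return float(0.5 * (cross_entropy(logits) + cross_entropy(logits.T)))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    tiles_fm1 = rng.normal(size=(8, 500, 384))                 # 8 slides x 500 tiles x 384-dim
    tiles_fm2 = tiles_fm1 + 0.1 * rng.normal(size=(8, 500, 384))
    slide1, slide2 = tiles_fm1.mean(axis=1), tiles_fm2.mean(axis=1)  # mean-pool tiles per slide
    print(round(info_nce(slide1, slide2), 3))
```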

[16] arXiv:2411.13626 [pdf, html, other]
Title: Principles of Visual Tokens for Efficient Video Understanding
Authors: Xinyue Hao, Gen Li, Shreyank N Gowda, Robert B Fisher, Jonathan Huang, Anurag Arnab, Laura Sevilla-Lara
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Video understanding has made huge strides in recent years, relying largely on the power of the transformer architecture. As this architecture is notoriously expensive and video is highly redundant, research into improving efficiency has become particularly relevant. This has led to many creative solutions, including token merging and token selection. While most methods succeed in reducing the cost of the model and maintaining accuracy, an interesting pattern arises: most methods do not outperform the random-sampling baseline. In this paper we take a closer look at this phenomenon and make several observations. First, we develop an oracle for the value of tokens, which exposes a clear Pareto distribution in which most tokens have remarkably low value and just a few carry most of the perceptual information. Second, we analyze why this oracle is extremely hard to learn, as it does not consistently coincide with visual cues. Third, we observe that easy videos need fewer tokens to maintain accuracy. We build on these and further insights to propose a lightweight video model, LITE, that can select a small number of tokens effectively, outperforming state-of-the-art and existing baselines across datasets (Kinetics-400 and Something-Something-V2) in the challenging trade-off of computation (GFLOPs) vs. accuracy.
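
The token-selection idea, keeping only the highest-value tokens before the expensive transformer runs, can be sketched as follows; the linear scorer standing in for the learned value predictor and the fixed keep ratio are illustrative assumptions:

```python
import numpy as np

def select_tokens(tokens: np.ndarray, scorer_w: np.ndarray, keep_ratio: float = 0.1):
    """Keep only the top-scoring visual tokens.

    tokens:   (N, D) patch/token embeddings for one video clip.
    scorer_w: (D,) weights of a (stand-in) linear value predictor.
    Returns (kept_tokens, kept_indices); the transformer then runs on the
    much smaller kept_tokens set.
    """
    scores = tokens @ scorer_w                    # predicted value per token
    k = max(1, int(len(tokens) * keep_ratio))
    kept = np.argsort(scores)[::-1][:k]           # indices of the k most valuable tokens
    return tokens[kept], np.sort(kept)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    toks = rng.normal(size=(1568, 768))           # e.g. 8 frames of 14x14 patches
    w = rng.normal(size=768)
    kept, idx = select_tokens(toks, w, keep_ratio=0.1)
    print(kept.shape, idx[:5])                    # (156, 768) and the first kept indices
```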

[17] arXiv:2411.13628 [pdf, html, other]
Title: MambaDETR: Query-based Temporal Modeling using State Space Model for Multi-View 3D Object Detection
Authors: Tong Ning, Ke Lu, Xirui Jiang, Jian Xue
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Utilizing temporal information to improve the performance of 3D detection has recently made great progress in the field of autonomous driving. Traditional transformer-based temporal fusion methods suffer from quadratic computational cost and information decay as the length of the frame sequence increases. In this paper, we propose a novel method called MambaDETR, whose main idea is to implement temporal fusion in an efficient state space. Moreover, we design a Motion Elimination module to remove relatively static objects from temporal fusion. On the standard nuScenes benchmark, our proposed MambaDETR achieves remarkable results in the 3D object detection task, exhibiting state-of-the-art performance among existing temporal fusion methods.

[18] arXiv:2411.13631 [pdf, other]
Title: Sparse Input View Synthesis: 3D Representations and Reliable Priors
Authors: Nagabhushan Somraj
Comments: PhD thesis of Nagabhushan S N, Dept. of ECE, Indian Institute of Science (IISc); Advisor: Dr. Rajiv Soundararajan; Thesis reviewers: Dr. Kaushik Mitra (IIT Madras), Dr. Aniket Bera (Purdue University); Submitted: May 2024; Accepted and defended: Sep 2024; Abstract condensed, please check the PDF for the full abstract
Subjects: Computer Vision and Pattern Recognition (cs.CV)

Novel view synthesis refers to the problem of synthesizing novel viewpoints of a scene given images from a few viewpoints.
This is a fundamental problem in computer vision and graphics, and it enables a vast variety of applications such as the metaverse, free-view watching of events, video gaming, video stabilization, and video compression. Recent 3D representations such as radiance fields and multi-plane images significantly improve the quality of images rendered from novel viewpoints. However, these models require a dense sampling of input views for high-quality renders, and their performance degrades significantly when only a few input views are available. In this thesis, we focus on the sparse input novel view synthesis problem for both static and dynamic scenes.

[19] arXiv:2411.13632 [pdf, other]
Title: ID-Patch: Robust ID Association for Group Photo Personalization
Authors: Yimeng Zhang, Tiancheng Zhi, Jing Liu, Shen Sang, Liming Jiang, Qing Yan, Sijia Liu, Linjie Luo
Comments: Project page: https://byteaigc.github.io/ID-Patch/
Subjects: Computer Vision and Pattern Recognition (cs.CV)

The ability to synthesize personalized group photos and specify the position of each identity offers immense creative potential. While such imagery can be visually appealing, it presents significant challenges for existing technologies. A persistent issue is identity (ID) leakage, where injected facial features interfere with one another, resulting in low face resemblance, incorrect positioning, and visual artifacts. Existing methods suffer from limitations such as reliance on segmentation models, increased runtime, or a high probability of ID leakage. To address these challenges, we propose ID-Patch, a novel method that provides robust association between identities and 2D positions. Our approach generates an ID patch and ID embeddings from the same facial features: the ID patch is placed on the conditional image for precise spatial control, while the ID embeddings integrate with text embeddings to ensure high resemblance. Experimental results demonstrate that ID-Patch surpasses baseline methods across metrics such as face ID resemblance, ID-position association accuracy, and generation efficiency.
Project Page is: <a href="https://byteaigc.github.io/ID-Patch/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2411.13674" title="Abstract" id="2411.13674"> arXiv:2411.13674 </a> [<a href="/pdf/2411.13674" title="Download PDF" id="pdf-2411.13674" aria-labelledby="pdf-2411.13674">pdf</a>, <a href="https://arxiv.org/html/2411.13674v1" title="View HTML" id="html-2411.13674" aria-labelledby="html-2411.13674" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13674" title="Other formats" id="oth-2411.13674" aria-labelledby="oth-2411.13674">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FabuLight-ASD: Unveiling Speech Activity via Body Language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Carneiro,+H">Hugo Carneiro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wermter,+S">Stefan Wermter</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 8 figures, 3 tables, accepted for publication in Neural Computing and Applications </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Neural and Evolutionary Computing (cs.NE); Sound (cs.SD) </div> <p class='mathjax'> Active speaker detection (ASD) in multimodal environments is crucial for various applications, from video conferencing to human-robot interaction. This paper introduces FabuLight-ASD, an advanced ASD model that integrates facial, audio, and body pose information to enhance detection accuracy and robustness. Our model builds upon the existing Light-ASD framework by incorporating human pose data, represented through skeleton graphs, which minimises computational overhead. Using the Wilder Active Speaker Detection (WASD) dataset, renowned for reliable face and body bounding box annotations, we demonstrate FabuLight-ASD&#39;s effectiveness in real-world scenarios. Achieving an overall mean average precision (mAP) of 94.3%, FabuLight-ASD outperforms Light-ASD, which has an overall mAP of 93.7% across various challenging scenarios. The incorporation of body pose information shows a particularly advantageous impact, with notable improvements in mAP observed in scenarios with speech impairment, face occlusion, and human voice background noise. Furthermore, efficiency analysis indicates only a modest increase in parameter count (27.3%) and multiply-accumulate operations (up to 2.4%), underscoring the model&#39;s efficiency and feasibility. These findings validate the efficacy of FabuLight-ASD in enhancing ASD performance through the integration of body pose data. FabuLight-ASD&#39;s code and model weights are available at <a href="https://github.com/knowledgetechnologyuhh/FabuLight-ASD" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2411.13683" title="Abstract" id="2411.13683"> arXiv:2411.13683 </a> [<a href="/pdf/2411.13683" title="Download PDF" id="pdf-2411.13683" aria-labelledby="pdf-2411.13683">pdf</a>, <a href="https://arxiv.org/html/2411.13683v1" title="View HTML" id="html-2411.13683" aria-labelledby="html-2411.13683" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13683" title="Other formats" id="oth-2411.13683" aria-labelledby="oth-2411.13683">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Extending Video Masked Autoencoders to 128 frames </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gundavarapu,+N+B">Nitesh Bharadwaj Gundavarapu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Friedman,+L">Luke Friedman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Goyal,+R">Raghav Goyal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hegde,+C">Chaitra Hegde</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Agustsson,+E">Eirikur Agustsson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Waghmare,+S+M">Sagar M. Waghmare</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sirotenko,+M">Mikhail Sirotenko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+M">Ming-Hsuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Weyand,+T">Tobias Weyand</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+B">Boqing Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sigal,+L">Leonid Sigal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10.5 pages of main paper, 25 pages total, 4 figures and 10 tables. To appear in NeurIPS&#39;24 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Video understanding has witnessed significant progress with recent video foundation models demonstrating strong performance owing to self-supervised pre-training objectives; Masked Autoencoders (MAE) being the design of choice. Nevertheless, the majority of prior works that leverage MAE pre-training have focused on relatively short video representations (16 / 32 frames in length) largely due to hardware memory and compute limitations that scale poorly with video length due to the dense memory-intensive self-attention decoding. One natural strategy to address these challenges is to subsample tokens to reconstruct during decoding (or decoder masking). In this work, we propose an effective strategy for prioritizing tokens which allows training on longer video sequences (128 frames) and gets better performance than, more typical, random and uniform masking strategies. The core of our approach is an adaptive decoder masking strategy that prioritizes the most important tokens and uses quantized tokens as reconstruction objectives. Our adaptive strategy leverages a powerful MAGVIT-based tokenizer that jointly learns the tokens and their priority. We validate our design choices through exhaustive ablations and observe improved performance of the resulting long-video (128 frames) encoders over short-video (32 frames) counterparts. 
With our long-video masked autoencoder (LVMAE) strategy, we surpass state-of-the-art on Diving48 by 3.9 points and EPIC-Kitchens-100 verb classification by 2.5 points while relying on a simple core architecture and video-only pre-training (unlike some of the prior works that require millions of labeled video-text pairs or specialized encoders). </p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2411.13697" title="Abstract" id="2411.13697"> arXiv:2411.13697 </a> [<a href="/pdf/2411.13697" title="Download PDF" id="pdf-2411.13697" aria-labelledby="pdf-2411.13697">pdf</a>, <a href="/format/2411.13697" title="Other formats" id="oth-2411.13697" aria-labelledby="oth-2411.13697">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decompose and Leverage Preferences from Expert Models for Improving Trustworthiness of MLLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+R">Rui Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+Y">Yuming Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Schlichtkrull,+M">Michael Schlichtkrull</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vlachos,+A">Andreas Vlachos</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) can enhance trustworthiness by aligning with human preferences. As human preference labeling is laborious, recent works employ evaluation models for assessing MLLMs&#39; responses, using the model-based assessments to automate preference dataset construction. This approach, however, faces challenges with MLLMs&#39; lengthy and compositional responses, which often require diverse reasoning skills that a single evaluation model may not fully possess. Additionally, most existing methods rely on closed-source models as evaluators. To address limitations, we propose DecompGen, a decomposable framework that uses an ensemble of open-sourced expert models. DecompGen breaks down each response into atomic verification tasks, assigning each task to an appropriate expert model to generate fine-grained assessments. The DecompGen feedback is used to automatically construct our preference dataset, DGPref. MLLMs aligned with DGPref via preference learning show improvements in trustworthiness, demonstrating the effectiveness of DecompGen. 
</p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2411.13716" title="Abstract" id="2411.13716"> arXiv:2411.13716 </a> [<a href="/pdf/2411.13716" title="Download PDF" id="pdf-2411.13716" aria-labelledby="pdf-2411.13716">pdf</a>, <a href="https://arxiv.org/html/2411.13716v1" title="View HTML" id="html-2411.13716" aria-labelledby="html-2411.13716" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13716" title="Other formats" id="oth-2411.13716" aria-labelledby="oth-2411.13716">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developing Normative Gait Cycle Parameters for Clinical Analysis Using Human Pose Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ranjan,+R">Rahm Ranjan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ahmedt-Aristizabal,+D">David Ahmedt-Aristizabal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Armin,+M+A">Mohammad Ali Armin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+J">Juno Kim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Gait analysis using computer vision is an emerging field in AI, offering clinicians an objective, multi-feature approach to analyse complex movements. Despite its promise, current applications using RGB video data alone are limited in measuring clinically relevant spatial and temporal kinematics and establishing normative parameters essential for identifying movement abnormalities within a gait cycle. This paper presents a data-driven method using RGB video data and 2D human pose estimation for developing normative kinematic gait parameters. By analysing joint angles, an established kinematic measure in biomechanics and clinical practice, we aim to enhance gait analysis capabilities and improve explainability. Our cycle-wise kinematic analysis enables clinicians to simultaneously measure and compare multiple joint angles, assessing individuals against a normative population using just monocular RGB video. This approach expands clinical capacity, supports objective decision-making, and automates the identification of specific spatial and temporal deviations and abnormalities within the gait cycle. 
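As a minimal, self-contained illustration of the cycle-wise joint-angle comparison this abstract describes (not the authors' code), the sketch below computes a knee-style angle from three 2D keypoints and flags gait-cycle phases that deviate from a normative band; the 101-point phase resampling and the 2-SD threshold are assumptions made for the example.

```python
import numpy as np

def joint_angle(a, b, c):
    """Angle (degrees) at keypoint b formed by 2D keypoints a-b-c, e.g. hip-knee-ankle."""
    v1, v2 = np.asarray(a) - np.asarray(b), np.asarray(c) - np.asarray(b)
    cosang = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
    return np.degrees(np.arccos(np.clip(cosang, -1.0, 1.0)))

def cycle_zscores(subject_curve, norm_mean, norm_std):
    """Per-phase deviation of one gait cycle from a normative band (all resampled to 101 points)."""
    return (subject_curve - norm_mean) / (norm_std + 1e-8)

# toy usage: knee-flexion angle over a gait cycle resampled to 101 phase points
phases = np.linspace(0, 100, 101)
subject = 160 - 50 * np.sin(np.radians(phases * 3.6))       # fake subject curve
norm_mean = 160 - 45 * np.sin(np.radians(phases * 3.6))     # fake normative mean
norm_std = np.full(101, 5.0)
flags = np.abs(cycle_zscores(subject, norm_mean, norm_std)) > 2.0   # phases deviating > 2 SD
print(joint_angle((0, 0), (0, 1), (1, 1)), flags.sum())
```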
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2411.13731" title="Abstract" id="2411.13731"> arXiv:2411.13731 </a> [<a href="/pdf/2411.13731" title="Download PDF" id="pdf-2411.13731" aria-labelledby="pdf-2411.13731">pdf</a>, <a href="https://arxiv.org/html/2411.13731v1" title="View HTML" id="html-2411.13731" aria-labelledby="html-2411.13731" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13731" title="Other formats" id="oth-2411.13731" aria-labelledby="oth-2411.13731">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Delta-Influence: Unlearning Poisons via Influence Functions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+W">Wenjie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jiawei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=de+Witt,+C+S">Christian Schroeder de Witt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Prabhu,+A">Ameya Prabhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sanyal,+A">Amartya Sanyal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NeurIPS Workshop on Attributing Model Behavior at Scale (ATTRIB @ NeurIPS 2024) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Addressing data integrity challenges, such as unlearning the effects of data poisoning after model training, is necessary for the reliable deployment of machine learning models. State-of-the-art influence functions, such as EK-FAC, often fail to accurately attribute abnormal model behavior to the specific poisoned training data responsible for the data poisoning attack. In addition, traditional unlearning algorithms often struggle to effectively remove the influence of poisoned samples, particularly when only a few affected examples can be identified. To address these challenges, we introduce $\Delta$-Influence, a novel approach that leverages influence functions to trace abnormal model behavior back to the responsible poisoned training data using as few as one poisoned test example. $\Delta$-Influence applies data transformations that sever the link between poisoned training data and compromised test points without significantly affecting clean data. This allows $\Delta$-Influence to detect large negative shifts in influence scores following data transformations, a phenomenon we term influence collapse, thereby accurately identifying poisoned training data. Unlearning this subset, e.g. through retraining, effectively eliminates the data poisoning. We validate our method across three vision-based poisoning attacks and three datasets, benchmarking against four detection algorithms and five unlearning strategies. We show that $\Delta$-Influence consistently achieves the best unlearning across all settings, showing the promise of influence functions for corrective unlearning. 
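As a rough, self-contained illustration of the influence-collapse test described above (not the authors' implementation), the snippet below flags training points whose influence on a poisoned test example drops sharply after a severing transformation; the drop_frac threshold and the toy scores are assumptions made for the example.

```python
import numpy as np

def influence_collapse(scores_before, scores_after, drop_frac=0.8):
    """
    Flag training points whose influence on a poisoned test example collapses after a
    label/feature-severing transformation: a large negative shift relative to their
    original (positive) influence.
    """
    before = np.asarray(scores_before, dtype=float)
    after = np.asarray(scores_after, dtype=float)
    shift = after - before
    return (before > 0) & (shift < -drop_frac * np.abs(before))

# toy usage with influence scores from any attribution method (e.g. EK-FAC);
# the numbers below are made up purely to exercise the function
before = np.array([0.90, 0.02, 0.85, -0.10, 0.05])
after  = np.array([0.05, 0.03, 0.10, -0.10, 0.04])
print(np.where(influence_collapse(before, after))[0])   # -> indices 0 and 2
```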
Our code is publicly available at: \url{<a href="https://github.com/andyisokay/delta-influence" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2411.13753" title="Abstract" id="2411.13753"> arXiv:2411.13753 </a> [<a href="/pdf/2411.13753" title="Download PDF" id="pdf-2411.13753" aria-labelledby="pdf-2411.13753">pdf</a>, <a href="https://arxiv.org/html/2411.13753v1" title="View HTML" id="html-2411.13753" aria-labelledby="html-2411.13753" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13753" title="Other formats" id="oth-2411.13753" aria-labelledby="oth-2411.13753">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FAST-Splat: Fast, Ambiguity-Free Semantics Transfer in Gaussian Splatting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shorinwa,+O">Ola Shorinwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+J">Jiankai Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Schwager,+M">Mac Schwager</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We present FAST-Splat for fast, ambiguity-free semantic Gaussian Splatting, which seeks to address the main limitations of existing semantic Gaussian Splatting methods, namely: slow training and rendering speeds; high memory usage; and ambiguous semantic object localization. In deriving FAST-Splat , we formulate open-vocabulary semantic Gaussian Splatting as the problem of extending closed-set semantic distillation to the open-set (open-vocabulary) setting, enabling FAST-Splat to provide precise semantic object localization results, even when prompted with ambiguous user-provided natural-language queries. Further, by exploiting the explicit form of the Gaussian Splatting scene representation to the fullest extent, FAST-Splat retains the remarkable training and rendering speeds of Gaussian Splatting. Specifically, while existing semantic Gaussian Splatting methods distill semantics into a separate neural field or utilize neural models for dimensionality reduction, FAST-Splat directly augments each Gaussian with specific semantic codes, preserving the training, rendering, and memory-usage advantages of Gaussian Splatting over neural field methods. These Gaussian-specific semantic codes, together with a hash-table, enable semantic similarity to be measured with open-vocabulary user prompts and further enable FAST-Splat to respond with unambiguous semantic object labels and 3D masks, unlike prior methods. In experiments, we demonstrate that FAST-Splat is 4x to 6x faster to train with a 13x faster data pre-processing step, achieves between 18x to 75x faster rendering speeds, and requires about 3x smaller GPU memory, compared to the best-competing semantic Gaussian Splatting methods. Further, FAST-Splat achieves relatively similar or better semantic segmentation performance compared to existing methods. After the review period, we will provide links to the project website and the codebase. 
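A minimal sketch of the per-Gaussian semantic-code lookup idea described in this abstract, assuming each Gaussian stores a small integer code and a table maps codes to text embeddings of closed-set labels; the random embeddings stand in for CLIP text features and the dict stands in for the hash table, so this is an illustration of the mechanism, not the FAST-Splat code.

```python
import numpy as np

rng = np.random.default_rng(0)

def normalize(x):
    return x / (np.linalg.norm(x, axis=-1, keepdims=True) + 1e-8)

num_gaussians, dim = 10_000, 512
gaussian_codes = rng.integers(0, 4, size=num_gaussians)     # semantic code stored per Gaussian
code_table = {0: "floor", 1: "chair", 2: "table", 3: "mug"} # code -> closed-set label
label_embeds = normalize(rng.normal(size=(4, dim)))         # stand-in for text embeddings

def query_mask(prompt_embed):
    """Return (label, boolean mask over Gaussians) for an open-vocabulary prompt embedding."""
    sims = normalize(prompt_embed) @ label_embeds.T          # similarity to each code's label
    best = int(np.argmax(sims))
    return code_table[best], gaussian_codes == best

label, mask = query_mask(rng.normal(size=dim))
print(label, mask.sum())
```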
</p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2411.13774" title="Abstract" id="2411.13774"> arXiv:2411.13774 </a> [<a href="/pdf/2411.13774" title="Download PDF" id="pdf-2411.13774" aria-labelledby="pdf-2411.13774">pdf</a>, <a href="https://arxiv.org/html/2411.13774v1" title="View HTML" id="html-2411.13774" aria-labelledby="html-2411.13774" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13774" title="Other formats" id="oth-2411.13774" aria-labelledby="oth-2411.13774">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Segment Any Class (SAC): Multi-Class Few-Shot Semantic Segmentation via Class Region Proposals </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zakir,+H+M">Hussni Mohd Zakir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ho,+E+T+W">Eric Tatt Wei Ho</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 2 figures, 3 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The Segment-Anything Model (SAM) is a vision foundation model for segmentation with a prompt-driven framework. SAM generates class-agnostic masks based on user-specified instance-referring prompts. However, adapting SAM for automated segmentation -- where manual input is absent -- of specific object classes often requires additional model training. We present Segment Any Class (SAC), a novel, training-free approach that task-adapts SAM for Multi-class segmentation. SAC generates Class-Region Proposals (CRP) on query images which allows us to automatically generate class-aware prompts on probable locations of class instances. CRPs are derived from elementary intra-class and inter-class feature distinctions without any additional training. Our method is versatile, accommodating any N-way K-shot configurations for the multi-class few-shot semantic segmentation (FSS) task. Unlike gradient-learning adaptation of generalist models which risk the loss of generalization and potentially suffer from catastrophic forgetting, SAC solely utilizes automated prompting and achieves superior results over state-of-the-art methods on the COCO-20i benchmark, particularly excelling in high N-way class scenarios. SAC is an interesting demonstration of a prompt-only approach to adapting foundation models for novel tasks with small, limited datasets without any modifications to the foundation model itself. This method offers interesting benefits such as intrinsic immunity to concept or feature loss and rapid, online task adaptation of foundation models. 
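The class-region-proposal idea above can be illustrated with a toy stand-in (not the paper's CRP derivation): build one prototype per class from support features, compare query features to the prototypes by cosine similarity, and turn the top-scoring locations per class into point prompts for SAM; the feature shapes and the top-k choice are assumptions.

```python
import numpy as np

def class_point_prompts(query_feats, prototypes, k=3):
    """
    query_feats: (H, W, D) query feature map; prototypes: (N, D) per-class prototypes
    (e.g. averaged support features). Returns up to k (y, x) point prompts per class
    at locations where that class wins the cosine-similarity comparison.
    """
    H, W, D = query_feats.shape
    q = query_feats / (np.linalg.norm(query_feats, axis=-1, keepdims=True) + 1e-8)
    p = prototypes / (np.linalg.norm(prototypes, axis=-1, keepdims=True) + 1e-8)
    sims = q.reshape(-1, D) @ p.T                   # (H*W, N)
    winner = sims.argmax(axis=1)
    prompts = {}
    for c in range(p.shape[0]):
        idx = np.where(winner == c)[0]
        top = idx[np.argsort(sims[idx, c])[::-1][:k]] if idx.size else []
        prompts[c] = [(int(i // W), int(i % W)) for i in top]
    return prompts

rng = np.random.default_rng(1)
print(class_point_prompts(rng.normal(size=(32, 32, 64)), rng.normal(size=(5, 64))))
```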
</p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2411.13787" title="Abstract" id="2411.13787"> arXiv:2411.13787 </a> [<a href="/pdf/2411.13787" title="Download PDF" id="pdf-2411.13787" aria-labelledby="pdf-2411.13787">pdf</a>, <a href="https://arxiv.org/html/2411.13787v1" title="View HTML" id="html-2411.13787" aria-labelledby="html-2411.13787" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13787" title="Other formats" id="oth-2411.13787" aria-labelledby="oth-2411.13787">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Edge-Cloud Routing for Text-to-Image Model with Token-Level Multi-Metric Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xin,+Z">Zewei Xin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Q">Qinya Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Niu,+C">Chaoyue Niu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+F">Fan Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Large text-to-image models demonstrate impressive generation capabilities; however, their substantial size necessitates expensive cloud servers for deployment. Conversely, light-weight models can be deployed on edge devices at lower cost but often with inferior generation quality for complex user prompts. To strike a balance between performance and cost, we propose a routing framework, called RouteT2I, which dynamically selects either the large cloud model or the light-weight edge model for each user prompt. Since generated image quality is challenging to measure directly, RouteT2I establishes multi-dimensional quality metrics, particularly by evaluating the similarity between the generated images and both positive and negative texts that describe each specific quality metric. RouteT2I then predicts the expected quality of the generated images by identifying key tokens in the prompt and comparing their impact on the quality. RouteT2I further introduces the Pareto relative superiority to compare the multi-metric quality of the generated images. Based on this comparison and predefined cost constraints, RouteT2I allocates prompts to either the edge or the cloud. Evaluation reveals that RouteT2I significantly reduces the number of requests to the large cloud model while maintaining high-quality image generation. 
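The routing decision described above can be sketched with a simple stand-in for Pareto relative superiority (counting per-metric wins, which may differ from the paper's exact definition); the threshold and budget values are assumptions for the example.

```python
def pareto_relative_superiority(edge_q, cloud_q):
    """Fraction of quality metrics (higher is better) on which the cloud prediction beats the edge."""
    wins = sum(c > e for e, c in zip(edge_q, cloud_q))
    return wins / len(edge_q)

def route(prompts_pred, cloud_budget, threshold=0.5):
    """
    prompts_pred: list of (edge_metrics, cloud_metrics) predicted per prompt.
    Send to the cloud only the prompts where the cloud is predicted to be clearly
    superior, capped by a budget on the number of cloud requests.
    """
    scored = sorted(enumerate(prompts_pred),
                    key=lambda kv: pareto_relative_superiority(*kv[1]), reverse=True)
    cloud_ids = {i for rank, (i, pq) in enumerate(scored)
                 if rank < cloud_budget and pareto_relative_superiority(*pq) > threshold}
    return ["cloud" if i in cloud_ids else "edge" for i in range(len(prompts_pred))]

preds = [([0.8, 0.7, 0.9], [0.82, 0.65, 0.85]),   # cloud barely better on one metric: keep on edge
         ([0.4, 0.5, 0.3], [0.80, 0.90, 0.70])]   # cloud clearly better: send to cloud
print(route(preds, cloud_budget=1))               # -> ['edge', 'cloud']
```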
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2411.13794" title="Abstract" id="2411.13794"> arXiv:2411.13794 </a> [<a href="/pdf/2411.13794" title="Download PDF" id="pdf-2411.13794" aria-labelledby="pdf-2411.13794">pdf</a>, <a href="https://arxiv.org/html/2411.13794v1" title="View HTML" id="html-2411.13794" aria-labelledby="html-2411.13794" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13794" title="Other formats" id="oth-2411.13794" aria-labelledby="oth-2411.13794">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GalaxyEdit: Large-Scale Image Editing Dataset with Enhanced Diffusion Adapter </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bala,+A">Aniruddha Bala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jaiswal,+R">Rohan Jaiswal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rashid,+L">Loay Rashid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Roheda,+S">Siddharth Roheda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Training of large-scale text-to-image and image-to-image models requires a huge amount of annotated data. While text-to-image datasets are abundant, data available for instruction-based image-to-image tasks like object addition and removal is limited. This is because of the several challenges associated with the data generation process, such as, significant human effort, limited automation, suboptimal end-to-end models, data diversity constraints and high expenses. We propose an automated data generation pipeline aimed at alleviating such limitations, and introduce GalaxyEdit - a large-scale image editing dataset for add and remove operations. We fine-tune the SD v1.5 model on our dataset and find that our model can successfully handle a broader range of objects and complex editing instructions, outperforming state-of-the-art methods in FID scores by 11.2\% and 26.1\% for add and remove tasks respectively. Furthermore, in light of on-device usage scenarios, we expand our research to include task-specific lightweight adapters leveraging the ControlNet-xs architecture. While ControlNet-xs excels in canny and depth guided generation, we propose to improve the communication between the control network and U-Net for more intricate add and remove tasks. We achieve this by enhancing ControlNet-xs with non-linear interaction layers based on Volterra filters. Our approach outperforms ControlNet-xs in both add/remove and canny-guided image generation tasks, highlighting the effectiveness of the proposed enhancement. 
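A Volterra-filter-based interaction layer, as mentioned in this abstract, is often approximated by a linear term plus a low-rank factorization of the quadratic term; the sketch below shows that generic construction (an assumption about the form used, not the GalaxyEdit implementation).

```python
import torch
import torch.nn as nn

class VolterraInteraction2d(nn.Module):
    """
    Second-order Volterra-style interaction layer (illustrative): a linear term plus a
    low-rank approximation of the quadratic term, y = W1*x + (Wa*x) * (Wb*x).
    """
    def __init__(self, channels, kernel_size=3):
        super().__init__()
        pad = kernel_size // 2
        self.linear = nn.Conv2d(channels, channels, kernel_size, padding=pad)
        self.quad_a = nn.Conv2d(channels, channels, kernel_size, padding=pad)
        self.quad_b = nn.Conv2d(channels, channels, kernel_size, padding=pad)

    def forward(self, x):
        return self.linear(x) + self.quad_a(x) * self.quad_b(x)

# toy usage on a control-feature tensor
feat = torch.randn(2, 64, 32, 32)
print(VolterraInteraction2d(64)(feat).shape)   # torch.Size([2, 64, 32, 32])
```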
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2411.13797" title="Abstract" id="2411.13797"> arXiv:2411.13797 </a> [<a href="/pdf/2411.13797" title="Download PDF" id="pdf-2411.13797" aria-labelledby="pdf-2411.13797">pdf</a>, <a href="https://arxiv.org/html/2411.13797v1" title="View HTML" id="html-2411.13797" aria-labelledby="html-2411.13797" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13797" title="Other formats" id="oth-2411.13797" aria-labelledby="oth-2411.13797">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hugging Rain Man: A Novel Facial Action Units Dataset for Analyzing Atypical Facial Expressions in Children with Autism Spectrum Disorder </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ji,+Y">Yanfeng Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shutong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+R">Ruyi Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+J">Jingying Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xinzhou Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deng,+Z">Zhengyu Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Quan,+Y">Yuxuan Quan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+J">Junpeng Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Portions of the dataset, features, and pretrained models are accessible at: <a href="https://github.com/Jonas-DL/Hugging-Rain-Man" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Children with Autism Spectrum Disorder (ASD) often exhibit atypical facial expressions. However, the specific objective facial features that underlie this subjective perception remain unclear. In this paper, we introduce a novel dataset, Hugging Rain Man (HRM), which includes facial action units (AUs) manually annotated by FACS experts for both children with ASD and typical development (TD). The dataset comprises a rich collection of posed and spontaneous facial expressions, totaling approximately 130,000 frames, along with 22 AUs, 10 Action Descriptors (ADs), and atypicality ratings. A statistical analysis of static images from the HRM reveals significant differences between the ASD and TD groups across multiple AUs and ADs when displaying the same emotional expressions, confirming that participants with ASD tend to demonstrate more irregular and diverse expression patterns. Subsequently, a temporal regression method was presented to analyze atypicality of dynamic sequences, thereby bridging the gap between subjective perception and objective facial characteristics. Furthermore, baseline results for AU detection are provided for future research reference. This work not only contributes to our understanding of the unique facial expression characteristics associated with ASD but also provides potential tools for ASD early screening. 
Portions of the dataset, features, and pretrained models are accessible at: \url{<a href="https://github.com/Jonas-DL/Hugging-Rain-Man" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. </p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2411.13807" title="Abstract" id="2411.13807"> arXiv:2411.13807 </a> [<a href="/pdf/2411.13807" title="Download PDF" id="pdf-2411.13807" aria-labelledby="pdf-2411.13807">pdf</a>, <a href="https://arxiv.org/html/2411.13807v1" title="View HTML" id="html-2411.13807" aria-labelledby="html-2411.13807" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13807" title="Other formats" id="oth-2411.13807" aria-labelledby="oth-2411.13807">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MagicDriveDiT: High-Resolution Long Video Generation for Autonomous Driving with Adaptive Control </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+R">Ruiyuan Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+K">Kai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiao,+B">Bo Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hong,+L">Lanqing Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+Q">Qiang Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Website: <a href="https://flymin.github.io/magicdrivedit/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The rapid advancement of diffusion models has greatly improved video synthesis, especially in controllable video generation, which is essential for applications like autonomous driving. However, existing methods are limited by scalability and how control conditions are integrated, failing to meet the needs for high-resolution and long videos for autonomous driving applications. In this paper, we introduce MagicDriveDiT, a novel approach based on the DiT architecture, and tackle these challenges. Our method enhances scalability through flow matching and employs a progressive training strategy to manage complex scenarios. By incorporating spatial-temporal conditional encoding, MagicDriveDiT achieves precise control over spatial-temporal latents. Comprehensive experiments show its superior performance in generating realistic street scene videos with higher resolution and more frames. MagicDriveDiT significantly improves video generation quality and spatial-temporal controls, expanding its potential applications across various tasks in autonomous driving. 
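For readers unfamiliar with the flow-matching objective mentioned in the MagicDriveDiT abstract, the generic training step looks like the sketch below (a standard rectified-flow formulation, not the paper's code); the tiny 3D conv stands in for the DiT backbone and ignores the time and condition inputs purely to keep the example runnable.

```python
import torch
import torch.nn as nn

def flow_matching_loss(model, x1, cond):
    """
    One flow-matching training step: interpolate between noise x0 and data x1 along a
    straight line and regress the constant velocity x1 - x0.
    """
    x0 = torch.randn_like(x1)                                 # noise endpoint
    t = torch.rand(x1.shape[0], *[1] * (x1.dim() - 1))        # per-sample time in [0, 1)
    xt = (1 - t) * x0 + t * x1                                # interpolated point
    target_v = x1 - x0
    pred_v = model(xt, t.flatten(), cond)
    return nn.functional.mse_loss(pred_v, target_v)

class TinyVelocityNet(nn.Module):
    """Toy stand-in for a spatial-temporal diffusion transformer."""
    def __init__(self, ch):
        super().__init__()
        self.net = nn.Conv3d(ch, ch, 3, padding=1)
    def forward(self, x, t, cond):
        return self.net(x)                                    # ignores t/cond in this toy

latents = torch.randn(2, 4, 8, 16, 16)                        # (B, C, T, H, W) video latents
print(flow_matching_loss(TinyVelocityNet(4), latents, cond=None).item())
```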
</p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2411.13836" title="Abstract" id="2411.13836"> arXiv:2411.13836 </a> [<a href="/pdf/2411.13836" title="Download PDF" id="pdf-2411.13836" aria-labelledby="pdf-2411.13836">pdf</a>, <a href="https://arxiv.org/html/2411.13836v1" title="View HTML" id="html-2411.13836" aria-labelledby="html-2411.13836" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13836" title="Other formats" id="oth-2411.13836" aria-labelledby="oth-2411.13836">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLIPer: Hierarchically Improving Spatial Representation of CLIP for Open-Vocabulary Semantic Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+L">Lin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+J">Jiale Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+J">Jin Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xiaoheng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pang,+Y">Yanwei Pang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Homepange and code: <a href="https://linsun449.github.io/cliper" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Contrastive Language-Image Pre-training (CLIP) exhibits strong zero-shot classification ability on various image-level tasks, leading to the research to adapt CLIP for pixel-level open-vocabulary semantic segmentation without additional training. The key is to improve spatial representation of image-level CLIP, such as replacing self-attention map at last layer with self-self attention map or vision foundation model based attention map. In this paper, we present a novel hierarchical framework, named CLIPer, that hierarchically improves spatial representation of CLIP. The proposed CLIPer includes an early-layer fusion module and a fine-grained compensation module. We observe that, the embeddings and attention maps at early layers can preserve spatial structural information. Inspired by this, we design the early-layer fusion module to generate segmentation map with better spatial coherence. Afterwards, we employ a fine-grained compensation module to compensate the local details using the self-attention maps of diffusion model. We conduct the experiments on seven segmentation datasets. Our proposed CLIPer achieves the state-of-the-art performance on these datasets. For instance, using ViT-L, CLIPer has the mIoU of 69.8% and 43.3% on VOC and COCO Object, outperforming ProxyCLIP by 9.2% and 4.1% respectively. 
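A rough way to picture the early-layer fusion described above: average attention maps from early layers and use them to spatially propagate the final patch-to-text logits before taking the per-patch argmax. The sketch below uses random tensors and is only a stand-in for the module described in the abstract.

```python
import torch

def early_layer_fusion(patch_text_logits, early_attn_maps):
    """
    patch_text_logits: (P, C) similarity of final-layer patch embeddings to C class-text embeddings.
    early_attn_maps:   list of (P, P) attention maps taken from early layers.
    Averages the early-layer attention and uses it to spatially smooth the logits.
    """
    attn = torch.stack(early_attn_maps).mean(dim=0)
    attn = attn / attn.sum(dim=-1, keepdim=True).clamp_min(1e-8)   # row-normalize
    refined = attn @ patch_text_logits                             # (P, C)
    return refined.argmax(dim=-1)                                  # per-patch class index

P, C = 196, 5                                                      # 14x14 patches, 5 classes
logits = torch.randn(P, C)
attn_layers = [torch.rand(P, P) for _ in range(4)]                 # early-layer attention maps
print(early_layer_fusion(logits, attn_layers).shape)               # torch.Size([196])
```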
</p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2411.13840" title="Abstract" id="2411.13840"> arXiv:2411.13840 </a> [<a href="/pdf/2411.13840" title="Download PDF" id="pdf-2411.13840" aria-labelledby="pdf-2411.13840">pdf</a>, <a href="https://arxiv.org/html/2411.13840v1" title="View HTML" id="html-2411.13840" aria-labelledby="html-2411.13840" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13840" title="Other formats" id="oth-2411.13840" aria-labelledby="oth-2411.13840">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Segment Anything in Light Fields for Real-Time Applications via Constrained Prompting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Goncharov,+N">Nikolai Goncharov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dansereau,+D+G">Donald G. Dansereau</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Segmented light field images can serve as a powerful representation in many computer vision tasks exploiting the geometry and appearance of objects, such as object pose tracking. In the light field domain, segmentation presents an additional objective of recognizing the same segment through all the views. Segment Anything Model 2 (SAM 2) allows producing semantically meaningful segments for monocular images and videos. However, using SAM 2 directly on light fields is highly ineffective due to unexploited constraints. In this work, we present a novel light field segmentation method that adapts SAM 2 to the light field domain without retraining or modifying the model. By utilizing the light field domain constraints, the method produces high quality and view-consistent light field masks, outperforming the SAM 2 video tracking baseline and running 7 times faster, at real-time speed. We achieve this by exploiting the epipolar geometry cues to propagate the masks between the views, probing the SAM 2 latent space to estimate their occlusion, and further prompting SAM 2 for their refinement. 
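The epipolar propagation cue used above has a simple geometric core: in a regularly sampled light field, a point with disparity d shifts by d pixels per unit of view offset, so a central-view mask can be relocated into another sub-aperture view by a shift. The sketch below illustrates only that cue (constant per-segment disparity, integer shift); the occlusion probing and SAM 2 refinement steps are omitted.

```python
import numpy as np

def propagate_mask(mask, view_offset_uv, disparity):
    """
    Propagate a central-view binary mask to another sub-aperture view of a regular light
    field: a point with disparity d shifts by d * (du, dv) pixels between views.
    Integer shift, zero-filled at the borders.
    """
    du, dv = view_offset_uv
    dy, dx = int(round(disparity * dv)), int(round(disparity * du))
    out = np.zeros_like(mask)
    H, W = mask.shape
    ys, xs = np.nonzero(mask)
    ys, xs = ys + dy, xs + dx
    keep = (ys >= 0) & (ys < H) & (xs >= 0) & (xs < W)
    out[ys[keep], xs[keep]] = 1
    return out

central = np.zeros((64, 64), dtype=np.uint8)
central[20:30, 20:30] = 1                                            # a segment in the central view
shifted = propagate_mask(central, view_offset_uv=(2, 0), disparity=1.5)  # view two steps to the right
print(np.nonzero(shifted.any(axis=0))[0][[0, -1]])                   # columns now span 23..32
```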
</p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2411.13842" title="Abstract" id="2411.13842"> arXiv:2411.13842 </a> [<a href="/pdf/2411.13842" title="Download PDF" id="pdf-2411.13842" aria-labelledby="pdf-2411.13842">pdf</a>, <a href="https://arxiv.org/html/2411.13842v1" title="View HTML" id="html-2411.13842" aria-labelledby="html-2411.13842" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13842" title="Other formats" id="oth-2411.13842" aria-labelledby="oth-2411.13842">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Detecting Human Artifacts from Text-to-Image Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+K">Kaihong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+L">Lingzhi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jianming Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Despite recent advancements, text-to-image generation models often produce images containing artifacts, especially in human figures. These artifacts appear as poorly generated human bodies, including distorted, missing, or extra body parts, leading to visual inconsistencies with typical human anatomy and greatly impairing overall fidelity. In this study, we address this challenge by curating Human Artifact Dataset (HAD), the first large-scale dataset specifically designed to identify and localize human artifacts. HAD comprises over 37,000 images generated by several popular text-to-image models, annotated for human artifact localization. Using this dataset, we train the Human Artifact Detection Models (HADM), which can identify diverse artifact types across multiple generative domains and demonstrate strong generalization, even on images from unseen generators. Additionally, to further improve generators&#39; perception of human structural coherence, we use the predictions from our HADM as feedback for diffusion model finetuning. Our experiments confirm a reduction in human artifacts in the resulting model. Furthermore, we showcase a novel application of our HADM in an iterative inpainting framework to correct human artifacts in arbitrary images directly, demonstrating its utility in improving image quality. Our dataset and detection models are available at: \url{<a href="https://github.com/wangkaihong/HADM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2411.13847" title="Abstract" id="2411.13847"> arXiv:2411.13847 </a> [<a href="/pdf/2411.13847" title="Download PDF" id="pdf-2411.13847" aria-labelledby="pdf-2411.13847">pdf</a>, <a href="https://arxiv.org/html/2411.13847v1" title="View HTML" id="html-2411.13847" aria-labelledby="html-2411.13847" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13847" title="Other formats" id="oth-2411.13847" aria-labelledby="oth-2411.13847">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multitask Learning for SAR Ship Detection with Gaussian-Mask Joint Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+M">Ming Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+X">Xin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kaup,+A">André Kaup</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE Transactions on Geoscience and Remote Sensing, vol. 61, pp. 1-16, 2023, Art no. 5214516 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Detecting ships in synthetic aperture radar (SAR) images is challenging due to strong speckle noise, complex surroundings, and varying scales. This paper proposes MLDet, a multitask learning framework for SAR ship detection, consisting of object detection, speckle suppression, and target segmentation tasks. An angle classification loss with aspect ratio weighting is introduced to improve detection accuracy by addressing angular periodicity and object proportions. The speckle suppression task uses a dual-feature fusion attention mechanism to reduce noise and fuse shallow and denoising features, enhancing robustness. The target segmentation task, leveraging a rotated Gaussian-mask, aids the network in extracting target regions from cluttered backgrounds and improves detection efficiency with pixel-level predictions. The Gaussian-mask ensures ship centers have the highest probabilities, gradually decreasing outward under a Gaussian distribution. Additionally, a weighted rotated boxes fusion (WRBF) strategy combines multi-direction anchor predictions, filtering anchors beyond boundaries or with high overlap but low confidence. Extensive experiments on SSDD+ and HRSID datasets demonstrate the effectiveness and superiority of MLDet. 
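The rotated Gaussian-mask target described above is straightforward to construct: a 2D Gaussian peaked at the ship center with axes aligned to the box orientation. The sketch below is a generic construction, not the MLDet code; the sigma-from-size ratio is an assumption.

```python
import numpy as np

def rotated_gaussian_mask(h, w, center, size, angle_rad, k=6.0):
    """
    Gaussian target mask for a rotated box: peak 1.0 at the ship center, decaying outward
    with axes aligned to the box orientation. Sigmas are set to size/k (k=6 is an
    assumption, not the paper's setting).
    """
    cx, cy = center
    sx, sy = size[0] / k, size[1] / k
    ys, xs = np.mgrid[0:h, 0:w]
    dx, dy = xs - cx, ys - cy
    c, s = np.cos(angle_rad), np.sin(angle_rad)
    u = c * dx + s * dy          # coordinates in the rotated box frame
    v = -s * dx + c * dy
    return np.exp(-0.5 * ((u / sx) ** 2 + (v / sy) ** 2))

mask = rotated_gaussian_mask(128, 128, center=(64, 64), size=(60, 16), angle_rad=np.pi / 6)
print(mask.max(), mask[64, 64])   # peak of 1.0 at the ship center
```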
</p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2411.13852" title="Abstract" id="2411.13852"> arXiv:2411.13852 </a> [<a href="/pdf/2411.13852" title="Download PDF" id="pdf-2411.13852" aria-labelledby="pdf-2411.13852">pdf</a>, <a href="/format/2411.13852" title="Other formats" id="oth-2411.13852" aria-labelledby="oth-2411.13852">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dealing with Synthetic Data Contamination in Online Continual Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+M">Maorong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Michel,+N">Nicolas Michel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mao,+J">Jiafeng Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yamasaki,+T">Toshihiko Yamasaki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NeurIPS&#39;24 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Image generation has shown remarkable results in generating high-fidelity realistic images, in particular with the advancement of diffusion-based models. However, the prevalence of AI-generated images may have side effects for the machine learning community that are not clearly identified. Meanwhile, the success of deep learning in computer vision is driven by the massive dataset collected on the Internet. The extensive quantity of synthetic data being added to the Internet would become an obstacle for future researchers to collect &#34;clean&#34; datasets without AI-generated content. Prior research has shown that using datasets contaminated by synthetic images may result in performance degradation when used for training. In this paper, we investigate the potential impact of contaminated datasets on Online Continual Learning (CL) research. We experimentally show that contaminated datasets might hinder the training of existing online CL methods. Also, we propose Entropy Selection with Real-synthetic similarity Maximization (ESRM), a method to alleviate the performance deterioration caused by synthetic images when training online CL models. Experiments show that our method can significantly alleviate performance deterioration, especially when the contamination is severe. For reproducibility, the source code of our work is available at <a href="https://github.com/maorong-wang/ESRM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2411.13860" title="Abstract" id="2411.13860"> arXiv:2411.13860 </a> [<a href="/pdf/2411.13860" title="Download PDF" id="pdf-2411.13860" aria-labelledby="pdf-2411.13860">pdf</a>, <a href="https://arxiv.org/html/2411.13860v1" title="View HTML" id="html-2411.13860" aria-labelledby="html-2411.13860" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13860" title="Other formats" id="oth-2411.13860" aria-labelledby="oth-2411.13860">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decoupled Sparse Priors Guided Diffusion Compression Model for Point Clouds </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+X">Xiaoge Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Z">Zijie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nasim,+M">Mehwish Nasim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+M">Mingtao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mian,+A">Ajmal Mian</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Image and Video Processing (eess.IV) </div> <p class='mathjax'> Lossy compression methods rely on an autoencoder to transform a point cloud into latent points for storage, leaving the inherent redundancy of latent representations unexplored. To reduce redundancy in latent points, we propose a sparse priors guided method that achieves high reconstruction quality, especially at high compression ratios. This is accomplished by a dual-density scheme separately processing the latent points (intended for reconstruction) and the decoupled sparse priors (intended for storage). Our approach features an efficient dual-density data flow that relaxes size constraints on latent points, and hybridizes a progressive conditional diffusion model to encapsulate essential details for reconstruction within the conditions, which are decoupled hierarchically to intra-point and inter-point priors. Specifically, our method encodes the original point cloud into latent points and decoupled sparse priors through separate encoders. Latent points serve as intermediates, while sparse priors act as adaptive conditions. We then employ a progressive attention-based conditional denoiser to generate latent points conditioned on the decoupled priors, allowing the denoiser to dynamically attend to geometric and semantic cues from the priors at each encoding and decoding layer. Additionally, we integrate the local distribution into the arithmetic encoder and decoder to enhance local context modeling of the sparse points. The original point cloud is reconstructed through a point decoder. Compared to state-of-the-art, our method obtains superior rate-distortion trade-off, evidenced by extensive evaluations on the ShapeNet dataset and standard test datasets from MPEG group including 8iVFB, and Owlii. 
</p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2411.13873" title="Abstract" id="2411.13873"> arXiv:2411.13873 </a> [<a href="/pdf/2411.13873" title="Download PDF" id="pdf-2411.13873" aria-labelledby="pdf-2411.13873">pdf</a>, <a href="https://arxiv.org/html/2411.13873v1" title="View HTML" id="html-2411.13873" aria-labelledby="html-2411.13873" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13873" title="Other formats" id="oth-2411.13873" aria-labelledby="oth-2411.13873">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sli2Vol+: Segmenting 3D Medical Images Based on an Object Estimation Guided Correspondence Flow Network </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=An,+D">Delin An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gu,+P">Pengfei Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sonka,+M">Milan Sonka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Chaoli Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+D+Z">Danny Z. Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Deep learning (DL) methods have shown remarkable successes in medical image segmentation, often using large amounts of annotated data for model training. However, acquiring a large number of diverse labeled 3D medical image datasets is highly difficult and expensive. Recently, mask propagation DL methods were developed to reduce the annotation burden on 3D medical images. For example, Sli2Vol (Yeung et al., 2021) proposed a self-supervised framework (SSF) to learn correspondences by matching neighboring slices via slice reconstruction in the training stage; the learned correspondences were then used to propagate a labeled slice to other slices in the test stage. But these methods are still prone to error accumulation due to the inter-slice propagation of reconstruction errors. Also, they do not handle discontinuities well, which can occur between consecutive slices in 3D images, as they emphasize exploiting object continuity. To address these challenges, in this work, we propose a new SSF, called Sli2Vol+, for segmenting any anatomical structures in 3D medical images using only a single annotated slice per training and testing volume. Specifically, in the training stage, we first propagate an annotated 2D slice of a training volume to the other slices, generating pseudo-labels (PLs). Then, we develop a novel Object Estimation Guided Correspondence Flow Network to learn reliable correspondences between consecutive slices and corresponding PLs in a self-supervised manner. In the test stage, such correspondences are utilized to propagate a single annotated slice to the other slices of a test volume. We demonstrate the effectiveness of our method on various medical image segmentation tasks with different datasets, showing better generalizability across different organs, modalities, and modals. 
Code is available at \url{<a href="https://github.com/adlsn/Sli2Volplus" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </p> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2411.13886" title="Abstract" id="2411.13886"> arXiv:2411.13886 </a> [<a href="/pdf/2411.13886" title="Download PDF" id="pdf-2411.13886" aria-labelledby="pdf-2411.13886">pdf</a>, <a href="https://arxiv.org/html/2411.13886v1" title="View HTML" id="html-2411.13886" aria-labelledby="html-2411.13886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13886" title="Other formats" id="oth-2411.13886" aria-labelledby="oth-2411.13886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLFace: A Scalable and Resource-Efficient Continual Learning Framework for Lifelong Face Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hasan,+M+M">Md Mahedi Hasan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sami,+S+M">Shoaib Meraj Sami</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nasrabadi,+N">Nasser Nasrabadi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for publication in the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> An important aspect of deploying face recognition (FR) algorithms in real-world applications is their ability to learn new face identities from a continuous data stream. However, the online training of existing deep neural network-based FR algorithms, which are pre-trained offline on large-scale stationary datasets, encounter two major challenges: (I) catastrophic forgetting of previously learned identities, and (II) the need to store past data for complete retraining from scratch, leading to significant storage constraints and privacy concerns. In this paper, we introduce CLFace, a continual learning framework designed to preserve and incrementally extend the learned knowledge. CLFace eliminates the classification layer, resulting in a resource-efficient FR model that remains fixed throughout lifelong learning and provides label-free supervision to a student model, making it suitable for open-set face recognition during incremental steps. We introduce an objective function that employs feature-level distillation to reduce drift between feature maps of the student and teacher models across multiple stages. Additionally, it incorporates a geometry-preserving distillation scheme to maintain the orientation of the teacher model&#39;s feature embedding. Furthermore, a contrastive knowledge distillation is incorporated to continually enhance the discriminative power of the feature representation by matching similarities between new identities. Experiments on several benchmark FR datasets demonstrate that CLFace outperforms baseline approaches and state-of-the-art methods on unseen identities using both in-domain and out-of-domain datasets. 
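Two of the distillation terms described in the CLFace abstract can be sketched compactly: a feature-level term that penalizes drift from the teacher, and a geometry-preserving term that matches the batch-wise pairwise cosine-similarity structure of the two embedding spaces. This is an illustrative formulation under those assumptions; the paper's exact losses, stage weighting, and contrastive term are omitted.

```python
import torch
import torch.nn.functional as F

def feature_distillation(student_feat, teacher_feat):
    """Reduce drift between student and teacher embeddings (plain L2 for illustration)."""
    return F.mse_loss(student_feat, teacher_feat)

def geometry_preserving_distillation(student_emb, teacher_emb):
    """
    Preserve the orientation structure of the teacher embedding space by matching the
    batch-wise pairwise cosine-similarity matrices of student and teacher.
    """
    s = F.normalize(student_emb, dim=-1)
    t = F.normalize(teacher_emb, dim=-1)
    return F.mse_loss(s @ s.T, t @ t.T)

# toy usage on a batch of 8 face embeddings
student = torch.randn(8, 512, requires_grad=True)
teacher = torch.randn(8, 512)
loss = feature_distillation(student, teacher) + geometry_preserving_distillation(student, teacher)
loss.backward()
print(loss.item())
```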
</p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2411.13901" title="Abstract" id="2411.13901"> arXiv:2411.13901 </a> [<a href="/pdf/2411.13901" title="Download PDF" id="pdf-2411.13901" aria-labelledby="pdf-2411.13901">pdf</a>, <a href="/format/2411.13901" title="Other formats" id="oth-2411.13901" aria-labelledby="oth-2411.13901">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dressing the Imagination: A Dataset for AI-Powered Translation of Text into Fashion Outfits and A Novel KAN Adapter for Enhanced Feature Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deshmukh,+G">Gayatri Deshmukh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=De,+S">Somsubhra De</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sehgal,+C">Chirag Sehgal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gupta,+J+S">Jishu Sen Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mittal,+S">Sparsh Mittal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review at a conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Specialized datasets that capture the fashion industry&#39;s rich language and styling elements can boost progress in AI-driven fashion design. We present FLORA (Fashion Language Outfit Representation for Apparel Generation), the first comprehensive dataset containing 4,330 curated pairs of fashion outfits and corresponding textual descriptions. Each description utilizes industry-specific terminology and jargon commonly used by professional fashion designers, providing precise and detailed insights into the outfits. Hence, the dataset captures the delicate features and subtle stylistic elements necessary to create high-fidelity fashion designs. We demonstrate that fine-tuning generative models on the FLORA dataset significantly enhances their capability to generate accurate and stylistically rich images from textual descriptions of fashion sketches. FLORA will catalyze the creation of advanced AI models capable of comprehending and producing subtle, stylistically rich fashion designs. It will also help fashion designers and end-users to bring their ideas to life. <br>As a second orthogonal contribution, we introduce KAN Adapters, which leverage Kolmogorov-Arnold Networks (KAN) as adaptive modules. They serve as replacements for traditional MLP-based LoRA adapters. With learnable spline-based activations, KAN Adapters excel in modeling complex, non-linear relationships, achieving superior fidelity, faster convergence and semantic alignment. Extensive experiments and ablation studies on our proposed FLORA dataset validate the superiority of KAN Adapters over LoRA adapters. To foster further research and collaboration, we will open-source both the FLORA and our implementation code. 
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2411.13909" title="Abstract" id="2411.13909"> arXiv:2411.13909 </a> [<a href="/pdf/2411.13909" title="Download PDF" id="pdf-2411.13909" aria-labelledby="pdf-2411.13909">pdf</a>, <a href="https://arxiv.org/html/2411.13909v1" title="View HTML" id="html-2411.13909" aria-labelledby="html-2411.13909" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13909" title="Other formats" id="oth-2411.13909" aria-labelledby="oth-2411.13909">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Panther: Illuminate the Sight of Multimodal LLMs with Instruction-Guided Visual Prompts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+H">Honglin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Y">Yuting Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+C">Chenglu Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+J">Jingdong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+M">Ming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+L">Lin Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Multimodal large language models (MLLMs) are rapidly closing the gap to human visual perception capability, yet they still lag behind in attending to subtle image details or locating small objects precisely. Common schemes to tackle these issues include deploying multiple vision encoders or operating on original high-resolution images. Few studies have concentrated on incorporating the textual instruction into improving visual representation, resulting in a loss of focus on some vision-centric tasks, a phenomenon we herein term Amblyopia. In this work, we introduce Panther, an MLLM that closely adheres to user instructions and locates targets of interest precisely, with the finesse of a black panther. Specifically, Panther comprises three integral components: Panther-VE, Panther-Bridge, and Panther-Decoder. Panther-VE integrates user instruction information at the early stages of the vision encoder, thereby extracting the most relevant and useful visual representations. The Panther-Bridge module, equipped with powerful filtering capabilities, significantly reduces redundant visual information, leading to substantial savings in training costs. The Panther-Decoder is versatile and can be employed with any decoder-only LLM architecture. Experimental results, particularly on vision-centric benchmarks, have demonstrated the effectiveness of Panther. 
</p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2411.13918" title="Abstract" id="2411.13918"> arXiv:2411.13918 </a> [<a href="/pdf/2411.13918" title="Download PDF" id="pdf-2411.13918" aria-labelledby="pdf-2411.13918">pdf</a>, <a href="https://arxiv.org/html/2411.13918v1" title="View HTML" id="html-2411.13918" aria-labelledby="html-2411.13918" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13918" title="Other formats" id="oth-2411.13918" aria-labelledby="oth-2411.13918">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quantization without Tears </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fu,+M">Minghao Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+H">Hao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shao,+J">Jie Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+J">Junjie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+K">Ke Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+J">Jianxin Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Deep neural networks, while achieving remarkable success across diverse tasks, demand significant resources, including computation, GPU memory, bandwidth, storage, and energy. Network quantization, as a standard compression and acceleration technique, reduces storage costs and enables potential inference acceleration by discretizing network weights and activations into a finite set of integer values. However, current quantization methods are often complex and sensitive, requiring extensive task-specific hyperparameters, where even a single misconfiguration can impair model performance, limiting generality across different models and tasks. In this paper, we propose Quantization without Tears (QwT), a method that simultaneously achieves quantization speed, accuracy, simplicity, and generality. The key insight of QwT is to incorporate a lightweight additional structure into the quantized network to mitigate information loss during quantization. This structure consists solely of a small set of linear layers, keeping the method simple and efficient. More importantly, it provides a closed-form solution, allowing us to improve accuracy effortlessly in under 2 minutes. Extensive experiments across various vision, language, and multimodal tasks demonstrate that QwT is both highly effective and versatile. In fact, our approach offers a robust solution for network quantization that combines simplicity, accuracy, and adaptability, which provides new insights for the design of novel quantization paradigms. 
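As a rough illustration of how a "closed-form" linear compensation structure can be fitted, the sketch below solves an ordinary least-squares problem that maps quantized activations toward their full-precision counterparts on a calibration batch; it is a generic, assumption-laden example, not the QwT implementation.

```python
# Minimal sketch: fit a linear compensation layer in closed form so quantized features
# better match full-precision ones. All names and shapes here are illustrative assumptions.
import torch

def fit_linear_compensation(feat_q, feat_fp):
    """feat_q, feat_fp: (N, D) activations from the quantized / full-precision model."""
    N = feat_q.shape[0]
    X = torch.cat([feat_q, torch.ones(N, 1)], dim=1)   # append a bias column
    # Closed-form least squares: W = argmin ||X W - feat_fp||^2
    W = torch.linalg.lstsq(X, feat_fp).solution        # shape (D + 1, D)
    return W

def apply_compensation(feat_q, W):
    N = feat_q.shape[0]
    X = torch.cat([feat_q, torch.ones(N, 1)], dim=1)
    return X @ W                                        # corrected activations
```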
</p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2411.13927" title="Abstract" id="2411.13927"> arXiv:2411.13927 </a> [<a href="/pdf/2411.13927" title="Download PDF" id="pdf-2411.13927" aria-labelledby="pdf-2411.13927">pdf</a>, <a href="https://arxiv.org/html/2411.13927v1" title="View HTML" id="html-2411.13927" aria-labelledby="html-2411.13927" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13927" title="Other formats" id="oth-2411.13927" aria-labelledby="oth-2411.13927">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multimodal 3D Reasoning Segmentation with Complex Scenes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xueying Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+L">Lewei Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shao,+L">Ling Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+S">Shijian Lu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The recent development in multimodal learning has greatly advanced the research in 3D scene understanding in various real-world tasks such as embodied AI. However, most existing work shares two typical constraints: 1) they are short of reasoning ability for interaction and interpretation of human intention and 2) they focus on scenarios with single-category objects only, which leads to over-simplified textual descriptions due to the neglect of multi-object scenarios and spatial relations among objects. We bridge the research gaps by proposing a 3D reasoning segmentation task for multiple objects in scenes. The task allows producing 3D segmentation masks and detailed textual explanations enriched by 3D spatial relations among objects. To this end, we create ReasonSeg3D, a large-scale and high-quality benchmark that integrates 3D spatial relations with generated question-answer pairs and 3D segmentation masks. In addition, we design MORE3D, a simple yet effective method that enables multi-object 3D reasoning segmentation with user questions and textual outputs. Extensive experiments show that MORE3D excels in reasoning and segmenting complex multi-object 3D scenes, and the created ReasonSeg3D offers a valuable platform for future exploration of 3D reasoning segmentation. The dataset and code will be released. 
</p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2411.13929" title="Abstract" id="2411.13929"> arXiv:2411.13929 </a> [<a href="/pdf/2411.13929" title="Download PDF" id="pdf-2411.13929" aria-labelledby="pdf-2411.13929">pdf</a>, <a href="https://arxiv.org/html/2411.13929v1" title="View HTML" id="html-2411.13929" aria-labelledby="html-2411.13929" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13929" title="Other formats" id="oth-2411.13929" aria-labelledby="oth-2411.13929">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Transforming Engineering Diagrams: A Novel Approach for P&amp;ID Digitization using Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=St%C3%BCrmer,+J+M">Jan Marius Stürmer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Graumann,+M">Marius Graumann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Koch,+T">Tobias Koch</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The digitization of complex technical systems, such as Piping and Instrumentation Diagrams (P&amp;IDs), is crucial for efficient maintenance and operation of complex systems in hydraulic and process engineering. Previous approaches often rely on separate modules that analyze diagram elements individually, neglecting the diagram&#39;s overall structure. We address this limitation by proposing a novel approach that utilizes the Relationformer, a state-of-the-art deep learning architecture, to extract graphs from P&amp;IDs. Our method leverages the ability of the Relationformer to simultaneously detect objects and their relationships in images, making it suitable for the task of graph extraction from engineering diagrams. We apply our proposed approach to both real-world and synthetically created P&amp;ID datasets, and evaluate its effectiveness by comparing it with a modular digitization approach based on recent literature. We present PID2Graph, the first publicly accessible P&amp;ID dataset featuring comprehensive labels for the graph structure, including symbols, nodes, and their connections, which is used for evaluation. To understand the effect of patching and stitching in both approaches, we compare values before and after merging the patches. For the real-world data, the Relationformer achieves convincing results, outperforming the modular digitization approach for edge detection by more than 25%. Our work provides a comprehensive framework for assessing the performance of P&amp;ID digitization methods and opens up new avenues for research in this area using transformer architectures. The P&amp;ID dataset used for evaluation will be published and publicly available upon acceptance of the paper. 
</p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2411.13949" title="Abstract" id="2411.13949"> arXiv:2411.13949 </a> [<a href="/pdf/2411.13949" title="Download PDF" id="pdf-2411.13949" aria-labelledby="pdf-2411.13949">pdf</a>, <a href="https://arxiv.org/html/2411.13949v1" title="View HTML" id="html-2411.13949" aria-labelledby="html-2411.13949" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13949" title="Other formats" id="oth-2411.13949" aria-labelledby="oth-2411.13949">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Separable Mixture of Low-Rank Adaptation for Continual Visual Instruction Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Ziqi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Che,+C">Chang Che</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Q">Qi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yangyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+Z">Zenglin Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+M">Meng Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Visual instruction tuning (VIT) enables multimodal large language models (MLLMs) to effectively handle a wide range of vision tasks by framing them as language-based instructions. Building on this, continual visual instruction tuning (CVIT) extends the capability of MLLMs to incrementally learn new tasks, accommodating evolving functionalities. While prior work has advanced CVIT through the development of new benchmarks and approaches to mitigate catastrophic forgetting, these efforts largely follow traditional continual learning paradigms, neglecting the unique challenges specific to CVIT. We identify a dual form of catastrophic forgetting in CVIT, where MLLMs not only forget previously learned visual understanding but also experience a decline in instruction following abilities as they acquire new tasks. To address this, we introduce the Separable Mixture of Low-Rank Adaptation (SMoLoRA) framework, which employs separable routing through two distinct modules - one for visual understanding and another for instruction following. This dual-routing design enables specialized adaptation in both domains, preventing forgetting while improving performance. Furthermore, we propose a novel CVIT benchmark that goes beyond existing benchmarks by additionally evaluating a model&#39;s ability to generalize to unseen tasks and handle diverse instructions across various tasks. Extensive experiments demonstrate that SMoLoRA outperforms existing methods in mitigating dual forgetting, improving generalization to unseen tasks, and ensuring robustness in following diverse instructions. 
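To make the "separable routing" idea above more concrete, here is a hedged, illustrative sketch of a frozen linear layer augmented with two independently routed pools of low-rank adapters; the class names, expert counts, and router design are assumptions, not the SMoLoRA code.

```python
# Illustrative sketch only: two separate pools of low-rank adapters with their own
# routers, added on top of a frozen base projection.
import torch
import torch.nn as nn

class LoRAPool(nn.Module):
    def __init__(self, d_in, d_out, rank=8, n_experts=4):
        super().__init__()
        self.A = nn.Parameter(torch.randn(n_experts, d_in, rank) * 0.01)
        self.B = nn.Parameter(torch.zeros(n_experts, rank, d_out))
        self.router = nn.Linear(d_in, n_experts)           # per-token routing weights

    def forward(self, x):                                   # x: (B, d_in)
        gates = torch.softmax(self.router(x), dim=-1)       # (B, E)
        expert_out = torch.einsum('bd,edr,ero->beo', x, self.A, self.B)  # (B, E, d_out)
        return (gates.unsqueeze(-1) * expert_out).sum(dim=1)

class DualRoutedLinear(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.base = nn.Linear(d_in, d_out)
        for p in self.base.parameters():
            p.requires_grad_(False)                         # frozen backbone projection
        self.visual_pool = LoRAPool(d_in, d_out)            # visual-understanding adapters
        self.instruct_pool = LoRAPool(d_in, d_out)          # instruction-following adapters

    def forward(self, x):
        return self.base(x) + self.visual_pool(x) + self.instruct_pool(x)
```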
</p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2411.13961" title="Abstract" id="2411.13961"> arXiv:2411.13961 </a> [<a href="/pdf/2411.13961" title="Download PDF" id="pdf-2411.13961" aria-labelledby="pdf-2411.13961">pdf</a>, <a href="https://arxiv.org/html/2411.13961v1" title="View HTML" id="html-2411.13961" aria-labelledby="html-2411.13961" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13961" title="Other formats" id="oth-2411.13961" aria-labelledby="oth-2411.13961">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Zero-Shot Low-Light Image Enhancement via Joint Frequency Domain Priors Guided Diffusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+J">Jinhong He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Palaiahnakote,+S">Shivakumara Palaiahnakote</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ning,+A">Aoxiang Ning</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+M">Minglong Xue</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Due to the singular nature of real-world paired datasets and the complexity of low-light environments, supervised methods lack a degree of scene generalisation. Meanwhile, limited by poor lighting and content guidance, existing zero-shot methods cannot handle unknown severe degradation well. To address this problem, we propose a new zero-shot low-light enhancement method that compensates for the lack of light and structural information in the diffusion sampling process by effectively combining the wavelet and Fourier frequency domains to construct rich prior information. The key inspiration comes from the similarity between the wavelet and Fourier frequency domains: light and structure information are each closely related to specific frequency-domain regions. Therefore, by transferring the diffusion process to the wavelet low-frequency domain and combining the wavelet and Fourier frequency domains by continuously decomposing them in the inverse process, the constructed rich illumination prior is utilised to guide the image generation enhancement process. Extensive experiments show that the framework is robust and effective in various scenarios. The code will be available at: \href{<a href="https://github.com/hejh8/Joint-Wavelet-and-Fourier-priors-guided-diffusion" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/hejh8/Joint-Wavelet-and-Fourier-priors-guided-diffusion" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
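The following generic snippet shows the kind of frequency-domain information the abstract refers to: a wavelet low-frequency band and a Fourier amplitude/phase decomposition. It is a minimal sketch assuming a single-channel float image, not the authors' pipeline.

```python
# Minimal sketch of extracting wavelet and Fourier frequency-domain priors from an image.
# Assumes `img` is a 2-D float array (H, W); this is generic code, not the paper's method.
import numpy as np
import pywt

def frequency_priors(img):
    # Wavelet decomposition: cA is the low-frequency approximation band,
    # (cH, cV, cD) are horizontal/vertical/diagonal detail bands.
    cA, (cH, cV, cD) = pywt.dwt2(img, 'haar')
    # Fourier decomposition: amplitude largely carries illumination/energy,
    # phase largely carries structure.
    spec = np.fft.fft2(img)
    amplitude, phase = np.abs(spec), np.angle(spec)
    return cA, (cH, cV, cD), amplitude, phase
```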
</p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2411.13975" title="Abstract" id="2411.13975"> arXiv:2411.13975 </a> [<a href="/pdf/2411.13975" title="Download PDF" id="pdf-2411.13975" aria-labelledby="pdf-2411.13975">pdf</a>, <a href="https://arxiv.org/html/2411.13975v1" title="View HTML" id="html-2411.13975" aria-labelledby="html-2411.13975" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13975" title="Other formats" id="oth-2411.13975" aria-labelledby="oth-2411.13975">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Transforming Static Images Using Generative Models for Video Salient Object Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cho,+S">Suhwan Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+M">Minhyeok Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+J">Jungho Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+S">Sangyoun Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In many video processing tasks, leveraging large-scale image datasets is a common strategy, as image data is more abundant and facilitates comprehensive knowledge transfer. A typical approach for simulating video from static images involves applying spatial transformations, such as affine transformations and spline warping, to create sequences that mimic temporal progression. However, in tasks like video salient object detection, where both appearance and motion cues are critical, these basic image-to-video techniques fail to produce realistic optical flows that capture the independent motion properties of each object. In this study, we show that image-to-video diffusion models can generate realistic transformations of static images while understanding the contextual relationships between image components. This ability allows the model to generate plausible optical flows, preserving semantic integrity while reflecting the independent motion of scene elements. By augmenting individual images in this way, we create large-scale image-flow pairs that significantly enhance model training. Our approach achieves state-of-the-art performance across all public benchmark datasets, outperforming existing approaches. 
</p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2411.13981" title="Abstract" id="2411.13981"> arXiv:2411.13981 </a> [<a href="/pdf/2411.13981" title="Download PDF" id="pdf-2411.13981" aria-labelledby="pdf-2411.13981">pdf</a>, <a href="https://arxiv.org/html/2411.13981v1" title="View HTML" id="html-2411.13981" aria-labelledby="html-2411.13981" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13981" title="Other formats" id="oth-2411.13981" aria-labelledby="oth-2411.13981">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Fairness, Diversity and Reliability of Text-to-Image Generative Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vice,+J">Jordan Vice</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Akhtar,+N">Naveed Akhtar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hartley,+R">Richard Hartley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mian,+A">Ajmal Mian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This research is supported by the NISDRG project #20100007, funded by the Australian Government </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The widespread availability of multimodal generative models has sparked critical discussions on their fairness, reliability, and potential for misuse. While text-to-image models can produce high-fidelity, user-guided images, they also exhibit unpredictable behavior and vulnerabilities, which can be exploited to manipulate class or concept representations. To address this, we propose an evaluation framework designed to assess model reliability through their responses to globally- and locally-applied `semantic&#39; perturbations in the embedding space, pinpointing inputs that trigger unreliable behavior. Our approach offers deeper insights into two essential aspects: (i) generative diversity, evaluating the breadth of visual representations for learned concepts, and (ii) generative fairness, examining how removing concepts from input prompts affects semantic guidance. Beyond these evaluations, our method lays the groundwork for detecting unreliable, bias-injected models and retrieval of bias provenance. We will release our code. 
<br>Keywords: Fairness, Reliability, AI Ethics, Bias, Text-to-Image Models </p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2411.13982" title="Abstract" id="2411.13982"> arXiv:2411.13982 </a> [<a href="/pdf/2411.13982" title="Download PDF" id="pdf-2411.13982" aria-labelledby="pdf-2411.13982">pdf</a>, <a href="https://arxiv.org/html/2411.13982v1" title="View HTML" id="html-2411.13982" aria-labelledby="html-2411.13982" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13982" title="Other formats" id="oth-2411.13982" aria-labelledby="oth-2411.13982">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safety Without Semantic Disruptions: Editing-free Safe Image Generation via Context-preserving Dual Latent Reconstruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vice,+J">Jordan Vice</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Akhtar,+N">Naveed Akhtar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hartley,+R">Richard Hartley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mian,+A">Ajmal Mian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This research is supported by the NISDRG project #20100007, funded by the Australian Government </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Training multimodal generative models on large, uncurated datasets can result in users being exposed to harmful, unsafe and controversial or culturally-inappropriate outputs. While model editing has been proposed to remove or filter undesirable concepts in embedding and latent spaces, it can inadvertently damage learned manifolds, distorting concepts in close semantic proximity. We identify limitations in current model editing techniques, showing that even benign, proximal concepts may become misaligned. To address the need for safe content generation, we propose a modular, dynamic solution that leverages safety-context embeddings and a dual reconstruction process using tunable weighted summation in the latent space to generate safer images. Our method preserves global context without compromising the structural integrity of the learned manifolds. We achieve state-of-the-art results on safe image generation benchmarks, while offering controllable variation of model safety. We identify trade-offs between safety and censorship, which presents a necessary perspective in the development of ethical AI models. We will release our code. 
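As a toy illustration of the "tunable weighted summation in the latent space" mentioned above, the snippet below blends two latent reconstructions with a safety weight; the variable names and the specific weighting are assumptions, not the authors' method.

```python
# Illustrative sketch: blend a safety-conditioned latent with the original latent.
# `alpha` controls how strongly the safety reconstruction dominates; names are hypothetical.
import torch

def blend_latents(z_original: torch.Tensor, z_safe: torch.Tensor, alpha: float = 0.7):
    alpha = float(min(max(alpha, 0.0), 1.0))   # keep the weight in [0, 1]
    return alpha * z_safe + (1.0 - alpha) * z_original
```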
<br>Keywords: Text-to-Image Models, Generative AI, Safety, Reliability, Model Editing </p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2411.13997" title="Abstract" id="2411.13997"> arXiv:2411.13997 </a> [<a href="/pdf/2411.13997" title="Download PDF" id="pdf-2411.13997" aria-labelledby="pdf-2411.13997">pdf</a>, <a href="https://arxiv.org/html/2411.13997v1" title="View HTML" id="html-2411.13997" aria-labelledby="html-2411.13997" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13997" title="Other formats" id="oth-2411.13997" aria-labelledby="oth-2411.13997">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mirror Target YOLO: An Improved YOLOv8 Method with Indirect Vision for Heritage Buildings Fire Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+J">Jian Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cheng,+J">JunSheng Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Fires can cause severe damage to heritage buildings, making timely fire detection essential. Traditional dense cabling and drilling can harm these structures, so reducing the number of cameras to minimize such impact is challenging. Additionally, avoiding false alarms due to noise sensitivity and preserving the expertise of managers in fire-prone areas is crucial. To address these needs, we propose a fire detection method based on indirect vision, called Mirror Target YOLO (MITA-YOLO). MITA-YOLO integrates indirect vision deployment and an enhanced detection module. It uses mirror angles to achieve indirect views, solving issues with limited visibility in irregular spaces and aligning each indirect view with the target monitoring area. The Target-Mask module is designed to automatically identify and isolate the indirect vision areas in each image, filtering out non-target areas. This enables the model to inherit managers&#39; expertise in assessing fire-risk zones, improving focus and resistance to interference in fire detection. In our experiments, we created an 800-image fire dataset with indirect vision. Results show that MITA-YOLO significantly reduces camera requirements while achieving superior detection performance compared to other mainstream models. 
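The masking idea described above (keeping only the mirror/indirect-vision regions before detection) can be sketched very simply; the snippet below is a generic illustration with assumed box-shaped target regions, not the MITA-YOLO code.

```python
# Minimal sketch: zero out everything outside the indirect-vision regions so a detector
# only sees the monitored areas. `regions` is an assumed list of (x1, y1, x2, y2) boxes.
import numpy as np

def apply_target_mask(image: np.ndarray, regions):
    """image: (H, W, C) uint8 frame; returns the frame with non-target pixels zeroed."""
    mask = np.zeros(image.shape[:2], dtype=bool)
    for x1, y1, x2, y2 in regions:
        mask[y1:y2, x1:x2] = True
    masked = image.copy()
    masked[~mask] = 0
    return masked
```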
</p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2411.14001" title="Abstract" id="2411.14001"> arXiv:2411.14001 </a> [<a href="/pdf/2411.14001" title="Download PDF" id="pdf-2411.14001" aria-labelledby="pdf-2411.14001">pdf</a>, <a href="https://arxiv.org/html/2411.14001v1" title="View HTML" id="html-2411.14001" aria-labelledby="html-2411.14001" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14001" title="Other formats" id="oth-2411.14001" aria-labelledby="oth-2411.14001">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Graph Domain Adaptation with Dual-branch Encoder and Two-level Alignment for Whole Slide Image-based Survival Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shou,+Y">Yuntao Shou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+P">Peiqiang Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+X">Xingjian Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+X">Xiangyong Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Q">Qian Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Meng,+D">Deyu Meng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In recent years, histopathological whole slide image (WSI)-based survival analysis has attracted much attention in medical image analysis. In practice, WSIs usually come from different hospitals or laboratories, which can be seen as different domains, and thus may have significant differences in imaging equipment, processing procedures, and sample sources. These differences generally result in large gaps in distribution between different WSI domains, and thus the survival analysis models trained on one domain may fail to transfer to another. To address this issue, we propose a Dual-branch Encoder and Two-level Alignment (DETA) framework to explore both feature and category-level alignment between different WSI domains. Specifically, we first formulate the concerned problem as graph domain adaptation (GDA) by virtue of the graph representation of WSIs. Then we construct a dual-branch graph encoder, including the message passing branch and the shortest path branch, to explicitly and implicitly extract semantic information from the graph-represented WSIs. To realize GDA, we propose a two-level alignment approach: at the category level, we develop a coupling technique by virtue of the dual-branch structure, leading to reduced divergence between the category distributions of the two domains; at the feature level, we introduce an adversarial perturbation strategy to better augment source domain features, resulting in improved alignment in feature distribution. To the best of our knowledge, our work is the first attempt to alleviate the domain shift issue for WSI data analysis. Extensive experiments on four TCGA datasets have validated the effectiveness of our proposed DETA framework and demonstrated its superior performance in WSI-based survival analysis. 
</p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2411.14002" title="Abstract" id="2411.14002"> arXiv:2411.14002 </a> [<a href="/pdf/2411.14002" title="Download PDF" id="pdf-2411.14002" aria-labelledby="pdf-2411.14002">pdf</a>, <a href="https://arxiv.org/html/2411.14002v1" title="View HTML" id="html-2411.14002" aria-labelledby="html-2411.14002" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14002" title="Other formats" id="oth-2411.14002" aria-labelledby="oth-2411.14002">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SEMPose: A Single End-to-end Network for Multi-object Pose Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+X">Xin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Hao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+S">Shibei Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+D">Dezong Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In computer vision, estimating the six-degree-of-freedom pose from an RGB image is a fundamental task. However, this task becomes highly challenging in multi-object scenes. Currently, the best methods typically employ an indirect strategy, which identifies 2D and 3D correspondences, and then solves with the Perspective-n-Points method. Yet, this approach cannot be trained end-to-end. Direct methods, on the other hand, suffer from lower accuracy due to challenges such as varying object sizes and occlusions. To address these issues, we propose SEMPose, an end-to-end multi-object pose estimation network. SEMPose utilizes a well-designed texture-shape guided feature pyramid network, effectively tackling the challenge of object size variations. Additionally, it employs an iterative refinement head structure, progressively regressing rotation and translation separately to enhance estimation accuracy. During training, we alleviate the impact of occlusion by selecting positive samples from visible parts. Experimental results demonstrate that SEMPose can perform inference at 32 FPS without requiring inputs other than the RGB image. It can accurately estimate the poses of multiple objects in real time, with inference time unaffected by the number of target objects. On the LM-O and YCB-V datasets, our method outperforms other RGB-based single-model methods, achieving higher accuracy. Even when compared with multi-model methods and approaches that use additional refinement, our results remain competitive. 
</p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2411.14039" title="Abstract" id="2411.14039"> arXiv:2411.14039 </a> [<a href="/pdf/2411.14039" title="Download PDF" id="pdf-2411.14039" aria-labelledby="pdf-2411.14039">pdf</a>, <a href="https://arxiv.org/html/2411.14039v1" title="View HTML" id="html-2411.14039" aria-labelledby="html-2411.14039" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14039" title="Other formats" id="oth-2411.14039" aria-labelledby="oth-2411.14039">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uterine Ultrasound Image Captioning Using Deep Learning Techniques </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Boulesnane,+A">Abdennour Boulesnane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mokhtari,+B">Boutheina Mokhtari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Segueni,+O+R">Oumnia Rana Segueni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Segueni,+S">Slimane Segueni</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Medical imaging has significantly revolutionized medical diagnostics and treatment planning, progressing from early X-ray usage to sophisticated methods like MRIs, CT scans, and ultrasounds. This paper investigates the use of deep learning for medical image captioning, with a particular focus on uterine ultrasound images. These images are vital in obstetrics and gynecology for diagnosing and monitoring various conditions across different age groups. However, their interpretation is often challenging due to their complexity and variability. To address this, a deep learning-based medical image captioning system was developed, integrating Convolutional Neural Networks with a Bidirectional Gated Recurrent Unit network. This hybrid model processes both image and text features to generate descriptive captions for uterine ultrasound images. Our experimental results demonstrate the effectiveness of this approach over baseline methods, with the proposed model achieving superior performance in generating accurate and informative captions, as indicated by higher BLEU and ROUGE scores. By enhancing the interpretation of uterine ultrasound images, our research aims to assist medical professionals in making timely and accurate diagnoses, ultimately contributing to improved patient care. 
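For orientation, a CNN encoder feeding a bidirectional GRU text model, as described in this abstract, can be sketched as follows; the layer sizes and exact wiring are assumptions, not the paper's configuration.

```python
# Schematic sketch of a CNN + bidirectional-GRU captioning model; architecture details
# (channel counts, fusion by concatenation) are illustrative assumptions.
import torch
import torch.nn as nn

class UltrasoundCaptioner(nn.Module):
    def __init__(self, vocab_size, feat_dim=512, embed_dim=256, hidden=256):
        super().__init__()
        self.cnn = nn.Sequential(                      # small CNN image encoder
            nn.Conv2d(1, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, feat_dim),
        )
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.bigru = nn.GRU(embed_dim, hidden, batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * hidden + feat_dim, vocab_size)

    def forward(self, image, caption_tokens):
        img_feat = self.cnn(image)                             # (B, feat_dim)
        txt, _ = self.bigru(self.embed(caption_tokens))        # (B, T, 2*hidden)
        img_feat = img_feat.unsqueeze(1).expand(-1, txt.size(1), -1)
        return self.head(torch.cat([txt, img_feat], dim=-1))   # next-word logits (B, T, V)
```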
</p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2411.14053" title="Abstract" id="2411.14053"> arXiv:2411.14053 </a> [<a href="/pdf/2411.14053" title="Download PDF" id="pdf-2411.14053" aria-labelledby="pdf-2411.14053">pdf</a>, <a href="https://arxiv.org/html/2411.14053v1" title="View HTML" id="html-2411.14053" aria-labelledby="html-2411.14053" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14053" title="Other formats" id="oth-2411.14053" aria-labelledby="oth-2411.14053">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stereo Anything: Unifying Stereo Matching with Large-Scale Mixed Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+X">Xianda Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+C">Chenming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Youmin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nie,+D">Dujun Nie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+R">Ruilin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+W">Wenzhao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Poggi,+M">Matteo Poggi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+L">Long Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code will be available at \url{<a href="https://github.com/XiandaGuo/OpenStereo" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diverse environments. To this end, we scale up the dataset by collecting labeled stereo images and generating synthetic stereo pairs from unlabeled monocular images. To further enrich the model&#39;s ability to generalize across different conditions, we introduce a novel synthetic dataset that complements existing data by adding variability in baselines, camera angles, and scene types. We extensively evaluate the zero-shot capabilities of our model on five public datasets, showcasing its impressive ability to generalize to new, unseen data. Code will be available at \url{<a href="https://github.com/XiandaGuo/OpenStereo" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2411.14062" title="Abstract" id="2411.14062"> arXiv:2411.14062 </a> [<a href="/pdf/2411.14062" title="Download PDF" id="pdf-2411.14062" aria-labelledby="pdf-2411.14062">pdf</a>, <a href="https://arxiv.org/html/2411.14062v1" title="View HTML" id="html-2411.14062" aria-labelledby="html-2411.14062" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14062" title="Other formats" id="oth-2411.14062" aria-labelledby="oth-2411.14062">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMGenBench: Evaluating the Limits of LMMs from the Text-to-Image Generation Perspective </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+H">Hailang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+Z">Zixuan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+H">Huaqiu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+T">Tongwen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chu,+X">Xiangxiang Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+R">Richong Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This project is available at: <a href="https://github.com/lerogo/MMGenBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Multimodal Models (LMMs) have demonstrated remarkable capabilities. While existing benchmarks for evaluating LMMs mainly focus on image comprehension, few works evaluate them from the image generation perspective. To address this issue, we propose a straightforward automated evaluation pipeline. Specifically, this pipeline requires LMMs to generate an image-prompt from a given input image. Subsequently, it employs text-to-image generative models to create a new image based on these generated prompts. Finally, we evaluate the performance of LMMs by comparing the original image with the generated one. Furthermore, we introduce MMGenBench-Test, a comprehensive benchmark developed to evaluate LMMs across 13 distinct image patterns, and MMGenBench-Domain, targeting the performance evaluation of LMMs within the generative image domain. A thorough evaluation involving over 50 popular LMMs demonstrates the effectiveness and reliability in both the pipeline and benchmark. Our observations indicate that numerous LMMs excelling in existing benchmarks fail to adequately complete the basic tasks, related to image understanding and description. This finding highlights the substantial potential for performance improvement in current LMMs and suggests avenues for future model optimization. Concurrently, our pipeline facilitates the efficient assessment of LMMs performance across diverse domains by using solely image inputs. 
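The evaluation pipeline described above (describe the image with an LMM, regenerate it with a text-to-image model, then compare) reduces to a short loop; the sketch below uses placeholder callables whose names are purely illustrative and do not come from the MMGenBench code.

```python
# Skeleton of a describe-then-regenerate evaluation loop. `describe`, `generate`, and
# `similarity` are placeholders for whatever LMM, text-to-image model, and image metric
# one plugs in; this is not the benchmark's implementation.
from typing import Callable, Iterable

def evaluate_lmm(images: Iterable,
                 describe: Callable,      # image -> generated image-prompt (str)
                 generate: Callable,      # prompt -> regenerated image
                 similarity: Callable):   # (original, regenerated) -> float score
    scores = []
    for img in images:
        prompt = describe(img)                 # 1) LMM writes an image-prompt
        regen = generate(prompt)               # 2) text-to-image model redraws the scene
        scores.append(similarity(img, regen))  # 3) compare original vs. regenerated image
    return sum(scores) / max(len(scores), 1)
```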
</p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2411.14064" title="Abstract" id="2411.14064"> arXiv:2411.14064 </a> [<a href="/pdf/2411.14064" title="Download PDF" id="pdf-2411.14064" aria-labelledby="pdf-2411.14064">pdf</a>, <a href="/format/2411.14064" title="Other formats" id="oth-2411.14064" aria-labelledby="oth-2411.14064">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi LoRA Meets Vision: Merging multiple adapters to create a multi task model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kesim,+E">Ege Kesim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Helli,+S+S">Selahattin Serdar Helli</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Parameter efficient finetuning (PEFT) methods are widely used in LLMs and generative models in computer vision. In particular, multiple adapters can be used during inference to change the behavior of the base model. In this paper we investigated whether multiple LoRA adapters trained on computer vision tasks can be merged together and used during inference without loss in performance. By achieving this, multitask models can be created just by merging different LoRAs. Merging these will reduce inference time and it will not require any additional retraining. We have trained adapters on six different tasks and evaluated their performance when they are merged together. For comparison we used a model with a frozen backbone and finetuned its head. Our results show that, even with simple merging techniques, creating a multitask model by merging adapters is achievable, with only a slight loss of performance in some cases. In our experiments we merged up to three adapters together. Depending on the task and the similarity of the data the adapters were trained on, merges can outperform head finetuning. We have observed that LoRAs trained on dissimilar datasets tend to perform better after merging than those trained on similar datasets. 
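The "simple merging techniques" mentioned above typically amount to folding each adapter's low-rank update back into the base weight; the snippet below is a minimal generic sketch of that operation (not the authors' code), with optional per-adapter scales as an assumption.

```python
# Minimal sketch: merge several LoRA adapters (A_i, B_i) into one base weight by
# summing their low-rank updates, optionally scaled per adapter.
import torch

def merge_loras(base_weight: torch.Tensor, adapters, scales=None):
    """base_weight: (d_out, d_in); adapters: list of (A, B) with A (r, d_in), B (d_out, r)."""
    scales = scales or [1.0] * len(adapters)
    merged = base_weight.clone()
    for (A, B), s in zip(adapters, scales):
        merged += s * (B @ A)              # low-rank update, shape (d_out, d_in)
    return merged
```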
</p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2411.14095" title="Abstract" id="2411.14095"> arXiv:2411.14095 </a> [<a href="/pdf/2411.14095" title="Download PDF" id="pdf-2411.14095" aria-labelledby="pdf-2411.14095">pdf</a>, <a href="https://arxiv.org/html/2411.14095v1" title="View HTML" id="html-2411.14095" aria-labelledby="html-2411.14095" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14095" title="Other formats" id="oth-2411.14095" aria-labelledby="oth-2411.14095">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WARLearn: Weather-Adaptive Representation Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Agarwal,+S">Shubham Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Birman,+R">Raz Birman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hadar,+O">Ofer Hadar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for publication in IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> This paper introduces WARLearn, a novel framework designed for adaptive representation learning in challenging and adversarial weather conditions. Leveraging the invariance principle used in Barlow Twins, we demonstrate the capability to port existing models initially trained on clear weather data to effectively handle adverse weather conditions. With minimal additional training, our method exhibits remarkable performance gains in scenarios characterized by fog and low-light conditions. This adaptive framework extends its applicability beyond adverse weather settings, offering a versatile solution for domains exhibiting variations in data distributions. Furthermore, WARLearn is invaluable in scenarios where data distributions undergo significant shifts over time, enabling models to remain updated and accurate. Our experimental findings reveal a remarkable performance, with a mean average precision (mAP) of 52.6% on the unseen real-world foggy dataset (RTTS). Similarly, in low-light conditions, our framework achieves an mAP of 55.7% on the unseen real-world low-light dataset (ExDark). Notably, WARLearn surpasses the performance of state-of-the-art frameworks including FeatEnHancer, Image Adaptive YOLO, DENet, C2PNet, PairLIE and ZeroDCE, by a substantial margin in adverse weather, improving the baseline performance in both foggy and low-light conditions. 
The WARLearn code is available at <a href="https://github.com/ShubhamAgarwal12/WARLearn" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2411.14119" title="Abstract" id="2411.14119"> arXiv:2411.14119 </a> [<a href="/pdf/2411.14119" title="Download PDF" id="pdf-2411.14119" aria-labelledby="pdf-2411.14119">pdf</a>, <a href="https://arxiv.org/html/2411.14119v1" title="View HTML" id="html-2411.14119" aria-labelledby="html-2411.14119" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14119" title="Other formats" id="oth-2411.14119" aria-labelledby="oth-2411.14119">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Aware Regression for Socio-Economic Estimation via Multi-View Remote Sensing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ishida,+S">Sahoko Ishida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+M">Mengyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jenson,+D">Daniel Jenson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mishra,+S">Swapnil Mishra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Navott,+J">Jhonathan Navott</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Flaxman,+S">Seth Flaxman</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Remote sensing imagery offers rich spectral data across extensive areas for Earth observation. Many attempts have been made to leverage these data with transfer learning to develop scalable alternatives for estimating socio-economic conditions, reducing reliance on expensive survey-collected data. However, much of this research has primarily focused on daytime satellite imagery due to the limitation that most pre-trained models are trained on 3-band RGB images. Consequently, modeling techniques for spectral bands beyond the visible spectrum have not been thoroughly investigated. Additionally, quantifying uncertainty in remote sensing regression has been less explored, yet it is essential for more informed targeting and iterative collection of ground truth survey data. In this paper, we introduce a novel framework that leverages generic foundational vision models to process remote sensing imagery using combinations of three spectral bands to exploit multi-spectral data. We also employ methods such as heteroscedastic regression and Bayesian modeling to generate uncertainty estimates for the predictions. Experimental results demonstrate that our method outperforms existing models that use RGB or multi-spectral models with unstructured band usage. Moreover, our framework helps identify uncertain predictions, guiding future ground truth data acquisition. 
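The heteroscedastic regression mentioned in this abstract is commonly implemented by predicting a per-sample mean and log-variance trained with a Gaussian negative log-likelihood; the sketch below is a generic illustration of that recipe, not the paper's model.

```python
# Generic sketch of heteroscedastic regression: the head predicts a mean and a
# log-variance per sample, and the predicted variance serves as an uncertainty estimate.
import torch
import torch.nn as nn

class HeteroscedasticHead(nn.Module):
    def __init__(self, d_in):
        super().__init__()
        self.mean = nn.Linear(d_in, 1)
        self.log_var = nn.Linear(d_in, 1)

    def forward(self, feats):
        return self.mean(feats), self.log_var(feats)

def gaussian_nll(mean, log_var, target):
    # 0.5 * [ log sigma^2 + (y - mu)^2 / sigma^2 ]  (constant term dropped)
    return (0.5 * (log_var + (target - mean) ** 2 / log_var.exp())).mean()
```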
</p> </div> </dd> <dt> <a name='item58'>[58]</a> <a href ="/abs/2411.14120" title="Abstract" id="2411.14120"> arXiv:2411.14120 </a> [<a href="/pdf/2411.14120" title="Download PDF" id="pdf-2411.14120" aria-labelledby="pdf-2411.14120">pdf</a>, <a href="https://arxiv.org/html/2411.14120v1" title="View HTML" id="html-2411.14120" aria-labelledby="html-2411.14120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14120" title="Other formats" id="oth-2411.14120" aria-labelledby="oth-2411.14120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Point Cloud Resampling with Learnable Heat Diffusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+W">Wenqiang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dai,+W">Wenrui Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+D">Duoduo Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Z">Ziyang Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+C">Chenglin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zou,+J">Junni Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiong,+H">Hongkai Xiong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Generative diffusion models have shown empirical successes in point cloud resampling, generating a denser and more uniform distribution of points from sparse or noisy 3D point clouds by progressively refining noise into structure. However, existing diffusion models employ manually predefined schemes, which often fail to recover the underlying point cloud structure due to the rigid and disruptive nature of the geometric degradation. To address this issue, we propose a novel learnable heat diffusion framework for point cloud resampling, which directly parameterizes the marginal distribution for the forward process by learning the adaptive heat diffusion schedules and local filtering scales of the time-varying heat kernel, and consequently, generates an adaptive conditional prior for the reverse process. Unlike previous diffusion models with a fixed prior, the adaptive conditional prior selectively preserves geometric features of the point cloud by minimizing a refined variational lower bound, guiding the points to evolve towards the underlying surface during the reverse process. Extensive experimental results demonstrate that the proposed point cloud resampling achieves state-of-the-art performance in representative reconstruction tasks including point cloud denoising and upsampling. 
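To ground the heat-diffusion idea above, a single (non-learnable) diffusion step on a k-NN graph of points is just Laplacian smoothing with a step size; the toy sketch below shows that step under assumed parameters, whereas the paper learns the diffusion schedules and kernel scales.

```python
# Toy sketch of one heat-diffusion step on a k-NN graph of points (plain Laplacian
# smoothing with step size `tau`); the learnable schedule of the paper is not modeled here.
import torch

def heat_diffusion_step(points: torch.Tensor, k: int = 8, tau: float = 0.2):
    """points: (N, 3). Returns points moved toward the average of their k neighbours."""
    dists = torch.cdist(points, points)                        # (N, N) pairwise distances
    knn_idx = dists.topk(k + 1, largest=False).indices[:, 1:]  # drop the self-neighbour
    neighbours = points[knn_idx]                               # (N, k, 3)
    laplacian = points - neighbours.mean(dim=1)                # x - average of neighbours
    return points - tau * laplacian
```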
</p> </div> </dd> <dt> <a name='item59'>[59]</a> <a href ="/abs/2411.14125" title="Abstract" id="2411.14125"> arXiv:2411.14125 </a> [<a href="/pdf/2411.14125" title="Download PDF" id="pdf-2411.14125" aria-labelledby="pdf-2411.14125">pdf</a>, <a href="https://arxiv.org/html/2411.14125v1" title="View HTML" id="html-2411.14125" aria-labelledby="html-2411.14125" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14125" title="Other formats" id="oth-2411.14125" aria-labelledby="oth-2411.14125">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RestorerID: Towards Tuning-Free Face Restoration with ID Preservation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ying,+J">Jiacheng Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+M">Mushui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Z">Zhe Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+R">Runming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+Z">Zhu Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fu,+S">Siming Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+S">Si-Yuan Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+C">Chao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+Y">Yunlong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+H">Hui-Liang Shen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 10 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Blind face restoration has made great progress in producing high-quality and lifelike images. Yet it remains challenging to preserve the ID information especially when the degradation is heavy. Current reference-guided face restoration approaches either require face alignment or personalized test-tuning, which are unfaithful or time-consuming. In this paper, we propose a tuning-free method named RestorerID that incorporates ID preservation during face restoration. RestorerID is a diffusion model-based method that restores low-quality images with varying levels of degradation by using a single reference image. To achieve this, we propose a unified framework to combine the ID injection with the base blind face restoration model. In addition, we design a novel Face ID Rebalancing Adapter (FIR-Adapter) to tackle the problems of content inconsistency and contour misalignment that are caused by information conflicts between the low-quality input and reference image. Furthermore, by employing an Adaptive ID-Scale Adjusting strategy, RestorerID can produce superior restored images across various levels of degradation. Experimental results on the Celeb-Ref dataset and real-world scenarios demonstrate that RestorerID effectively delivers high-quality face restoration with ID preservation, achieving a superior performance compared to the test-tuning approaches and other reference-guided ones. The code of RestorerID is available at \url{<a href="https://github.com/YingJiacheng/RestorerID" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item60'>[60]</a> <a href ="/abs/2411.14137" title="Abstract" id="2411.14137"> arXiv:2411.14137 </a> [<a href="/pdf/2411.14137" title="Download PDF" id="pdf-2411.14137" aria-labelledby="pdf-2411.14137">pdf</a>, <a href="https://arxiv.org/html/2411.14137v1" title="View HTML" id="html-2411.14137" aria-labelledby="html-2411.14137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14137" title="Other formats" id="oth-2411.14137" aria-labelledby="oth-2411.14137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Visual Contexts Clarify Ambiguous Expressions: A Benchmark Dataset </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nam,+H">Heejeong Nam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ahn,+J">Jinwoo Ahn</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The ability to perform complex reasoning across multimodal inputs is essential for models to effectively interact with humans in real-world scenarios. Advancements in vision-language models have significantly improved performance on tasks that require processing explicit and direct textual inputs, such as Visual Question Answering (VQA) and Visual Grounding (VG). However, less attention has been given to improving the model capabilities to comprehend nuanced and ambiguous forms of communication. This presents a critical challenge, as human language in real-world interactions often conveys hidden intentions that rely on context for accurate interpretation. To address this gap, we propose VAGUE, a multimodal benchmark comprising 3.9K indirect human utterances paired with corresponding scenes. Additionally, we contribute a model-based pipeline for generating prompt-solution pairs from input images. Our work aims to delve deeper into the ability of models to understand indirect communication and seeks to contribute to the development of models capable of more refined and human-like interactions. Extensive evaluation on multiple VLMs reveals that mainstream models still struggle with indirect communication when required to perform complex linguistic and visual reasoning. We release our code and data at <a href="https://github.com/Hazel-Heejeong-Nam/VAGUE.git" rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
</p> </div> </dd> <dt> <a name='item61'>[61]</a> <a href ="/abs/2411.14158" title="Abstract" id="2411.14158"> arXiv:2411.14158 </a> [<a href="/pdf/2411.14158" title="Download PDF" id="pdf-2411.14158" aria-labelledby="pdf-2411.14158">pdf</a>, <a href="https://arxiv.org/html/2411.14158v1" title="View HTML" id="html-2411.14158" aria-labelledby="html-2411.14158" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14158" title="Other formats" id="oth-2411.14158" aria-labelledby="oth-2411.14158">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Point Cloud Denoising With Fine-Granularity Dynamic Graph Convolutional Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+W">Wenqiang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dai,+W">Wenrui Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+D">Duoduo Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Z">Ziyang Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+C">Chenglin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zou,+J">Junni Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiong,+H">Hongkai Xiong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Due to limitations in acquisition equipment, noise perturbations often corrupt 3-D point clouds, hindering downstream tasks such as surface reconstruction, rendering, and further processing. Existing 3-D point cloud denoising methods typically fail to reliably fit the underlying continuous surface, resulting in a degradation of reconstruction performance. This paper introduces fine-granularity dynamic graph convolutional networks called GD-GCN, a novel approach to denoising in 3-D point clouds. The GD-GCN employs micro-step temporal graph convolution (MST-GConv) to perform feature learning in a gradual manner. Compared with the conventional GCN, which commonly uses discrete integer-step graph convolution, this modification introduces a more adaptable and nuanced approach to feature learning within graph convolution networks. It more accurately depicts the process of fitting the noisy point cloud to the underlying surface: the learning process of MST-GConv behaves as a continuous dynamical system governed by neural partial differential equations (neural PDEs), which allows the representation to adapt and improve gradually over time. GD-GCN approximates the Riemannian metric, calculating distances between points along a low-dimensional manifold. This capability allows it to understand the local geometric structure and effectively capture diverse relationships between points from different geometric regions through geometric graph construction based on Riemannian distances. Additionally, GD-GCN incorporates robust graph spectral filters based on the Bernstein polynomial approximation, which modulate eigenvalues for complex and arbitrary spectral responses, providing theoretical guarantees for BIBO stability. Symmetric channel mixing matrices further enhance filter flexibility by enabling channel-level scaling and shifting in the spectral domain.
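The Bernstein-polynomial spectral filter referred to above has the standard form g(L)X = sum_{k=0}^{K} theta_k C(K,k) (L/2)^k (I - L/2)^(K-k) X for a normalized Laplacian whose spectrum lies in [0, 2]; a small dense-matrix sketch with hand-set coefficients (they are learned in GD-GCN) is: <pre>
# Sketch of a Bernstein-polynomial graph spectral filter applied to node features.
# theta would be learned in GD-GCN; here it is a fixed low-pass-ish choice.
import numpy as np
from math import comb

def normalized_laplacian(W):
    d = W.sum(axis=1)
    d_inv_sqrt = np.where(d > 0, 1.0 / np.sqrt(d), 0.0)
    return np.eye(len(W)) - d_inv_sqrt[:, None] * W * d_inv_sqrt[None, :]

def bernstein_filter(L, X, theta):
    """Apply sum_k theta_k * C(K,k) * (L/2)^k (I - L/2)^(K-k) to features X."""
    K = len(theta) - 1
    A = L / 2.0
    B = np.eye(L.shape[0]) - A
    out = np.zeros_like(X)
    for k, th in enumerate(theta):
        out += th * comb(K, k) * (np.linalg.matrix_power(A, k)
                                  @ np.linalg.matrix_power(B, K - k) @ X)
    return out

if __name__ == "__main__":
    W = np.random.rand(64, 64); W = (W + W.T) / 2; np.fill_diagonal(W, 0)
    X = np.random.randn(64, 3)                 # e.g. noisy xyz features per node
    theta = [1.0, 0.6, 0.3, 0.1, 0.0]          # coefficients over the [0, 2] spectrum
    print(bernstein_filter(normalized_laplacian(W), X, theta).shape)   # (64, 3)
</pre>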
</p> </div> </dd> <dt> <a name='item62'>[62]</a> <a href ="/abs/2411.14164" title="Abstract" id="2411.14164"> arXiv:2411.14164 </a> [<a href="/pdf/2411.14164" title="Download PDF" id="pdf-2411.14164" aria-labelledby="pdf-2411.14164">pdf</a>, <a href="https://arxiv.org/html/2411.14164v1" title="View HTML" id="html-2411.14164" aria-labelledby="html-2411.14164" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14164" title="Other formats" id="oth-2411.14164" aria-labelledby="oth-2411.14164">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FoPru: Focal Pruning for Efficient Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+L">Lei Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+W">Weizhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+T">Tongxuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zeng,+Y">Yuting Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cheng,+L">Lechao Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+X">Xiaohua Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Vision-Language Models (LVLMs) represent a significant advancement toward achieving superior multimodal capabilities by enabling powerful Large Language Models (LLMs) to understand visual input. Typically, LVLMs utilize visual encoders, such as CLIP, to transform images into visual tokens, which are then aligned with textual tokens through projection layers before being input into the LLM for inference. Although existing LVLMs have achieved significant success, their inference efficiency is still limited by the substantial number of visual tokens and the potential redundancy among them. To mitigate this issue, we propose Focal Pruning (FoPru), a training-free method that prunes visual tokens based on the attention-based token significance derived from the vision encoder. Specifically, we introduce two alternative pruning strategies: 1) the rank strategy, which leverages all token significance scores to retain more critical tokens in a global view; 2) the row strategy, which focuses on preserving continuous key information in images from a local perspective. Finally, the selected tokens are reordered to maintain their original positional relationships. Extensive experiments across various LVLMs and multimodal datasets demonstrate that our method can prune a large number of redundant tokens while maintaining high accuracy, leading to significant improvements in inference efficiency. 
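A minimal sketch of the rank-style pruning idea (score visual tokens, keep the top-k, then restore their original ordering); the attention-derived significance assumed below stands in for, and need not match, the exact FoPru scoring: <pre>
# Illustrative attention-based visual token pruning (rank strategy):
# keep the top-k tokens by significance, then re-sort survivors to original order.
import torch

def prune_by_rank(visual_tokens, significance, keep_ratio=0.25):
    """
    visual_tokens: (B, N, D) patch tokens from the vision encoder
    significance:  (B, N) e.g. CLS-to-patch attention averaged over heads
    """
    B, N, D = visual_tokens.shape
    k = max(1, int(N * keep_ratio))
    topk_idx = significance.topk(k, dim=1).indices        # (B, k), unordered
    keep_idx = topk_idx.sort(dim=1).values                # restore positional order
    kept = torch.gather(visual_tokens, 1, keep_idx.unsqueeze(-1).expand(B, k, D))
    return kept, keep_idx

if __name__ == "__main__":
    tokens = torch.randn(2, 576, 1024)          # e.g. a 24x24 CLIP patch grid
    attn_cls = torch.rand(2, 576)               # stand-in for attention significance
    kept, idx = prune_by_rank(tokens, attn_cls, keep_ratio=0.25)
    print(kept.shape)                           # torch.Size([2, 144, 1024])
</pre>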
</p> </div> </dd> <dt> <a name='item63'>[63]</a> <a href ="/abs/2411.14169" title="Abstract" id="2411.14169"> arXiv:2411.14169 </a> [<a href="/pdf/2411.14169" title="Download PDF" id="pdf-2411.14169" aria-labelledby="pdf-2411.14169">pdf</a>, <a href="https://arxiv.org/html/2411.14169v1" title="View HTML" id="html-2411.14169" aria-labelledby="html-2411.14169" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14169" title="Other formats" id="oth-2411.14169" aria-labelledby="oth-2411.14169">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Spatiotemporal Decoupling for Efficient Vision-Based Occupancy Forecasting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+J">Jingyi Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xieyuanli Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+J">Junyi Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+J">Jiawei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+J">Jintao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pei,+L">Ling Pei</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to neglecting the bias and uneven distribution of changing occupancy states in both space and time. In this paper, we propose a novel spatiotemporal decoupling vision-based paradigm to explicitly tackle the bias and achieve both effective and efficient 3D OCF. To tackle spatial bias in empty areas, we introduce a novel spatial representation that decouples the conventional dense 3D format into 2D bird&#39;s-eye view (BEV) occupancy with corresponding height values, enabling 3D OCF derived only from 2D predictions thus enhancing efficiency. To reduce temporal bias on static voxels, we design temporal decoupling to improve end-to-end OCF by temporally associating instances via predicted flows. We develop an efficient multi-head network EfficientOCF to achieve 3D OCF with our devised spatiotemporally decoupled representation. A new metric, conditional IoU (C-IoU), is also introduced to provide a robust 3D OCF performance assessment, especially in datasets with missing or incomplete annotations. The experimental results demonstrate that EfficientOCF surpasses existing baseline methods on accuracy and efficiency, achieving state-of-the-art performance with a fast inference time of 82.33ms with a single GPU. Our code will be released as open source. 
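The spatial decoupling can be pictured as collapsing a dense X-Y-Z occupancy grid into a 2D BEV occupancy map plus one height value per cell, from which a coarse 3D volume can be re-expanded; a toy NumPy sketch (taking the top occupied voxel as the height, an assumption made purely for illustration): <pre>
# Toy illustration of decoupling 3D occupancy into BEV occupancy + height,
# and re-expanding it back into a (coarse) 3D grid.
import numpy as np

def decouple(occ_3d):
    """occ_3d: (X, Y, Z) boolean voxels -> (bev, height), height = top occupied z index."""
    bev = occ_3d.any(axis=2)                                   # (X, Y)
    z_idx = np.arange(occ_3d.shape[2])
    height = np.where(bev, (occ_3d * z_idx).max(axis=2), -1)   # -1 marks empty columns
    return bev, height

def recompose(bev, height, z_dim):
    """Fill every occupied column up to its stored height (a simplification)."""
    z_idx = np.arange(z_dim)[None, None, :]
    return bev[:, :, None] & (z_idx <= height[:, :, None])

if __name__ == "__main__":
    occ = np.zeros((4, 4, 8), dtype=bool)
    occ[1, 2, :5] = True
    bev, h = decouple(occ)
    print(bev[1, 2], h[1, 2])                   # True 4
    print(recompose(bev, h, 8)[1, 2].sum())     # 5
</pre>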
</p> </div> </dd> <dt> <a name='item64'>[64]</a> <a href ="/abs/2411.14179" title="Abstract" id="2411.14179"> arXiv:2411.14179 </a> [<a href="/pdf/2411.14179" title="Download PDF" id="pdf-2411.14179" aria-labelledby="pdf-2411.14179">pdf</a>, <a href="https://arxiv.org/html/2411.14179v1" title="View HTML" id="html-2411.14179" aria-labelledby="html-2411.14179" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14179" title="Other formats" id="oth-2411.14179" aria-labelledby="oth-2411.14179">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CompetitorFormer: Competitor Transformer for 3D Instance Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+D">Duanchu Wang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+J">Jing Liu</a> (2), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+H">Haoran Gong</a> (2), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Quan,+Y">Yinghui Quan</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+D">Di Wang</a> (2) ((1) School of Electronic Engineering, Xidian University (2) School of Software Engineering, Xian Jiaotong University)</div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Transformer-based methods have become the dominant approach for 3D instance segmentation. These methods predict instance masks via instance queries, ranking them by classification confidence and IoU scores to select the top prediction as the final outcome. However, it has been observed that the current models employ a fixed and higher number of queries than the instances present within a scene. In such instances, multiple queries predict the same instance, yet only a single query is ultimately optimized. The close scores of queries in the lower-level decoders make it challenging for the dominant query to distinguish itself rapidly, which ultimately impairs the model&#39;s accuracy and convergence efficiency. This phenomenon is referred to as inter-query competition. To address this challenge, we put forth a series of plug-and-play competition-oriented designs, collectively designated as the CompetitorFormer, with the aim of reducing competition and facilitating a dominant query. Experiments showed that integrating our designs with state-of-the-art frameworks consistently resulted in significant performance improvements in 3D instance segmentation across a range of datasets. 
</p> </div> </dd> <dt> <a name='item65'>[65]</a> <a href ="/abs/2411.14193" title="Abstract" id="2411.14193"> arXiv:2411.14193 </a> [<a href="/pdf/2411.14193" title="Download PDF" id="pdf-2411.14193" aria-labelledby="pdf-2411.14193">pdf</a>, <a href="https://arxiv.org/html/2411.14193v1" title="View HTML" id="html-2411.14193" aria-labelledby="html-2411.14193" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14193" title="Other formats" id="oth-2411.14193" aria-labelledby="oth-2411.14193">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ComfyGI: Automatic Improvement of Image Generation Workflows </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sobania,+D">Dominik Sobania</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Briesch,+M">Martin Briesch</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rothlauf,+F">Franz Rothlauf</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Neural and Evolutionary Computing (cs.NE) </div> <p class='mathjax'> Automatic image generation is no longer just of interest to researchers, but also to practitioners. However, current models are sensitive to the settings used, and automatic optimization methods often require human involvement. To bridge this gap, we introduce ComfyGI, a novel approach driven by techniques from genetic improvement that automatically improves image generation workflows without the need for human intervention. This enables image generation with significantly higher quality in terms of the alignment with the given description and the perceived aesthetics. On the performance side, we find that, overall, the images generated with an optimized workflow achieve a median ImageReward score about 50% higher than those from the initial workflow. These already good results are even surpassed in our human evaluation, as the participants preferred the images improved by ComfyGI in around 90% of the cases.
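Such a genetic-improvement loop can be pictured as mutate-evaluate-select over workflow settings; in the sketch below, render_workflow and image_reward are hypothetical stand-ins for a ComfyUI execution call and an ImageReward-style judge, so only the search loop itself is illustrated: <pre>
# Hypothetical (1+1)-style genetic improvement over an image-generation workflow.
# render_workflow() and image_reward() are placeholders, not APIs from the paper.
import copy
import random

def mutate(workflow):
    """Perturb one numeric setting of the workflow dict."""
    child = copy.deepcopy(workflow)
    key = random.choice(list(child["params"]))
    child["params"][key] *= random.uniform(0.8, 1.25)
    return child

def improve(workflow, prompt, render_workflow, image_reward, generations=50):
    best = workflow
    best_score = image_reward(render_workflow(best, prompt), prompt)
    for _ in range(generations):
        cand = mutate(best)
        score = image_reward(render_workflow(cand, prompt), prompt)
        if score > best_score:                  # keep only improving mutations
            best, best_score = cand, score
    return best, best_score

if __name__ == "__main__":
    # Dummy stand-ins so the loop runs end to end.
    wf = {"params": {"cfg_scale": 7.0, "steps": 30.0, "denoise": 1.0}}
    render = lambda w, p: w                                        # pretend the "image" is the workflow
    reward = lambda img, p: -abs(img["params"]["cfg_scale"] - 5.5)  # toy objective
    best, s = improve(wf, "a red bicycle", render, reward)
    print(round(best["params"]["cfg_scale"], 2), round(s, 3))
</pre>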
</p> </div> </dd> <dt> <a name='item66'>[66]</a> <a href ="/abs/2411.14201" title="Abstract" id="2411.14201"> arXiv:2411.14201 </a> [<a href="/pdf/2411.14201" title="Download PDF" id="pdf-2411.14201" aria-labelledby="pdf-2411.14201">pdf</a>, <a href="https://arxiv.org/html/2411.14201v1" title="View HTML" id="html-2411.14201" aria-labelledby="html-2411.14201" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14201" title="Other formats" id="oth-2411.14201" aria-labelledby="oth-2411.14201">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Regional Attention for Shadow Removal </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+H">Hengxing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+M">Mingjia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+X">Xiaojie Guo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Shadow, as a natural consequence of light interacting with objects, plays a crucial role in shaping the aesthetics of an image, which however also impairs the content visibility and overall visual quality. Recent shadow removal approaches employ the mechanism of attention, due to its effectiveness, as a key component. However, they often suffer from two issues including large model size and high computational complexity for practical use. To address these shortcomings, this work devises a lightweight yet accurate shadow removal framework. First, we analyze the characteristics of the shadow removal task to seek the key information required for reconstructing shadow regions and designing a novel regional attention mechanism to effectively capture such information. Then, we customize a Regional Attention Shadow Removal Model (RASM, in short), which leverages non-shadow areas to assist in restoring shadow ones. Unlike existing attention-based models, our regional attention strategy allows each shadow region to interact more rationally with its surrounding non-shadow areas, for seeking the regional contextual correlation between shadow and non-shadow areas. Extensive experiments are conducted to demonstrate that our proposed method delivers superior performance over other state-of-the-art models in terms of accuracy and efficiency, making it appealing for practical applications. </p> </div> </dd> <dt> <a name='item67'>[67]</a> <a href ="/abs/2411.14205" title="Abstract" id="2411.14205"> arXiv:2411.14205 </a> [<a href="/pdf/2411.14205" title="Download PDF" id="pdf-2411.14205" aria-labelledby="pdf-2411.14205">pdf</a>, <a href="https://arxiv.org/html/2411.14205v1" title="View HTML" id="html-2411.14205" aria-labelledby="html-2411.14205" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14205" title="Other formats" id="oth-2411.14205" aria-labelledby="oth-2411.14205">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is this Generated Person Existed in Real-world? 
Fine-grained Detecting and Calibrating Abnormal Human-body </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zeqing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+Q">Qingyang Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wan,+W">Wentao Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+H">Haojie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+K">Keze Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tian,+Y">Yonghong Tian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent improvements in visual synthesis have significantly enhanced the depiction of generated human photos, which are pivotal due to their wide applicability and demand. Nonetheless, the existing text-to-image or text-to-video models often generate low-quality human photos that might differ considerably from real-world body structures, referred to as &#34;abnormal human bodies&#34;. Such abnormalities, typically deemed unacceptable, pose considerable challenges in the detection and repair of them within human photos. These challenges require precise abnormality recognition capabilities, which entail pinpointing both the location and the abnormality type. Intuitively, Visual Language Models (VLMs) that have obtained remarkable performance on various visual tasks are quite suitable for this task. However, their performance on abnormality detection in human photos is quite poor. Hence, it is quite important to highlight this task for the research community. In this paper, we first introduce a simple yet challenging task, i.e., \textbf{F}ine-grained \textbf{H}uman-body \textbf{A}bnormality \textbf{D}etection \textbf{(FHAD)}, and construct two high-quality datasets for evaluation. Then, we propose a meticulous framework, named HumanCalibrator, which identifies and repairs abnormalities in human body structures while preserving the other content. Experiments indicate that our HumanCalibrator achieves high accuracy in abnormality detection and accomplishes an increase in visual comparisons while preserving the other visual content. 
</p> </div> </dd> <dt> <a name='item68'>[68]</a> <a href ="/abs/2411.14208" title="Abstract" id="2411.14208"> arXiv:2411.14208 </a> [<a href="/pdf/2411.14208" title="Download PDF" id="pdf-2411.14208" aria-labelledby="pdf-2411.14208">pdf</a>, <a href="https://arxiv.org/html/2411.14208v1" title="View HTML" id="html-2411.14208" aria-labelledby="html-2411.14208" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14208" title="Other formats" id="oth-2411.14208" aria-labelledby="oth-2411.14208">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Novel View Extrapolation with Video Diffusion Priors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+K">Kunhao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shao,+L">Ling Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+S">Shijian Lu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The field of novel view synthesis has made significant strides thanks to the development of radiance field methods. However, most radiance field techniques are far better at novel view interpolation than novel view extrapolation where the synthesis novel views are far beyond the observed training views. We design ViewExtrapolator, a novel view synthesis approach that leverages the generative priors of Stable Video Diffusion (SVD) for realistic novel view extrapolation. By redesigning the SVD denoising process, ViewExtrapolator refines the artifact-prone views rendered by radiance fields, greatly enhancing the clarity and realism of the synthesized novel views. ViewExtrapolator is a generic novel view extrapolator that can work with different types of 3D rendering such as views rendered from point clouds when only a single view or monocular video is available. Additionally, ViewExtrapolator requires no fine-tuning of SVD, making it both data-efficient and computation-efficient. Extensive experiments demonstrate the superiority of ViewExtrapolator in novel view extrapolation. Project page: \url{<a href="https://kunhao-liu.github.io/ViewExtrapolator/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. </p> </div> </dd> <dt> <a name='item69'>[69]</a> <a href ="/abs/2411.14213" title="Abstract" id="2411.14213"> arXiv:2411.14213 </a> [<a href="/pdf/2411.14213" title="Download PDF" id="pdf-2411.14213" aria-labelledby="pdf-2411.14213">pdf</a>, <a href="https://arxiv.org/html/2411.14213v1" title="View HTML" id="html-2411.14213" aria-labelledby="html-2411.14213" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14213" title="Other formats" id="oth-2411.14213" aria-labelledby="oth-2411.14213">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Generative Outpainting To Enhance the Memorability of Short-Form Videos </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Byju,+A">Alan Byju</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ladwa,+A+S">Aman Sudhindra Ladwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sweeney,+L">Lorin Sweeney</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Smeaton,+A+F">Alan F. 
Smeaton</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> With the expanding use of the short-form video format in advertising, social media, entertainment, education and more, there is a need for such media to both captivate and be remembered. Video memorability indicates to us how likely a video is to be remembered by a viewer who has no emotional or personal connection with its content. This paper presents the results of using generative outpainting to expand the screen size of a short-form video with a view to improving its memorability. Advances in machine learning and deep learning are compared and leveraged to understand how extending the borders of video screen sizes can affect their memorability to viewers. Using quantitative evaluation, we determine the best-performing model for outpainting and the impact of outpainting based on image saliency on video memorability scores. </p> </div> </dd> <dt> <a name='item70'>[70]</a> <a href ="/abs/2411.14219" title="Abstract" id="2411.14219"> arXiv:2411.14219 </a> [<a href="/pdf/2411.14219" title="Download PDF" id="pdf-2411.14219" aria-labelledby="pdf-2411.14219">pdf</a>, <a href="/format/2411.14219" title="Other formats" id="oth-2411.14219" aria-labelledby="oth-2411.14219">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Context-Rich Automated Biodiversity Assessments: Deriving AI-Powered Insights from Camera Trap Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fergus,+P">Paul Fergus</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chalmers,+C">Carl Chalmers</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Matthews,+N">Naomi Matthews</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nixon,+S">Stuart Nixon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Burger,+A">Andre Burger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hartley,+O">Oliver Hartley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sutherland,+C">Chris Sutherland</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lambin,+X">Xavier Lambin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Longmore,+S">Steven Longmore</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wich,+S">Serge Wich</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 Pages, 22 images </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Camera traps offer enormous new opportunities in ecological studies, but current automated image analysis methods often lack the contextual richness needed to support impactful conservation outcomes. Here we present an integrated approach that combines deep learning-based vision and language models to improve ecological reporting using data from camera traps. We introduce a two-stage system: YOLOv10-X to localise and classify species (mammals and birds) within images, and a Phi-3.5-vision-instruct model to read YOLOv10-X bounding box labels to identify species, overcoming its limitations with hard-to-classify objects in images.
Additionally, Phi-3.5 detects broader variables, such as vegetation type and time of day, providing rich ecological and environmental context to YOLO&#39;s species detection output. When combined, this output is processed by the model&#39;s natural language system to answer complex queries, and retrieval-augmented generation (RAG) is employed to enrich responses with external information, like species weight and IUCN status (information that cannot be obtained through direct visual analysis). This information is used to automatically generate structured reports, providing biodiversity stakeholders with deeper insights into, for example, species abundance, distribution, animal behaviour, and habitat selection. Our approach delivers contextually rich narratives that aid in wildlife management decisions. By providing contextually rich insights, our approach not only reduces manual effort but also supports timely decision-making in conservation, potentially shifting efforts from reactive to proactive management. </p> </div> </dd> <dt> <a name='item71'>[71]</a> <a href ="/abs/2411.14228" title="Abstract" id="2411.14228"> arXiv:2411.14228 </a> [<a href="/pdf/2411.14228" title="Download PDF" id="pdf-2411.14228" aria-labelledby="pdf-2411.14228">pdf</a>, <a href="https://arxiv.org/html/2411.14228v1" title="View HTML" id="html-2411.14228" aria-labelledby="html-2411.14228" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14228" title="Other formats" id="oth-2411.14228" aria-labelledby="oth-2411.14228">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FocusLLaVA: A Coarse-to-Fine Approach for Efficient and Effective Visual Token Compression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+Y">Yuke Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+C">Chi Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+S">Shuang Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+B">Bo Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+S">Sheng Guo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Recent advances in Multi-modal Large Language Models have demonstrated that high-resolution image input is crucial for model capabilities, especially for fine-grained tasks. However, high-resolution images lead to a quadratic increase in the number of visual tokens input into LLMs, resulting in significant computational costs. Current works develop visual token compression methods to achieve efficiency improvements, often at the expense of performance. We argue that removing visual redundancy can simultaneously improve both efficiency and performance. We build a coarse-to-fine visual token compression method, with a vision-guided sampler for compressing redundant regions with low information density, and a text-guided sampler for selecting visual tokens that are strongly correlated with the user instructions. With these two modules, the proposed FocusLLaVA achieves improvements in both efficiency and performance. We validate the effectiveness of our approach on a wide range of evaluation datasets.
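A toy version of the text-guided half of this compression, which scores each visual token by its best match to the instruction tokens and keeps the top fraction (the vision-guided sampler and the model's actual scoring are not reproduced): <pre>
# Toy text-guided visual token selection: rank visual tokens by their maximum
# cosine similarity to instruction-token embeddings and keep the top fraction.
import torch
import torch.nn.functional as F

def text_guided_select(visual_tokens, text_tokens, keep_ratio=0.5):
    """
    visual_tokens: (B, Nv, D) projected visual tokens
    text_tokens:   (B, Nt, D) instruction token embeddings (same width D assumed)
    """
    v = F.normalize(visual_tokens, dim=-1)
    t = F.normalize(text_tokens, dim=-1)
    sim = torch.einsum("bvd,btd->bvt", v, t).amax(dim=-1)    # (B, Nv) best match per visual token
    k = max(1, int(visual_tokens.shape[1] * keep_ratio))
    idx = sim.topk(k, dim=1).indices.sort(dim=1).values      # keep the original ordering
    return torch.gather(visual_tokens, 1,
                        idx.unsqueeze(-1).expand(-1, -1, visual_tokens.shape[-1]))

if __name__ == "__main__":
    vis = torch.randn(1, 576, 4096)
    txt = torch.randn(1, 32, 4096)
    print(text_guided_select(vis, txt, 0.5).shape)   # torch.Size([1, 288, 4096])
</pre>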
</p> </div> </dd> <dt> <a name='item72'>[72]</a> <a href ="/abs/2411.14279" title="Abstract" id="2411.14279"> arXiv:2411.14279 </a> [<a href="/pdf/2411.14279" title="Download PDF" id="pdf-2411.14279" aria-labelledby="pdf-2411.14279">pdf</a>, <a href="/format/2411.14279" title="Other formats" id="oth-2411.14279" aria-labelledby="oth-2411.14279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Looking Beyond Text: Reducing Language bias in Large Vision-Language Models via Multimodal Dual-Attention and Soft-Image Guidance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+H">Haozhe Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Si,+S">Shuzheng Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+L">Liang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yichi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+M">Maosong Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+M">Mingjia Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chang,+B">Baobao Chang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 12 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage of LLM and multimodal alignment stage. 2. The learned inference bias due to short-term dependency of text data. Therefore, we propose LACING, a systemic framework designed to address the language bias of LVLMs with muLtimodal duAl-attention meChanIsm (MDA) aNd soft-image Guidance (IFG). Specifically, MDA introduces a parallel dual-attention mechanism that enhances the integration of visual inputs across the model. IFG introduces a learnable soft visual prompt during training and inference to replace visual inputs, designed to compel LVLMs to prioritize text inputs. Then, IFG further proposes a novel decoding strategy using the soft visual prompt to mitigate the model&#39;s over-reliance on adjacent text inputs. Comprehensive experiments demonstrate that our method effectively debiases LVLMs from their language bias, enhancing visual comprehension and reducing hallucinations without requiring additional training resources or data. The code and model are available at [<a href="http://lacing-lvlm.github.io" rel="external noopener nofollow" class="link-external link-http">this http URL</a>](<a href="https://lacing-lvlm.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item73'>[73]</a> <a href ="/abs/2411.14280" title="Abstract" id="2411.14280"> arXiv:2411.14280 </a> [<a href="/pdf/2411.14280" title="Download PDF" id="pdf-2411.14280" aria-labelledby="pdf-2411.14280">pdf</a>, <a href="https://arxiv.org/html/2411.14280v1" title="View HTML" id="html-2411.14280" aria-labelledby="html-2411.14280" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14280" title="Other formats" id="oth-2411.14280" aria-labelledby="oth-2411.14280">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EasyHOI: Unleashing the Power of Large Models for Reconstructing Hand-Object Interactions in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yumeng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Long,+X">Xiaoxiao Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+Z">Zemin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Habermann,+M">Marc Habermann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Theobalt,+C">Christian Theobalt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+Y">Yuexin Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+W">Wenping Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project page: <a href="https://lym29.github.io/EasyHOI-page/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Our work aims to reconstruct hand-object interactions from a single-view image, which is a fundamental but ill-posed task. Unlike methods that reconstruct from videos, multi-view images, or predefined 3D templates, single-view reconstruction faces significant challenges due to inherent ambiguities and occlusions. These challenges are further amplified by the diverse nature of hand poses and the vast variety of object shapes and sizes. Our key insight is that current foundational models for segmentation, inpainting, and 3D reconstruction robustly generalize to in-the-wild images, which could provide strong visual and geometric priors for reconstructing hand-object interactions. Specifically, given a single image, we first design a novel pipeline to estimate the underlying hand pose and object shape using off-the-shelf large models. Furthermore, with the initial reconstruction, we employ a prior-guided optimization scheme, which optimizes hand pose to comply with 3D physical constraints and the 2D input image content. We perform experiments across several datasets and show that our method consistently outperforms baselines and faithfully reconstructs a diverse set of hand-object interactions. 
Here is the link of our project page: <a href="https://lym29.github.io/EasyHOI-page/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item74'>[74]</a> <a href ="/abs/2411.14295" title="Abstract" id="2411.14295"> arXiv:2411.14295 </a> [<a href="/pdf/2411.14295" title="Download PDF" id="pdf-2411.14295" aria-labelledby="pdf-2411.14295">pdf</a>, <a href="https://arxiv.org/html/2411.14295v1" title="View HTML" id="html-2411.14295" aria-labelledby="html-2411.14295" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14295" title="Other formats" id="oth-2411.14295" aria-labelledby="oth-2411.14295">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> StereoCrafter-Zero: Zero-Shot Stereo Video Generation with Noisy Restart </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+J">Jian Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Q">Qian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Z">Zhenyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wonka,+P">Peter Wonka</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Generating high-quality stereo videos that mimic human binocular vision requires maintaining consistent depth perception and temporal coherence across frames. While diffusion models have advanced image and video synthesis, generating high-quality stereo videos remains challenging due to the difficulty of maintaining consistent temporal and spatial coherence between left and right views. We introduce \textit{StereoCrafter-Zero}, a novel framework for zero-shot stereo video generation that leverages video diffusion priors without the need for paired training data. Key innovations include a noisy restart strategy to initialize stereo-aware latents and an iterative refinement process that progressively harmonizes the latent space, addressing issues like temporal flickering and view inconsistencies. Comprehensive evaluations, including quantitative metrics and user studies, demonstrate that \textit{StereoCrafter-Zero} produces high-quality stereo videos with improved depth consistency and temporal smoothness, even when depth estimations are imperfect. Our framework is robust and adaptable across various diffusion models, setting a new benchmark for zero-shot stereo video generation and enabling more immersive visual experiences. Our code can be found in~\url{<a href="https://github.com/shijianjian/StereoCrafter-Zero" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
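For context, the classical baseline for obtaining a second eye view is to convert depth into disparity and shift pixels horizontally, leaving holes at disocclusions; the sketch below shows only that baseline warp, not the diffusion-prior refinement proposed here: <pre>
# Baseline depth-based stereo warp: shift each pixel horizontally by its disparity
# to synthesize a right view; disoccluded pixels remain marked as holes.
import numpy as np

def warp_to_right(left, depth, baseline=0.06, focal=500.0):
    """left: (H, W, 3) image, depth: (H, W) metric depth. Returns (right, hole_mask)."""
    H, W, _ = left.shape
    disparity = (baseline * focal) / np.maximum(depth, 1e-6)   # horizontal shift in pixels
    right = np.zeros_like(left)
    filled = np.zeros((H, W), dtype=bool)
    xs = np.arange(W)
    for y in range(H):
        new_x = np.round(xs - disparity[y]).astype(int)
        valid = (new_x >= 0) & (new_x < W)
        right[y, new_x[valid]] = left[y, xs[valid]]
        filled[y, new_x[valid]] = True
    return right, ~filled                                      # holes would need inpainting

if __name__ == "__main__":
    img = np.random.rand(120, 160, 3)
    dep = np.full((120, 160), 2.0)
    right, holes = warp_to_right(img, dep)
    print(right.shape, round(holes.mean(), 3))
</pre>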
</p> </div> </dd> <dt> <a name='item75'>[75]</a> <a href ="/abs/2411.14347" title="Abstract" id="2411.14347"> arXiv:2411.14347 </a> [<a href="/pdf/2411.14347" title="Download PDF" id="pdf-2411.14347" aria-labelledby="pdf-2411.14347">pdf</a>, <a href="https://arxiv.org/html/2411.14347v1" title="View HTML" id="html-2411.14347" aria-labelledby="html-2411.14347" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14347" title="Other formats" id="oth-2411.14347" aria-labelledby="oth-2411.14347">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ren,+T">Tianhe Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yihao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+Q">Qing Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zeng,+Z">Zhaoyang Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiong,+Y">Yuda Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+W">Wenlong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+Z">Zhengyu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+J">Junyi Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Y">Yuan Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xiaoke Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xingyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+Z">Zhuheng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yuhong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+H">Hongjie Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+H">Han Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+S">Shilong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+H">Hao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+F">Feng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+K">Kent Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+L">Lei Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Technical Report </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extends its input options to support text prompt, visual prompt, and customized prompt. With such flexible prompt options, we develop a universal object prompt to support prompt-free open-world detection, making it possible to detect anything in an image without requiring users to provide any prompt. 
To enhance the model&#39;s core grounding capability, we have constructed a large-scale dataset with over 100 million high-quality grounding samples, referred to as Grounding-100M, for advancing the model&#39;s open-vocabulary detection performance. Pre-training on such a large-scale grounding dataset leads to a foundational object-level representation, which enables DINO-X to integrate multiple perception heads to simultaneously support multiple object perception and understanding tasks, including detection, segmentation, pose estimation, object captioning, object-based QA, etc. Experimental results demonstrate the superior performance of DINO-X. Specifically, the DINO-X Pro model achieves 56.0 AP, 59.8 AP, and 52.4 AP on the COCO, LVIS-minival, and LVIS-val zero-shot object detection benchmarks, respectively. Notably, it scores 63.3 AP and 56.5 AP on the rare classes of LVIS-minival and LVIS-val benchmarks, both improving the previous SOTA performance by 5.8 AP. Such a result underscores its significantly improved capacity for recognizing long-tailed objects. </p> </div> </dd> <dt> <a name='item76'>[76]</a> <a href ="/abs/2411.14384" title="Abstract" id="2411.14384"> arXiv:2411.14384 </a> [<a href="/pdf/2411.14384" title="Download PDF" id="pdf-2411.14384" aria-labelledby="pdf-2411.14384">pdf</a>, <a href="https://arxiv.org/html/2411.14384v1" title="View HTML" id="html-2411.14384" aria-labelledby="html-2411.14384" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14384" title="Other formats" id="oth-2411.14384" aria-labelledby="oth-2411.14384">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cai,+Y">Yuanhao Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+H">He Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+K">Kai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+Y">Yixun Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ren,+M">Mengwei Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luan,+F">Fujun Luan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Q">Qing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+S+Y">Soo Ye Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jianming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zhifei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+Y">Yuqian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Z">Zhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuille,+A">Alan Yuille</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Graphics (cs.GR) </div> <p class='mathjax'> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. 
These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveals the practical values of our method. Our Project page at <a href="https://caiyuanhao1998.github.io/project/DiffusionGS/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> shows the video and interactive generation results. </p> </div> </dd> <dt> <a name='item77'>[77]</a> <a href ="/abs/2411.14401" title="Abstract" id="2411.14401"> arXiv:2411.14401 </a> [<a href="/pdf/2411.14401" title="Download PDF" id="pdf-2411.14401" aria-labelledby="pdf-2411.14401">pdf</a>, <a href="https://arxiv.org/html/2411.14401v1" title="View HTML" id="html-2411.14401" aria-labelledby="html-2411.14401" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14401" title="Other formats" id="oth-2411.14401" aria-labelledby="oth-2411.14401">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Training: Dynamic Token Merging for Zero-Shot Video Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yiming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Z">Zhuokai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zhaorun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ding,+Z">Zenghui Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+X">Xianjun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+Y">Yining Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advancements in multimodal large language models (MLLMs) have opened new avenues for video understanding. However, achieving high fidelity in zero-shot video tasks remains challenging. Traditional video processing methods rely heavily on fine-tuning to capture nuanced spatial-temporal details, which incurs significant data and computation costs. In contrast, training-free approaches, though efficient, often lack robustness in preserving context-rich features across complex video content. To this end, we propose DYTO, a novel dynamic token merging framework for zero-shot video understanding that adaptively optimizes token efficiency while preserving crucial scene details. DYTO integrates a hierarchical frame selection and a bipartite token merging strategy to dynamically cluster key frames and selectively compress token sequences, striking a balance between computational efficiency with semantic richness. 
Extensive experiments across multiple benchmarks demonstrate the effectiveness of DYTO, achieving superior performance compared to both fine-tuned and training-free methods and setting a new state-of-the-art for zero-shot video understanding. </p> </div> </dd> <dt> <a name='item78'>[78]</a> <a href ="/abs/2411.14402" title="Abstract" id="2411.14402"> arXiv:2411.14402 </a> [<a href="/pdf/2411.14402" title="Download PDF" id="pdf-2411.14402" aria-labelledby="pdf-2411.14402">pdf</a>, <a href="/format/2411.14402" title="Other formats" id="oth-2411.14402" aria-labelledby="oth-2411.14402">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multimodal Autoregressive Pre-training of Large Vision Encoders </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fini,+E">Enrico Fini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shukor,+M">Mustafa Shukor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+X">Xiujun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dufter,+P">Philipp Dufter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Klein,+M">Michal Klein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Haldimann,+D">David Haldimann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Aitharaju,+S">Sai Aitharaju</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=da+Costa,+V+G+T">Victor Guilherme Turrisi da Costa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=B%C3%A9thune,+L">Louis Béthune</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gan,+Z">Zhe Gan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Toshev,+A+T">Alexander T Toshev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Eichner,+M">Marcin Eichner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nabi,+M">Moin Nabi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+Y">Yinfei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Susskind,+J+M">Joshua M. Susskind</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=El-Nouby,+A">Alaaeldin El-Nouby</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://github.com/apple/ml-aim" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification.
Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings. </p> </div> </dd> <dt> <a name='item79'>[79]</a> <a href ="/abs/2411.14423" title="Abstract" id="2411.14423"> arXiv:2411.14423 </a> [<a href="/pdf/2411.14423" title="Download PDF" id="pdf-2411.14423" aria-labelledby="pdf-2411.14423">pdf</a>, <a href="https://arxiv.org/html/2411.14423v1" title="View HTML" id="html-2411.14423" aria-labelledby="html-2411.14423" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14423" title="Other formats" id="oth-2411.14423" aria-labelledby="oth-2411.14423">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unleashing the Potential of Multi-modal Foundation Models and Video Diffusion for 4D Dynamic Physical Scene Simulation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zhuoman Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ye,+W">Weicai Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luximon,+Y">Yan Luximon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wan,+P">Pengfei Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+D">Di Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Homepage: <a href="https://zhuomanliu.github.io/PhysFlow/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Realistic simulation of dynamic scenes requires accurately capturing diverse material properties and modeling complex object interactions grounded in physical principles. However, existing methods are constrained to basic material types with limited predictable parameters, making them insufficient to represent the complexity of real-world materials. We introduce a novel approach that leverages multi-modal foundation models and video diffusion to achieve enhanced 4D dynamic scene simulation. Our method utilizes multi-modal models to identify material types and initialize material parameters through image queries, while simultaneously inferring 3D Gaussian splats for detailed scene representation. We further refine these material parameters using video diffusion with a differentiable Material Point Method (MPM) and optical flow guidance rather than render loss or Score Distillation Sampling (SDS) loss. This integrated framework enables accurate prediction and realistic simulation of dynamic interactions in real-world scenarios, advancing both accuracy and flexibility in physics-based simulations. 
</p> </div> </dd> <dt> <a name='item80'>[80]</a> <a href ="/abs/2411.14429" title="Abstract" id="2411.14429"> arXiv:2411.14429 </a> [<a href="/pdf/2411.14429" title="Download PDF" id="pdf-2411.14429" aria-labelledby="pdf-2411.14429">pdf</a>, <a href="https://arxiv.org/html/2411.14429v1" title="View HTML" id="html-2411.14429" aria-labelledby="html-2411.14429" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14429" title="Other formats" id="oth-2411.14429" aria-labelledby="oth-2411.14429">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting the Integration of Convolution and Attention for Vision Backbone </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+L">Lei Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+X">Xinjiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+W">Wayne Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lau,+R+W+H">Rynson W. H. Lau</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Convolutions (Convs) and multi-head self-attentions (MHSAs) are typically considered alternatives to each other for building vision backbones. Although some works try to integrate both, they apply the two operators simultaneously at the finest pixel granularity. With Convs responsible for per-pixel feature extraction already, the question is whether we still need to include the heavy MHSAs at such a fine-grained level. In fact, this is the root cause of the scalability issue w.r.t. the input resolution for vision transformers. To address this important problem, we propose in this work to use MHSAs and Convs in parallel \textbf{at different granularity levels} instead. Specifically, in each layer, we use two different ways to represent an image: a fine-grained regular grid and a coarse-grained set of semantic slots. We apply different operations to these two representations: Convs to the grid for local features, and MHSAs to the slots for global features. A pair of fully differentiable soft clustering and dispatching modules is introduced to bridge the grid and set representations, thus enabling local-global fusion. Through extensive experiments on various vision tasks, we empirically verify the potential of the proposed integration scheme, named \textit{GLMix}: by offloading the burden of fine-grained features to light-weight Convs, it is sufficient to use MHSAs in a few (e.g., 64) semantic slots to match the performance of recent state-of-the-art backbones, while being more efficient. Our visualization results also demonstrate that the soft clustering module produces a meaningful semantic grouping effect with only IN1k classification supervision, which may induce better interpretability and inspire new weakly-supervised semantic segmentation approaches. Code will be available at \url{<a href="https://github.com/rayleizhu/GLMix" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
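As an aside, the granularity split described above (Convs on the pixel grid, MHSA on a handful of soft-clustered semantic slots) can be sketched in a few lines of PyTorch. This is a hedged illustration, not the released GLMix code; the module layout, the depthwise convolution, and the softmax normalization are our assumptions.
<pre><code>
import torch
import torch.nn as nn

class GridSlotMixer(nn.Module):
    # Hedged sketch: a light-weight Conv branch on the fine-grained grid, MHSA on a
    # small set of semantic slots, and soft clustering / dispatching to bridge the two.
    def __init__(self, dim=256, num_slots=64, heads=8):
        super().__init__()
        self.local = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)   # local features on the grid
        self.slot_logits = nn.Conv2d(dim, num_slots, 1)              # per-pixel slot assignment scores
        self.mhsa = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                                            # x: (B, C, H, W) grid features
        B, C, H, W = x.shape
        local = self.local(x)
        assign = self.slot_logits(x).flatten(2).softmax(dim=-1)              # (B, K, HW) soft clustering
        slots = torch.einsum('bkn,bcn->bkc', assign, x.flatten(2))           # pool pixels into K slots
        slots, _ = self.mhsa(slots, slots, slots)                            # global reasoning on the slot set
        back = torch.einsum('bkn,bkc->bcn', assign, slots).view(B, C, H, W)  # dispatch slots back to the grid
        return local + back                                                  # local-global fusion

# y = GridSlotMixer()(torch.randn(2, 256, 14, 14))   # shape preserved: (2, 256, 14, 14)
</code></pre>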
</p> </div> </dd> <dt> <a name='item81'>[81]</a> <a href ="/abs/2411.14430" title="Abstract" id="2411.14430"> arXiv:2411.14430 </a> [<a href="/pdf/2411.14430" title="Download PDF" id="pdf-2411.14430" aria-labelledby="pdf-2411.14430">pdf</a>, <a href="https://arxiv.org/html/2411.14430v1" title="View HTML" id="html-2411.14430" aria-labelledby="html-2411.14430" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14430" title="Other formats" id="oth-2411.14430" aria-labelledby="oth-2411.14430">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stable Flow: Vital Layers for Training-Free Image Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Avrahami,+O">Omri Avrahami</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Patashnik,+O">Or Patashnik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fried,+O">Ohad Fried</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nemchinov,+E">Egor Nemchinov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Aberman,+K">Kfir Aberman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lischinski,+D">Dani Lischinski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cohen-Or,+D">Daniel Cohen-Or</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project page is available at <a href="https://omriavrahami.com/stable-flow" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Graphics (cs.GR); Machine Learning (cs.LG) </div> <p class='mathjax'> Diffusion models have revolutionized the field of content synthesis and editing. Recent models have replaced the traditional UNet architecture with the Diffusion Transformer (DiT), and employed flow-matching for improved training and sampling. However, they exhibit limited generation diversity. In this work, we leverage this limitation to perform consistent image edits via selective injection of attention features. The main challenge is that, unlike the UNet-based models, DiT lacks a coarse-to-fine synthesis structure, making it unclear in which layers to perform the injection. Therefore, we propose an automatic method to identify &#34;vital layers&#34; within DiT, crucial for image formation, and demonstrate how these layers facilitate a range of controlled stable edits, from non-rigid modifications to object addition, using the same mechanism. Next, to enable real-image editing, we introduce an improved image inversion method for flow models. Finally, we evaluate our approach through qualitative and quantitative comparisons, along with a user study, and demonstrate its effectiveness across multiple applications. 
The project page is available at <a href="https://omriavrahami.com/stable-flow" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item82'>[82]</a> <a href ="/abs/2411.14432" title="Abstract" id="2411.14432"> arXiv:2411.14432 </a> [<a href="/pdf/2411.14432" title="Download PDF" id="pdf-2411.14432" aria-labelledby="pdf-2411.14432">pdf</a>, <a href="https://arxiv.org/html/2411.14432v1" title="View HTML" id="html-2411.14432" aria-labelledby="html-2411.14432" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14432" title="Other formats" id="oth-2411.14432" aria-labelledby="oth-2411.14432">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+Y">Yuhao Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zuyan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+H">Hai-Long Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+J">Jingkang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+W">Winston Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rao,+Y">Yongming Rao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Ziwei Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate enhanced capabilities and reliability by reasoning more, evolving from Chain-of-Thought prompting to product-level solutions like OpenAI o1. Despite various efforts to improve LLM reasoning, high-quality long-chain reasoning data and optimized training pipelines still remain inadequately explored in vision-language tasks. In this paper, we present Insight-V, an early effort to 1) scalably produce long and robust reasoning data for complex multi-modal tasks, and 2) an effective training pipeline to enhance the reasoning capabilities of multi-modal large language models (MLLMs). Specifically, to create long and structured reasoning data without human labor, we design a two-step pipeline with a progressive strategy to generate sufficiently long and diverse reasoning paths and a multi-granularity assessment method to ensure data quality. We observe that directly supervising MLLMs with such long and complex reasoning data will not yield ideal reasoning ability. To tackle this problem, we design a multi-agent system consisting of a reasoning agent dedicated to performing long-chain reasoning and a summary agent trained to judge and summarize reasoning results. We further incorporate an iterative DPO algorithm to enhance the reasoning agent&#39;s generation stability and quality. Based on the popular LLaVA-NeXT model and our stronger base MLLM, we demonstrate significant performance gains across challenging multi-modal benchmarks requiring visual reasoning. Benefiting from our multi-agent system, Insight-V can also easily maintain or improve performance on perception-focused multi-modal tasks. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 28 of 28 entries)</h3> <dt> <a name='item83'>[83]</a> <a href ="/abs/2411.13602" title="Abstract" id="2411.13602"> arXiv:2411.13602 </a> (cross-list from eess.IV) [<a href="/pdf/2411.13602" title="Download PDF" id="pdf-2411.13602" aria-labelledby="pdf-2411.13602">pdf</a>, <a href="/format/2411.13602" title="Other formats" id="oth-2411.13602" aria-labelledby="oth-2411.13602">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ding,+Z">Zhengyao Ding</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hu,+Y">Yujian Hu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+Y">Youyao Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhao,+C">Chengchen Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+Z">Ziyu Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mao,+Y">Yiheng Mao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+H">Haitao Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+Q">Qian Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+J">Jing Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+Y">Yue Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+M">Mengjia Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+L">Longbo Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chu,+X">Xuesen Chu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pan,+W">Weichao Pan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+Z">Ziyi Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+H">Hongkun Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+T">Ting Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+Z">Zhengxing Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. 
CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and the MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals), and the findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level. </p> </div> </dd> <dt> <a name='item84'>[84]</a> <a href ="/abs/2411.13615" title="Abstract" id="2411.13615"> arXiv:2411.13615 </a> (cross-list from q-fin.ST) [<a href="/pdf/2411.13615" title="Download PDF" id="pdf-2411.13615" aria-labelledby="pdf-2411.13615">pdf</a>, <a href="/format/2411.13615" title="Other formats" id="oth-2411.13615" aria-labelledby="oth-2411.13615">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Deep Learning Approach to Predict the Fall [of Price] of Cryptocurrency Long Before its Actual Fall </div> <div class='list-authors'><a href="https://arxiv.org/search/q-fin?searchtype=author&amp;query=Meem,+A+T">Anika Tahsin Meem</a>, <a href="https://arxiv.org/search/q-fin?searchtype=author&amp;query=Akter,+M+S">Mst. Shapna Akter</a>, <a href="https://arxiv.org/search/q-fin?searchtype=author&amp;query=Depto,+D+S">Deponker Sarker Depto</a>, <a href="https://arxiv.org/search/q-fin?searchtype=author&amp;query=Mahdy,+M">M.R.C. Mahdy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Statistical Finance (q-fin.ST)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> In modern times, the cryptocurrency market is one of the world&#39;s most rapidly rising financial markets. The cryptocurrency market is regarded as more volatile and illiquid than traditional markets such as equities, foreign exchange, and commodities. This risk creates uncertainty among investors. The purpose of this research is to predict the magnitude of the risk factor of the cryptocurrency market. The risk factor is also called volatility. Our approach will assist people who invest in the cryptocurrency market by overcoming the problems and difficulties they experience. Our approach starts with calculating the risk factor of the cryptocurrency market from the existing parameters. For twenty elements of the cryptocurrency market, the risk factor has been predicted using different machine learning algorithms such as CNN, LSTM, BiLSTM, and GRU. All of the models have been applied to the calculated risk factor parameter. A new model has been developed to predict better than the existing models. 
Our proposed model yields a highest RMSE of 1.3229 and a lowest RMSE of 0.0089, whereas for the existing models the highest RMSE was 14.5092 and the lowest was 0.02769. The proposed model therefore performs much better than the existing models, with proper generalization. Using our approach, it will be easier for investors to trade in complicated and challenging financial assets like Bitcoin, Ethereum, and Dogecoin. </p> </div> </dd> <dt> <a name='item85'>[85]</a> <a href ="/abs/2411.13677" title="Abstract" id="2411.13677"> arXiv:2411.13677 </a> (cross-list from cs.RO) [<a href="/pdf/2411.13677" title="Download PDF" id="pdf-2411.13677" aria-labelledby="pdf-2411.13677">pdf</a>, <a href="https://arxiv.org/html/2411.13677v1" title="View HTML" id="html-2411.13677" aria-labelledby="html-2411.13677" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13677" title="Other formats" id="oth-2411.13677" aria-labelledby="oth-2411.13677">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bimanual Dexterity for Complex Tasks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shaw,+K">Kenneth Shaw</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yulong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+J">Jiahui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumar,+M">Mohan Kumar Srirama</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+R">Ray Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiong,+H">Haoyu Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mendonca,+R">Russell Mendonca</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pathak,+D">Deepak Pathak</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In CoRL 2024. Website at <a href="https://bidex-teleop.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> To train generalist robot policies, machine learning methods often require a substantial amount of expert human teleoperation data. An ideal robot for humans collecting data is one that closely mimics them: bimanual arms and dexterous hands. However, creating such a bimanual teleoperation system with over 50 DoF is a significant challenge. To address this, we introduce Bidex, an extremely dexterous, low-cost, low-latency and portable bimanual dexterous teleoperation system which relies on motion capture gloves and teacher arms. We compare Bidex to a Vision Pro teleoperation system and a SteamVR system and find Bidex to produce better quality data for more complex tasks at a faster rate. Additionally, we show Bidex operating a mobile bimanual robot for in-the-wild tasks. The robot hands (5k USD) and teleoperation system (7k USD) are readily reproducible and can be used on many robot arms including two xArms (16k USD). 
Website at <a href="https://bidex-teleop.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item86'>[86]</a> <a href ="/abs/2411.13754" title="Abstract" id="2411.13754"> arXiv:2411.13754 </a> (cross-list from cs.LG) [<a href="/pdf/2411.13754" title="Download PDF" id="pdf-2411.13754" aria-labelledby="pdf-2411.13754">pdf</a>, <a href="https://arxiv.org/html/2411.13754v1" title="View HTML" id="html-2411.13754" aria-labelledby="html-2411.13754" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13754" title="Other formats" id="oth-2411.13754" aria-labelledby="oth-2411.13754">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Reason Iteratively and Parallelly for Complex Visual Reasoning Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jaiswal,+S">Shantanu Jaiswal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Roy,+D">Debaditya Roy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fernando,+B">Basura Fernando</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tan,+C">Cheston Tan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 camera ready; source code to be released at: <a href="https://github.com/shantanuj/IPRM_Iterative_and_Parallel_Reasoning_Mechanism" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Complex visual reasoning and question answering (VQA) is a challenging task that requires compositional multi-step processing and higher-level reasoning capabilities beyond the immediate recognition and localization of objects and events. Here, we introduce a fully neural Iterative and Parallel Reasoning Mechanism (IPRM) that combines two distinct forms of computation -- iterative and parallel -- to better address complex VQA scenarios. Specifically, IPRM&#39;s &#34;iterative&#34; computation facilitates compositional step-by-step reasoning for scenarios wherein individual operations need to be computed, stored, and recalled dynamically (e.g. when computing the query &#34;determine the color of pen to the left of the child in red t-shirt sitting at the white table&#34;). Meanwhile, its &#34;parallel&#34; computation allows for the simultaneous exploration of different reasoning paths and benefits more robust and efficient execution of operations that are mutually independent (e.g. when counting individual colors for the query: &#34;determine the maximum occurring color amongst all t-shirts&#34;). We design IPRM as a lightweight and fully-differentiable neural module that can be conveniently applied to both transformer and non-transformer vision-language backbones. It notably outperforms prior task-specific methods and transformer-based attention modules across various image and video VQA benchmarks testing distinct complex reasoning capabilities such as compositional spatiotemporal reasoning (AGQA), situational reasoning (STAR), multi-hop reasoning generalization (CLEVR-Humans) and causal event linking (CLEVRER-Humans). 
Further, IPRM&#39;s internal computations can be visualized across reasoning steps, aiding interpretability and diagnosis of its errors. </p> </div> </dd> <dt> <a name='item87'>[87]</a> <a href ="/abs/2411.13855" title="Abstract" id="2411.13855"> arXiv:2411.13855 </a> (cross-list from eess.IV) [<a href="/pdf/2411.13855" title="Download PDF" id="pdf-2411.13855" aria-labelledby="pdf-2411.13855">pdf</a>, <a href="https://arxiv.org/html/2411.13855v1" title="View HTML" id="html-2411.13855" aria-labelledby="html-2411.13855" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13855" title="Other formats" id="oth-2411.13855" aria-labelledby="oth-2411.13855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Multimodal Approach to The Detection and Classification of Skin Diseases </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yang,+A">Allen Yang</a> (1), <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yang,+E">Edward Yang</a> (2), ((1) Mission San Jose High School, Fremont, CA, (2) Yale University, New Haven, CT)</div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> According to PBS, nearly one-third of Americans lack access to primary care services, and another forty percent delay care to avoid medical costs. As a result, many diseases are left undiagnosed and untreated, even if the disease shows many physical symptoms on the skin. With the rise of AI, self-diagnosis and improved disease recognition have become more promising than ever; despite this, existing methods suffer from a lack of large-scale patient databases and outdated methods of study, resulting in studies being limited to only a few diseases or modalities. This study incorporates readily available and easily accessible patient information via image and text for skin disease classification on a new dataset of 26 skin disease types that includes both skin disease images (37K) and associated patient narratives. Using this dataset, baselines for various image models were established that outperform existing methods. Initially, the ResNet-50 model was only able to achieve an accuracy of 70%, but after various optimization techniques, the accuracy was improved to 80%. In addition, this study proposes a novel fine-tuning strategy for sequence classification Large Language Models (LLMs), Chain of Options, which breaks down a complex reasoning task into intermediate steps at training time instead of inference. With Chain of Options and preliminary disease recommendations from the image model, this method achieves a state-of-the-art accuracy of 91% in diagnosing patient skin disease given just an image of the afflicted area as well as a patient description of the symptoms (such as itchiness or dizziness). This research enables earlier diagnosis of skin diseases, and clinicians can work with deep learning models to give a more accurate diagnosis, improving quality of life and saving lives. 
</p> </div> </dd> <dt> <a name='item88'>[88]</a> <a href ="/abs/2411.13862" title="Abstract" id="2411.13862"> arXiv:2411.13862 </a> (cross-list from eess.IV) [<a href="/pdf/2411.13862" title="Download PDF" id="pdf-2411.13862" aria-labelledby="pdf-2411.13862">pdf</a>, <a href="https://arxiv.org/html/2411.13862v1" title="View HTML" id="html-2411.13862" aria-labelledby="html-2411.13862" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13862" title="Other formats" id="oth-2411.13862" aria-labelledby="oth-2411.13862">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Image Compression Using Novel View Synthesis Priors </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Peng,+L">Luyuan Peng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chitre,+M">Mandar Chitre</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Vishnu,+H">Hari Vishnu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Too,+Y+M">Yuen Min Too</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kalyan,+B">Bharath Kalyan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mishra,+R">Rajat Mishra</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tan,+S+P">Soo Pieng Tan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint submitted to Ocean Engineering </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Robotics (cs.RO) </div> <p class='mathjax'> Real-time visual feedback is essential for tetherless control of remotely operated vehicles, particularly during inspection and manipulation tasks. Though acoustic communication is the preferred choice for medium-range communication underwater, its limited bandwidth renders it impractical to transmit images or videos in real-time. To address this, we propose a model-based image compression technique that leverages prior mission information. Our approach employs trained machine-learning based novel view synthesis models, and uses gradient descent optimization to refine latent representations to help generate compressible differences between camera images and rendered images. We evaluate the proposed compression technique using a dataset from an artificial ocean basin, demonstrating superior compression ratios and image quality over existing techniques. Moreover, our method exhibits robustness to introduction of new objects within the scene, highlighting its potential for advancing tetherless remotely operated vehicle operations. 
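The latent-refinement step described above admits a short, hedged sketch: gradient descent fits a pretrained novel-view-synthesis prior to the camera frame so that only a small residual needs to be transmitted. The renderer, latent shape, loss, and optimizer settings below are placeholders, not the authors' pipeline.
<pre><code>
import torch
import torch.nn.functional as F

def compress_with_nvs_prior(camera_img, render_fn, latent_init, steps=200, lr=1e-2):
    # Refine a latent code so that a pretrained, differentiable novel-view-synthesis
    # renderer reproduces the camera frame, then keep only the low-energy residual.
    # `render_fn`, the latent shape, the L1 loss and the step count are assumptions;
    # the quantized latent plus an entropy-coded residual would then be transmitted.
    z = latent_init.clone().requires_grad_(True)
    opt = torch.optim.Adam([z], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        loss = F.l1_loss(render_fn(z), camera_img)
        loss.backward()
        opt.step()
    residual = (camera_img - render_fn(z)).detach()   # compressible difference image
    return z.detach(), residual
</code></pre>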
</p> </div> </dd> <dt> <a name='item89'>[89]</a> <a href ="/abs/2411.14006" title="Abstract" id="2411.14006"> arXiv:2411.14006 </a> (cross-list from cs.DS) [<a href="/pdf/2411.14006" title="Download PDF" id="pdf-2411.14006" aria-labelledby="pdf-2411.14006">pdf</a>, <a href="https://arxiv.org/html/2411.14006v1" title="View HTML" id="html-2411.14006" aria-labelledby="html-2411.14006" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14006" title="Other formats" id="oth-2411.14006" aria-labelledby="oth-2411.14006">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Experimental comparison of graph-based approximate nearest neighbor search algorithms on edge devices </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ganbarov,+A">Ali Ganbarov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+J">Jicheng Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Le-Tuan,+A">Anh Le-Tuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hauswirth,+M">Manfred Hauswirth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Le-Phuoc,+D">Danh Le-Phuoc</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Data Structures and Algorithms (cs.DS)</span>; Hardware Architecture (cs.AR); Computer Vision and Pattern Recognition (cs.CV); Distributed, Parallel, and Cluster Computing (cs.DC); Performance (cs.PF) </div> <p class='mathjax'> In this paper, we present an experimental comparison of various graph-based approximate nearest neighbor (ANN) search algorithms deployed on edge devices for real-time nearest neighbor search applications, such as smart city infrastructure and autonomous vehicles. To the best of our knowledge, this specific comparative analysis has not been previously conducted. While existing research has explored graph-based ANN algorithms, it has often been limited to single-threaded implementations on standard commodity hardware. Our study leverages the full computational and storage capabilities of edge devices, incorporating additional metrics such as insertion and deletion latency of new vectors and power consumption. This comprehensive evaluation aims to provide valuable insights into the performance and suitability of these algorithms for edge-based real-time tracking systems enhanced by nearest-neighbor search algorithms. 
</p> </div> </dd> <dt> <a name='item90'>[90]</a> <a href ="/abs/2411.14017" title="Abstract" id="2411.14017"> arXiv:2411.14017 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14017" title="Download PDF" id="pdf-2411.14017" aria-labelledby="pdf-2411.14017">pdf</a>, <a href="https://arxiv.org/html/2411.14017v1" title="View HTML" id="html-2411.14017" aria-labelledby="html-2411.14017" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14017" title="Other formats" id="oth-2411.14017" aria-labelledby="oth-2411.14017">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automatic brain tumor segmentation in 2D intra-operative ultrasound images using MRI tumor annotations </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Faanes,+M">Mathilde Faanes</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Helland,+R+H">Ragnhild Holden Helland</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Solheim,+O">Ole Solheim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Reinertsen,+I">Ingerid Reinertsen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 8 figures, submitted to International Journal of Computer Assisted Radiology and Surgery </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Automatic segmentation of brain tumors in intra-operative ultrasound (iUS) images could facilitate localization of tumor tissue during resection surgery. The lack of large annotated datasets limits current models' performance. In this paper, we investigate the use of tumor annotations in pre-operative MRI images, which are more easily accessible than annotations in iUS images, for training deep learning models for iUS brain tumor segmentation. We used 180 annotated pre-operative MRI images with corresponding unannotated iUS images, and 29 annotated iUS images. Image registration was performed to transfer the MRI annotations to the corresponding iUS images before training models with the nnU-Net framework. To validate the use of MRI labels, the models were compared to a model trained with only US annotated tumors, and a model with both US and MRI annotated tumors. In addition, the results were compared to annotations validated by an expert neurosurgeon on the same test set to measure inter-observer variability. The results showed similar performance for a model trained with only MRI annotated tumors, compared to a model trained with only US annotated tumors. The model trained using both modalities obtained slightly better results with an average Dice score of 0.62, where external expert annotations achieved a score of 0.67. The results also showed that the deep learning models were comparable to expert annotation for larger tumors (&gt; 200 mm²), but performed clearly worse for smaller tumors (&lt; 200 mm²). This shows that MRI tumor annotations can be used as a substitute for US tumor annotations to train a deep learning model for automatic brain tumor segmentation in intra-operative ultrasound images. Small tumors are a limitation for the current models and will be the focus of future work. 
The main models are available here: <a href="https://github.com/mathildefaanes/us_brain_tumor_segmentation" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item91'>[91]</a> <a href ="/abs/2411.14049" title="Abstract" id="2411.14049"> arXiv:2411.14049 </a> (cross-list from cs.LG) [<a href="/pdf/2411.14049" title="Download PDF" id="pdf-2411.14049" aria-labelledby="pdf-2411.14049">pdf</a>, <a href="https://arxiv.org/html/2411.14049v1" title="View HTML" id="html-2411.14049" aria-labelledby="html-2411.14049" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14049" title="Other formats" id="oth-2411.14049" aria-labelledby="oth-2411.14049">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Out-Of-Distribution Detection with Diversification (Provably) </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yao,+H">Haiyun Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Han,+Z">Zongbo Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fu,+H">Huazhu Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Peng,+X">Xi Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+Q">Qinghua Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+C">Changqing Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Out-of-distribution (OOD) detection is crucial for ensuring reliable deployment of machine learning models. Recent advancements focus on utilizing easily accessible auxiliary outliers (e.g., data from the web or other datasets) in training. However, we experimentally reveal that these methods still struggle to generalize their detection capabilities to unknown OOD data, due to the limited diversity of the auxiliary outliers collected. Therefore, we thoroughly examine this problem from the generalization perspective and demonstrate that a more diverse set of auxiliary outliers is essential for enhancing the detection capabilities. However, in practice, it is difficult and costly to collect sufficiently diverse auxiliary outlier data. Therefore, we propose a simple yet practical approach with a theoretical guarantee, termed Diversity-induced Mixup for OOD detection (diverseMix), which enhances the diversity of auxiliary outlier set for training in an efficient way. Extensive experiments show that diverseMix achieves superior performance on commonly used and recent challenging large-scale benchmarks, which further confirm the importance of the diversity of auxiliary outliers. 
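As a rough illustration of the idea (not the exact diverseMix formulation or its theoretical weighting), auxiliary outliers can be mixed pairwise with Beta-sampled coefficients and fed into an outlier-exposure style objective:
<pre><code>
import torch
import torch.nn.functional as F

def mix_auxiliary_outliers(x_out: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    # Mix random pairs of auxiliary outliers to enlarge the effective outlier
    # diversity seen during training (alpha and the pairing scheme are assumptions).
    lam = torch.distributions.Beta(alpha, alpha).sample((x_out.size(0), 1, 1, 1)).to(x_out.device)
    perm = torch.randperm(x_out.size(0), device=x_out.device)
    return lam * x_out + (1.0 - lam) * x_out[perm]

def oe_style_loss(logits_in, labels_in, logits_out, weight=0.5):
    # Standard outlier-exposure objective: cross-entropy on in-distribution samples
    # plus a term pushing mixed outliers toward a uniform prediction.
    ce = F.cross_entropy(logits_in, labels_in)
    to_uniform = -logits_out.log_softmax(dim=-1).mean()
    return ce + weight * to_uniform
</code></pre>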
</p> </div> </dd> <dt> <a name='item92'>[92]</a> <a href ="/abs/2411.14078" title="Abstract" id="2411.14078"> arXiv:2411.14078 </a> (cross-list from astro-ph.IM) [<a href="/pdf/2411.14078" title="Download PDF" id="pdf-2411.14078" aria-labelledby="pdf-2411.14078">pdf</a>, <a href="https://arxiv.org/html/2411.14078v1" title="View HTML" id="html-2411.14078" aria-labelledby="html-2411.14078" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14078" title="Other formats" id="oth-2411.14078" aria-labelledby="oth-2411.14078">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-supervised learning for radio-astronomy source classification: a benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Cecconello,+T">Thomas Cecconello</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Riggi,+S">Simone Riggi</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Becciano,+U">Ugo Becciano</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Vitello,+F">Fabio Vitello</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Hopkins,+A+M">Andrew M. Hopkins</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Vizzari,+G">Giuseppe Vizzari</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Spampinato,+C">Concetto Spampinato</a>, <a href="https://arxiv.org/search/astro-ph?searchtype=author&amp;query=Palazzo,+S">Simone Palazzo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Instrumentation and Methods for Astrophysics (astro-ph.IM)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The upcoming Square Kilometer Array (SKA) telescope marks a significant step forward in radio astronomy, presenting new opportunities and challenges for data analysis. Traditional visual models pretrained on optical photography images may not perform optimally on radio interferometry images, which have distinct visual characteristics. <br>Self-Supervised Learning (SSL) offers a promising approach to address this issue, leveraging the abundant unlabeled data in radio astronomy to train neural networks that learn useful representations from radio images. This study explores the application of SSL to radio astronomy, comparing the performance of SSL-trained models with that of traditional models pretrained on natural images, evaluating the importance of data curation for SSL, and assessing the potential benefits of self-supervision to different domain-specific radio astronomy datasets. <br>Our results indicate that SSL-trained models achieve significant improvements over the baseline in several downstream tasks, especially in the linear evaluation setting; when the entire backbone is fine-tuned, the benefits of SSL are less evident, but SSL-trained models still outperform those pretrained on natural images. These findings suggest that SSL can play a valuable role in efficiently enhancing the analysis of radio astronomical data. 
The trained models and code are available at: \url{<a href="https://github.com/dr4thmos/solo-learn-radio" rel="external noopener nofollow" class="link-external link-https">this https URL</a>} </p> </div> </dd> <dt> <a name='item93'>[93]</a> <a href ="/abs/2411.14092" title="Abstract" id="2411.14092"> arXiv:2411.14092 </a> (cross-list from cs.RO) [<a href="/pdf/2411.14092" title="Download PDF" id="pdf-2411.14092" aria-labelledby="pdf-2411.14092">pdf</a>, <a href="https://arxiv.org/html/2411.14092v1" title="View HTML" id="html-2411.14092" aria-labelledby="html-2411.14092" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14092" title="Other formats" id="oth-2411.14092" aria-labelledby="oth-2411.14092">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MetaCropFollow: Few-Shot Adaptation with Meta-Learning for Under-Canopy Navigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Woehrle,+T">Thomas Woehrle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sivakumar,+A+N">Arun N. Sivakumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Uppalapati,+N">Naveen Uppalapati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chowdhary,+G">Girish Chowdhary</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Autonomous under-canopy navigation faces additional challenges compared to over-canopy settings, for example, the tight spacing between crop rows, degraded GPS accuracy, and excessive clutter. Keypoint-based visual navigation has been shown to perform well in these conditions; however, the differences between agricultural environments in terms of lighting, season, soil, and crop type mean that a domain shift will likely be encountered at some point during robot deployment. In this paper, we explore the use of Meta-Learning to overcome this domain shift using a minimal amount of data. We train a base-learner that can quickly adapt to new conditions, enabling more robust navigation in low-data regimes. 
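The abstract does not spell out the specific meta-learning recipe, so the following is only a generic MAML-style sketch under the assumption of a keypoint model adapted on a few labelled frames from the new field; the model, loss, and data layout are placeholders.
<pre><code>
import torch

def maml_adapt_and_eval(model, loss_fn, support, query, inner_lr=1e-2, inner_steps=1):
    # Generic MAML-style inner loop (an assumption, not the paper's exact setup):
    # adapt a copy of the parameters on a few labelled frames from the new field
    # (support), then return the loss on held-out frames (query) for the meta update.
    (x_s, y_s), (x_q, y_q) = support, query
    fast = dict(model.named_parameters())
    for _ in range(inner_steps):
        inner_loss = loss_fn(torch.func.functional_call(model, fast, (x_s,)), y_s)
        grads = torch.autograd.grad(inner_loss, list(fast.values()), create_graph=True)
        fast = {name: p - inner_lr * g for (name, p), g in zip(fast.items(), grads)}
    return loss_fn(torch.func.functional_call(model, fast, (x_q,)), y_q)

# meta_loss = maml_adapt_and_eval(net, torch.nn.functional.mse_loss, task_support, task_query)
# meta_loss.backward(); meta_optimizer.step()   # outer update accumulated over many tasks
</code></pre>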
</p> </div> </dd> <dt> <a name='item94'>[94]</a> <a href ="/abs/2411.14133" title="Abstract" id="2411.14133"> arXiv:2411.14133 </a> (cross-list from cs.LG) [<a href="/pdf/2411.14133" title="Download PDF" id="pdf-2411.14133" aria-labelledby="pdf-2411.14133">pdf</a>, <a href="https://arxiv.org/html/2411.14133v1" title="View HTML" id="html-2411.14133" aria-labelledby="html-2411.14133" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14133" title="Other formats" id="oth-2411.14133" aria-labelledby="oth-2411.14133">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GASP: Efficient Black-Box Generation of Adversarial Suffixes for Jailbreaking LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Basani,+A+R">Advik Raj Basani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+X">Xiao Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages, 9 tables, 13 figures; under review at CVPR &#39;25 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Language Models (LLMs) have shown impressive proficiency across a range of natural language processing tasks yet remain vulnerable to adversarial prompts, known as jailbreak attacks, carefully designed to elicit harmful responses from LLMs. Traditional methods rely on manual heuristics, which suffer from limited generalizability. While being automatic, optimization-based attacks often produce unnatural jailbreak prompts that are easy to detect by safety filters or require high computational overhead due to discrete token optimization. Witnessing the limitations of existing jailbreak methods, we introduce Generative Adversarial Suffix Prompter (GASP), a novel framework that combines human-readable prompt generation with Latent Bayesian Optimization (LBO) to improve adversarial suffix creation in a fully black-box setting. GASP leverages LBO to craft adversarial suffixes by efficiently exploring continuous embedding spaces, gradually optimizing the model to improve attack efficacy while balancing prompt coherence through a targeted iterative refinement procedure. Our experiments show that GASP can generate natural jailbreak prompts, significantly improving attack success rates, reducing training times, and accelerating inference speed, thus making it an efficient and scalable solution for red-teaming LLMs. 
</p> </div> </dd> <dt> <a name='item95'>[95]</a> <a href ="/abs/2411.14141" title="Abstract" id="2411.14141"> arXiv:2411.14141 </a> (cross-list from math.NA) [<a href="/pdf/2411.14141" title="Download PDF" id="pdf-2411.14141" aria-labelledby="pdf-2411.14141">pdf</a>, <a href="https://arxiv.org/html/2411.14141v1" title="View HTML" id="html-2411.14141" aria-labelledby="html-2411.14141" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14141" title="Other formats" id="oth-2411.14141" aria-labelledby="oth-2411.14141">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Differentiable SVD based on Moore-Penrose Pseudoinverse for Inverse Imaging Problems </div> <div class='list-authors'><a href="https://arxiv.org/search/math?searchtype=author&amp;query=Zhang,+Y">Yinghao Zhang</a>, <a href="https://arxiv.org/search/math?searchtype=author&amp;query=Hu,+Y">Yue Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Numerical Analysis (math.NA)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Low-rank regularization-based deep unrolling networks have achieved remarkable success in various inverse imaging problems (IIPs). However, the singular value decomposition (SVD) is non-differentiable when duplicated singular values occur, leading to severe numerical instability during training. In this paper, we propose a differentiable SVD based on the Moore-Penrose pseudoinverse to address this issue. To the best of our knowledge, this is the first work to provide a comprehensive analysis of the differentiability of the trivial SVD. Specifically, we show that the non-differentiability of SVD is essentially due to an underdetermined system of linear equations arising in the derivation process. We utilize the Moore-Penrose pseudoinverse to solve the system, thereby proposing a differentiable SVD. A numerical stability analysis in the context of IIPs is provided. Experimental results in color image compressed sensing and dynamic MRI reconstruction show that our proposed differentiable SVD can effectively address the numerical instability issue while ensuring computational precision. Code is available at <a href="https://github.com/yhao-z/SVD-inv" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
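To make the instability concrete: the standard SVD backward pass involves coefficients of the form 1/(s_j^2 - s_i^2), which are undefined when singular values are duplicated. The snippet below is only an illustration of the pseudoinverse idea applied to that scalar term; the paper's actual derivation solves the full underdetermined linear system.
<pre><code>
import torch

def safe_svd_coefficients(s: torch.Tensor) -> torch.Tensor:
    # The standard SVD backward pass uses F_ij = 1 / (s_j**2 - s_i**2) for i != j,
    # which is undefined (and numerically explosive) when singular values are
    # duplicated. Replacing the plain reciprocal with the scalar Moore-Penrose
    # pseudoinverse (1/d if d != 0, else 0) illustrates the spirit of the fix.
    d = s[None, :] ** 2 - s[:, None] ** 2
    coeff = torch.where(d != 0, d.reciprocal(), torch.zeros_like(d))
    return coeff.fill_diagonal_(0.0)

# Duplicated singular values no longer yield inf/NaN coefficients:
print(safe_svd_coefficients(torch.tensor([3.0, 2.0, 2.0])))
</code></pre>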
</p> </div> </dd> <dt> <a name='item96'>[96]</a> <a href ="/abs/2411.14163" title="Abstract" id="2411.14163"> arXiv:2411.14163 </a> (cross-list from cs.LO) [<a href="/pdf/2411.14163" title="Download PDF" id="pdf-2411.14163" aria-labelledby="pdf-2411.14163">pdf</a>, <a href="/format/2411.14163" title="Other formats" id="oth-2411.14163" aria-labelledby="oth-2411.14163">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Creating a Formally Verified Neural Network for Autonomous Navigation: An Experience Report </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bukhari,+S+A+A">Syed Ali Asadullah Bukhari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Flinkow,+T">Thomas Flinkow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Inkarbekov,+M">Medet Inkarbekov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pearlmutter,+B+A">Barak A. Pearlmutter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Monahan,+R">Rosemary Monahan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In Proceedings FMAS2024, <a href="https://arxiv.org/abs/2411.13215" data-arxiv-id="2411.13215" class="link-https">arXiv:2411.13215</a> </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> EPTCS 411, 2024, pp. 178-190 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Logic in Computer Science (cs.LO)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> The increased reliance of self-driving vehicles on neural networks opens up the challenge of their verification. In this paper we present an experience report, describing a case study which we undertook to explore the design and training of a neural network on a custom dataset for vision-based autonomous navigation. We are particularly interested in the use of machine learning with differentiable logics to obtain networks satisfying basic safety properties by design, guaranteeing the behaviour of the neural network after training. We motivate the choice of a suitable neural network verifier for our purposes and report our observations on the use of neural network verifiers for self-driving systems. 
</p> </div> </dd> <dt> <a name='item97'>[97]</a> <a href ="/abs/2411.14184" title="Abstract" id="2411.14184"> arXiv:2411.14184 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14184" title="Download PDF" id="pdf-2411.14184" aria-labelledby="pdf-2411.14184">pdf</a>, <a href="https://arxiv.org/html/2411.14184v1" title="View HTML" id="html-2411.14184" aria-labelledby="html-2411.14184" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14184" title="Other formats" id="oth-2411.14184" aria-labelledby="oth-2411.14184">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Deep Learning Approach for Enhancing Oral Squamous Cell Carcinoma with LIME Explainable AI Technique </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Islam,+S">Samiha Islam</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mahmud,+M+Z">Muhammad Zawad Mahmud</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Alve,+S+R">Shahran Rahman Alve</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chowdhury,+M+M+U">Md. Mejbah Ullah Chowdhury</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review at an IEEE conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The goal of the present study is to analyze the application of deep learning models to augment the diagnostic performance for oral squamous cell carcinoma (OSCC) in a longitudinal cohort study using the Histopathological Imaging Database for oral cancer analysis. The dataset consisted of 5192 images (2435 Normal and 2511 OSCC, roughly 52% OSCC), which were allocated to training, testing, and validation sets using a stratified splitting technique, so that each split preserved the nearly even class distribution and performance was validated on a held-out set with an almost equal number of samples per class. We selected four deep-learning architectures for evaluation in the present study: ResNet101, DenseNet121, VGG16, and EfficientNetB3. EfficientNetB3 was found to be the best, with an accuracy of 98.33% and an F1 score of 0.9844, while requiring remarkably less computing power than the other models. The next best was DenseNet121, with 90.24% accuracy and an F1 score of 90.45%. Moreover, we employed the Local Interpretable Model-agnostic Explanations (LIME) method to clarify why EfficientNetB3 made certain predictions, improving the explainability and trustworthiness of the results. This work provides evidence for the potentially superior diagnosis of OSCC enabled by the EfficientNetB3 model together with explainable AI techniques such as LIME, and it lays important groundwork to build on towards clinical usage. 
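For reference, applying LIME to a single image with a trained classifier typically looks like the sketch below; the classifier_fn wrapper and the random input tile are placeholders standing in for the trained EfficientNetB3 and a real histopathology image.
<pre><code>
import numpy as np
from lime import lime_image
from skimage.segmentation import mark_boundaries

def classifier_fn(batch: np.ndarray) -> np.ndarray:
    # Placeholder: wrap the trained EfficientNetB3 here and return an (N, 2)
    # array of [Normal, OSCC] probabilities for a batch of (H, W, 3) images.
    return np.tile([0.5, 0.5], (len(batch), 1))

image = np.random.rand(224, 224, 3)      # stand-in for a histopathology tile

explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(
    image, classifier_fn,
    top_labels=2, hide_color=0, num_samples=1000)   # LIME perturbs superpixels and fits a local surrogate
temp, mask = explanation.get_image_and_mask(
    explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)
overlay = mark_boundaries(temp, mask)               # highlights the regions driving the predicted class
</code></pre>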
</p> </div> </dd> <dt> <a name='item98'>[98]</a> <a href ="/abs/2411.14202" title="Abstract" id="2411.14202"> arXiv:2411.14202 </a> (cross-list from cs.LG) [<a href="/pdf/2411.14202" title="Download PDF" id="pdf-2411.14202" aria-labelledby="pdf-2411.14202">pdf</a>, <a href="https://arxiv.org/html/2411.14202v1" title="View HTML" id="html-2411.14202" aria-labelledby="html-2411.14202" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14202" title="Other formats" id="oth-2411.14202" aria-labelledby="oth-2411.14202">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revised Regularization for Efficient Continual Learning through Correlation-Based Parameter Update in Bayesian Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Palit,+S">Sanchar Palit</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Banerjee,+B">Biplab Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chaudhuri,+S">Subhasis Chaudhuri</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> at ICVGIP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> We propose a Bayesian neural network-based continual learning algorithm using Variational Inference, aiming to overcome several drawbacks of existing methods. Specifically, in continual learning scenarios, storing network parameters at each step to retain knowledge poses challenges. This is compounded by the crucial need to mitigate catastrophic forgetting, particularly given the limited access to past datasets, which complicates maintaining correspondence between network parameters and datasets across all sessions. Current methods using Variational Inference with KL divergence risk catastrophic forgetting during uncertain node updates and coupled disruptions in certain nodes. To address these challenges, we propose the following strategies. To reduce the storage of the dense layer parameters, we propose a parameter distribution learning method that significantly reduces the storage requirements. In the continual learning framework employing variational inference, our study introduces a regularization term that specifically targets the dynamics and population of the mean and variance of the parameters. This term aims to retain the benefits of KL divergence while addressing related challenges. To ensure proper correspondence between network parameters and the data, our method introduces an importance-weighted Evidence Lower Bound term to capture data and parameter correlations. This enables storage of common and distinctive parameter hyperspace bases. The proposed method partitions the parameter space into common and distinctive subspaces, with conditions for effective backward and forward knowledge transfer, elucidating the network-parameter dataset correspondence. The experimental results demonstrate the effectiveness of our method across diverse datasets and various combinations of sequential datasets, yielding superior performance compared to existing approaches. 
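<br /><br /> As a minimal sketch of the kind of regularization discussed above (not the paper's exact term), the closed-form KL divergence between diagonal Gaussian posteriors can be used to keep the new task's parameter distribution close to the one learned on earlier tasks:
<pre><code class="language-python">
# Sketch: KL between diagonal Gaussian posteriors as a continual-learning
# regularizer; the weighting `beta` and the dict layout are assumptions.
import torch

def gaussian_kl(mu_new, logvar_new, mu_old, logvar_old):
    # KL( N(mu_new, var_new) || N(mu_old, var_old) ), elementwise then summed.
    var_new = logvar_new.exp()
    var_old = logvar_old.exp()
    kl = 0.5 * (logvar_old - logvar_new
                + (var_new + (mu_new - mu_old).pow(2)) / var_old
                - 1.0)
    return kl.sum()

def continual_loss(nll, posterior, old_posterior, beta=1e-3):
    # nll: data-fit term for the new task (e.g. the likelihood part of the ELBO).
    reg = gaussian_kl(posterior["mu"], posterior["logvar"],
                      old_posterior["mu"], old_posterior["logvar"])
    return nll + beta * reg
</code></pre>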
</p> </div> </dd> <dt> <a name='item99'>[99]</a> <a href ="/abs/2411.14243" title="Abstract" id="2411.14243"> arXiv:2411.14243 </a> (cross-list from cs.CR) [<a href="/pdf/2411.14243" title="Download PDF" id="pdf-2411.14243" aria-labelledby="pdf-2411.14243">pdf</a>, <a href="https://arxiv.org/html/2411.14243v1" title="View HTML" id="html-2411.14243" aria-labelledby="html-2411.14243" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14243" title="Other formats" id="oth-2411.14243" aria-labelledby="oth-2411.14243">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AnywhereDoor: Multi-Target Backdoor Attacks on Object Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+J">Jialin Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shan,+J">Junjie Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Z">Ziqi Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chow,+K">Ka-Ho Chow</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> As object detection becomes integral to many safety-critical applications, understanding its vulnerabilities is essential. Backdoor attacks, in particular, pose a significant threat by implanting hidden backdoor in a victim model, which adversaries can later exploit to trigger malicious behaviors during inference. However, current backdoor techniques are limited to static scenarios where attackers must define a malicious objective before training, locking the attack into a predetermined action without inference-time adaptability. Given the expressive output space in object detection, including object existence detection, bounding box estimation, and object classification, the feasibility of implanting a backdoor that provides inference-time control with a high degree of freedom remains unexplored. This paper introduces AnywhereDoor, a flexible backdoor attack tailored for object detection. Once implanted, AnywhereDoor enables adversaries to specify different attack types (object vanishing, fabrication, or misclassification) and configurations (untargeted or targeted with specific classes) to dynamically control detection behavior. This flexibility is achieved through three key innovations: (i) objective disentanglement to support a broader range of attack combinations well beyond what existing methods allow; (ii) trigger mosaicking to ensure backdoor activations are robust, even against those object detectors that extract localized regions from the input image for recognition; and (iii) strategic batching to address object-level data imbalances that otherwise hinders a balanced manipulation. Extensive experiments demonstrate that AnywhereDoor provides attackers with a high degree of control, achieving an attack success rate improvement of nearly 80% compared to adaptations of existing methods for such flexible control. 
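<br /><br /> One plausible, simplified reading of the trigger-mosaicking idea (an illustration only, not the authors' implementation) is to tile a small trigger patch across the image so that localized crops still contain a trigger:
<pre><code class="language-python">
# Simplified illustration of tiling a backdoor trigger across an image so that
# localized crops still contain it; stride and blending weight are arbitrary.
import numpy as np

def mosaic_trigger(image, trigger, stride=64, alpha=0.2):
    # image: (H, W, 3) float array in [0, 1]; trigger: (h, w, 3) float array.
    poisoned = image.copy()
    h, w = trigger.shape[:2]
    for y in range(0, image.shape[0] - h + 1, stride):
        for x in range(0, image.shape[1] - w + 1, stride):
            patch = poisoned[y:y + h, x:x + w]
            poisoned[y:y + h, x:x + w] = (1 - alpha) * patch + alpha * trigger
    return poisoned
</code></pre>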
</p> </div> </dd> <dt> <a name='item100'>[100]</a> <a href ="/abs/2411.14250" title="Abstract" id="2411.14250"> arXiv:2411.14250 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14250" title="Download PDF" id="pdf-2411.14250" aria-labelledby="pdf-2411.14250">pdf</a>, <a href="https://arxiv.org/html/2411.14250v1" title="View HTML" id="html-2411.14250" aria-labelledby="html-2411.14250" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14250" title="Other formats" id="oth-2411.14250" aria-labelledby="oth-2411.14250">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CP-UNet: Contour-based Probabilistic Model for Medical Ultrasound Images Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+R">Ruiguo Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+Y">Yiyang Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tian,+Y">Yuan Tian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+Z">Zhiqiang Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+X">Xuewei Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gao,+J">Jie Gao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 4 figures, 2 tables;For icassp2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Deep learning-based segmentation methods are widely utilized for detecting lesions in ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, which guides the segmentation network to enhance its focus on contour during decoding. We design a novel down-sampling module to enable the contour probability distribution modeling and encoding stages to acquire global-local features. Furthermore, the Gaussian Mixture Model utilizes optimized features to model the contour distribution, capturing the uncertainty of lesion boundaries. Extensive experiments with several state-of-the-art deep learning segmentation methods on three ultrasound image datasets show that our method performs better on breast and thyroid lesions segmentation. 
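<br /><br /> A rough sketch of modelling a contour distribution with a Gaussian mixture (using scikit-learn; the feature map and contour mask are assumed inputs, and this is not the CP-UNet code):
<pre><code class="language-python">
# Sketch: fit a GMM to feature vectors sampled along a predicted lesion contour
# and score every pixel by its log-likelihood under that contour model.
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_contour_gmm(features, contour_mask, n_components=3):
    # features: (H, W, C) array; contour_mask: (H, W) boolean array of contour pixels.
    contour_feats = features[contour_mask]            # (N, C)
    gmm = GaussianMixture(n_components=n_components, covariance_type="full")
    gmm.fit(contour_feats)
    scores = gmm.score_samples(features.reshape(-1, features.shape[-1]))
    return scores.reshape(features.shape[:2])         # higher = closer to contour statistics
</code></pre>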
</p> </div> </dd> <dt> <a name='item101'>[101]</a> <a href ="/abs/2411.14269" title="Abstract" id="2411.14269"> arXiv:2411.14269 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14269" title="Download PDF" id="pdf-2411.14269" aria-labelledby="pdf-2411.14269">pdf</a>, <a href="https://arxiv.org/html/2411.14269v1" title="View HTML" id="html-2411.14269" aria-labelledby="html-2411.14269" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14269" title="Other formats" id="oth-2411.14269" aria-labelledby="oth-2411.14269">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Guided MRI Reconstruction via Schr\&#34;odinger Bridge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Y">Yue Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhou,+T">Tian Zhou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cui,+Z">Zhuo-xu Cui</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+B">Bingsheng Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zheng,+H">Hairong Zheng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liang,+D">Dong Liang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhu,+Y">Yanjie Zhu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Signal Processing (eess.SP) </div> <p class='mathjax'> Magnetic Resonance Imaging (MRI) is a multi-contrast imaging technique in which different contrast images share similar structural information. However, conventional diffusion models struggle to effectively leverage this structural similarity. Recently, the Schrödinger Bridge (SB), a nonlinear extension of the diffusion model, has been proposed to establish diffusion paths between any two distributions, allowing the incorporation of guided priors. This study proposes an SB-based, multi-contrast image-guided reconstruction framework that establishes a diffusion bridge between the guiding and target image distributions. By using the guiding image along with data consistency during sampling, the target image is reconstructed more accurately. To better address structural differences between images, we introduce an inversion strategy from the field of image editing, termed $\mathbf{I}^2$SB-inversion. Experiments on paired T1 and T2-FLAIR datasets demonstrate that $\mathbf{I}^2$SB-inversion achieves an acceleration factor of up to 14.4 and outperforms existing methods in terms of both reconstruction accuracy and stability.
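<br /><br /> The data-consistency step mentioned above is a standard ingredient of MRI reconstruction and, independently of the Schrödinger-bridge sampler itself, can be sketched as replacing the reconstruction's k-space values at sampled locations with the measured data:
<pre><code class="language-python">
# Sketch of a k-space data-consistency step for an undersampled 2D acquisition.
import numpy as np

def data_consistency(image, measured_kspace, mask):
    # image: (H, W) complex array; measured_kspace and mask: (H, W) arrays,
    # mask is 1 where k-space was actually sampled and 0 elsewhere.
    kspace = np.fft.fft2(image)
    kspace = mask * measured_kspace + (1 - mask) * kspace
    return np.fft.ifft2(kspace)
</code></pre>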
</p> </div> </dd> <dt> <a name='item102'>[102]</a> <a href ="/abs/2411.14322" title="Abstract" id="2411.14322"> arXiv:2411.14322 </a> (cross-list from cs.RO) [<a href="/pdf/2411.14322" title="Download PDF" id="pdf-2411.14322" aria-labelledby="pdf-2411.14322">pdf</a>, <a href="https://arxiv.org/html/2411.14322v1" title="View HTML" id="html-2411.14322" aria-labelledby="html-2411.14322" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14322" title="Other formats" id="oth-2411.14322" aria-labelledby="oth-2411.14322">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SplatR : Experience Goal Visual Rearrangement with 3D Gaussian Splatting and Dense Feature Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=S,+A+P">Arjun P S</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Melnik,+A">Andrew Melnik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nandi,+G+C">Gora Chand Nandi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Experience Goal Visual Rearrangement task stands as a foundational challenge within Embodied AI, requiring an agent to construct a robust world model that accurately captures the goal state. The agent uses this world model to restore a shuffled scene to its original configuration, making an accurate representation of the world essential for successfully completing the task. In this work, we present a novel framework that leverages on 3D Gaussian Splatting as a 3D scene representation for experience goal visual rearrangement task. Recent advances in volumetric scene representation like 3D Gaussian Splatting, offer fast rendering of high quality and photo-realistic novel views. Our approach enables the agent to have consistent views of the current and the goal setting of the rearrangement task, which enables the agent to directly compare the goal state and the shuffled state of the world in image space. To compare these views, we propose to use a dense feature matching method with visual features extracted from a foundation model, leveraging its advantages of a more universal feature representation, which facilitates robustness, and generalization. 
We validate our approach on the AI2-THOR rearrangement challenge benchmark and demonstrate improvements over the current state of the art methods </p> </div> </dd> <dt> <a name='item103'>[103]</a> <a href ="/abs/2411.14345" title="Abstract" id="2411.14345"> arXiv:2411.14345 </a> (cross-list from cs.LG) [<a href="/pdf/2411.14345" title="Download PDF" id="pdf-2411.14345" aria-labelledby="pdf-2411.14345">pdf</a>, <a href="https://arxiv.org/html/2411.14345v1" title="View HTML" id="html-2411.14345" aria-labelledby="html-2411.14345" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14345" title="Other formats" id="oth-2411.14345" aria-labelledby="oth-2411.14345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Layer Pruning with Consensus: A Triple-Win Solution </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mugnaini,+L+G">Leandro Giusti Mugnaini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Duarte,+C+T">Carolina Tavares Duarte</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Costa,+A+H+R">Anna H. Reali Costa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jordao,+A">Artur Jordao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Layer pruning offers a promising alternative to standard structured pruning, effectively reducing computational costs, latency, and memory footprint. While notable layer-pruning approaches aim to detect unimportant layers for removal, they often rely on single criteria that may not fully capture the complex, underlying properties of layers. We propose a novel approach that combines multiple similarity metrics into a single expressive measure of low-importance layers, called the Consensus criterion. Our technique delivers a triple-win solution: low accuracy drop, high-performance improvement, and increased robustness to adversarial attacks. With up to 78.80% FLOPs reduction and performance on par with state-of-the-art methods across different benchmarks, our approach reduces energy consumption and carbon emissions by up to 66.99% and 68.75%, respectively. Additionally, it avoids shortcut learning and improves robustness by up to 4 percentage points under various adversarial attacks. Overall, the Consensus criterion demonstrates its effectiveness in creating robust, efficient, and environmentally friendly pruned models. 
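<br /><br /> A hypothetical sketch of a consensus-style layer score (the exact metrics and aggregation used in the paper may differ): each candidate layer is scored by how similar its input and output representations are under two metrics, and the scores are averaged, so layers that barely change their input rank as low-importance pruning candidates:
<pre><code class="language-python">
# Sketch: combine linear CKA and cosine similarity into one layer score.
import numpy as np

def linear_cka(x, y):
    # x, y: (n_samples, dim) activations, column-centered before comparison.
    x = x - x.mean(axis=0)
    y = y - y.mean(axis=0)
    num = np.linalg.norm(y.T @ x, "fro") ** 2
    den = np.linalg.norm(x.T @ x, "fro") * np.linalg.norm(y.T @ y, "fro")
    return num / den

def cosine_sim(x, y):
    x = x.reshape(-1)
    y = y.reshape(-1)
    return float(x @ y / (np.linalg.norm(x) * np.linalg.norm(y) + 1e-12))

def consensus_score(layer_in, layer_out):
    # High score = the layer changes its input little = candidate for removal.
    return 0.5 * (linear_cka(layer_in, layer_out) + cosine_sim(layer_in, layer_out))
</code></pre>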
</p> </div> </dd> <dt> <a name='item104'>[104]</a> <a href ="/abs/2411.14353" title="Abstract" id="2411.14353"> arXiv:2411.14353 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14353" title="Download PDF" id="pdf-2411.14353" aria-labelledby="pdf-2411.14353">pdf</a>, <a href="/format/2411.14353" title="Other formats" id="oth-2411.14353" aria-labelledby="oth-2411.14353">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Medical Image Segmentation with Deep Learning and Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+H">Houze Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhou,+T">Tong Zhou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xiang,+Y">Yanlin Xiang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Shen,+A">Aoran Shen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hu,+J">Jiacheng Hu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Du,+J">Junliang Du</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Medical image segmentation is crucial for accurate clinical diagnoses, yet it faces challenges such as low contrast between lesions and normal tissues, unclear boundaries, and high variability across patients. Deep learning has improved segmentation accuracy and efficiency, but it still relies heavily on expert annotations and struggles with the complexities of medical images. The small size of medical image datasets and the high cost of data acquisition further limit the performance of segmentation networks. Diffusion models, with their iterative denoising process, offer a promising alternative for better detail capture in segmentation. However, they face difficulties in accurately segmenting small targets and maintaining the precision of boundary details. This article discusses the importance of medical image segmentation, the limitations of current deep learning approaches, and the potential of diffusion models to address these challenges. 
</p> </div> </dd> <dt> <a name='item105'>[105]</a> <a href ="/abs/2411.14354" title="Abstract" id="2411.14354"> arXiv:2411.14354 </a> (cross-list from cs.LG) [<a href="/pdf/2411.14354" title="Download PDF" id="pdf-2411.14354" aria-labelledby="pdf-2411.14354">pdf</a>, <a href="https://arxiv.org/html/2411.14354v1" title="View HTML" id="html-2411.14354" aria-labelledby="html-2411.14354" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14354" title="Other formats" id="oth-2411.14354" aria-labelledby="oth-2411.14354">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Contrasting local and global modeling with machine learning and satellite data: A case study estimating tree canopy height in African savannas </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rolf,+E">Esther Rolf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gordon,+L">Lucia Gordon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tambe,+M">Milind Tambe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Davies,+A">Andrew Davies</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 31 pages; 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> While advances in machine learning with satellite imagery (SatML) are facilitating environmental monitoring at a global scale, developing SatML models that are accurate and useful for local regions remains critical to understanding and acting on an ever-changing planet. As increasing attention and resources are being devoted to training SatML models with global data, it is important to understand when improvements in global models will make it easier to train or fine-tune models that are accurate in specific regions. To explore this question, we contrast local and global training paradigms for SatML through a case study of tree canopy height (TCH) mapping in the Karingani Game Reserve, Mozambique. We find that recent advances in global TCH mapping do not necessarily translate to better local modeling abilities in our study region. Specifically, small models trained only with locally-collected data outperform published global TCH maps, and even outperform globally pretrained models that we fine-tune using local data. Analyzing these results further, we identify specific points of conflict and synergy between local and global modeling paradigms that can inform future research toward aligning local and global performance objectives in geospatial machine learning. 
</p> </div> </dd> <dt> <a name='item106'>[106]</a> <a href ="/abs/2411.14358" title="Abstract" id="2411.14358"> arXiv:2411.14358 </a> (cross-list from cs.RO) [<a href="/pdf/2411.14358" title="Download PDF" id="pdf-2411.14358" aria-labelledby="pdf-2411.14358">pdf</a>, <a href="https://arxiv.org/html/2411.14358v1" title="View HTML" id="html-2411.14358" aria-labelledby="html-2411.14358" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14358" title="Other formats" id="oth-2411.14358" aria-labelledby="oth-2411.14358">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InCrowd-VI: A Realistic Visual-Inertial Dataset for Evaluating SLAM in Indoor Pedestrian-Rich Spaces for Human Navigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bamdad,+M">Marziyeh Bamdad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hutter,+H">Hans-Peter Hutter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Darvishy,+A">Alireza Darvishy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Simultaneous localization and mapping (SLAM) techniques can support navigation for visually impaired people, but the development of robust SLAM solutions for crowded spaces is limited by the lack of realistic datasets. To address this, we introduce InCrowd-VI, a novel visual-inertial dataset specifically designed for human navigation in indoor pedestrian-rich environments. Recorded using Meta Aria Project glasses, it captures realistic scenarios without environmental control. InCrowd-VI features 58 sequences totaling 5 km of trajectory length and 1.5 hours of recording time, including RGB, stereo images, and IMU measurements. The dataset captures important challenges such as pedestrian occlusions, varying crowd densities, complex layouts, and lighting changes. Ground-truth trajectories, accurate to approximately 2 cm, are provided in the dataset, originating from the Meta Aria project machine perception SLAM service. In addition, a semi-dense 3D point cloud of scenes is provided for each sequence. The evaluation of state-of-the-art visual odometry (VO) and SLAM algorithms on InCrowd-VI revealed severe performance limitations in these realistic scenarios, demonstrating the need and value of the new dataset to advance SLAM research for visually impaired navigation in complex indoor environments.
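<br /><br /> Evaluations of VO and SLAM systems on such a dataset typically report the absolute trajectory error after rigid alignment to the ground truth; a compact sketch (timestamp association between estimate and ground truth is assumed to be done beforehand):
<pre><code class="language-python">
# Sketch: absolute trajectory error (RMSE) after SVD-based rigid alignment.
import numpy as np

def ate_rmse(est, gt):
    # est, gt: (N, 3) arrays of associated camera positions.
    mu_e, mu_g = est.mean(axis=0), gt.mean(axis=0)
    ec, gc = est - mu_e, gt - mu_g
    u, _, vt = np.linalg.svd(gc.T @ ec / len(est))
    s = np.eye(3)
    s[2, 2] = np.sign(np.linalg.det(u @ vt))   # keep a proper rotation (det = +1)
    rot = u @ s @ vt
    aligned = est @ rot.T + (mu_g - rot @ mu_e)
    return float(np.sqrt(np.mean(np.sum((aligned - gt) ** 2, axis=1))))
</code></pre>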
</p> </div> </dd> <dt> <a name='item107'>[107]</a> <a href ="/abs/2411.14374" title="Abstract" id="2411.14374"> arXiv:2411.14374 </a> (cross-list from cs.LO) [<a href="/pdf/2411.14374" title="Download PDF" id="pdf-2411.14374" aria-labelledby="pdf-2411.14374">pdf</a>, <a href="/format/2411.14374" title="Other formats" id="oth-2411.14374" aria-labelledby="oth-2411.14374">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Using Formal Models, Safety Shields and Certified Control to Validate AI-Based Train Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gruteser,+J">Jan Gruteser</a> (Heinrich Heine University Düsseldorf), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ro%C3%9Fbach,+J">Jan Roßbach</a> (Heinrich Heine University Düsseldorf), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vu,+F">Fabian Vu</a> (Heinrich Heine University Düsseldorf), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Leuschel,+M">Michael Leuschel</a> (Heinrich Heine University Düsseldorf)</div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In Proceedings FMAS2024, <a href="https://arxiv.org/abs/2411.13215" data-arxiv-id="2411.13215" class="link-https">arXiv:2411.13215</a> </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> EPTCS 411, 2024, pp. 151-159 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Logic in Computer Science (cs.LO)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The certification of autonomous systems is an important concern in science and industry. The KI-LOK project explores new methods for certifying and safely integrating AI components into autonomous trains. We pursued a two-layered approach: (1) ensuring the safety of the steering system by formal analysis using the B method, and (2) improving the reliability of the perception system with a runtime certificate checker. This work links both strategies within a demonstrator that runs simulations on the formal model, controlled by the real AI output and the real certificate checker. The demonstrator is integrated into the validation tool ProB. This enables runtime monitoring, runtime verification, and statistical validation of formal safety properties using a formal B model. Consequently, one can detect and analyse potential vulnerabilities and weaknesses of the AI and the certificate checker. We apply these techniques to a signal detection case study and present our findings.
</p> </div> </dd> <dt> <a name='item108'>[108]</a> <a href ="/abs/2411.14385" title="Abstract" id="2411.14385"> arXiv:2411.14385 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14385" title="Download PDF" id="pdf-2411.14385" aria-labelledby="pdf-2411.14385">pdf</a>, <a href="https://arxiv.org/html/2411.14385v1" title="View HTML" id="html-2411.14385" aria-labelledby="html-2411.14385" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14385" title="Other formats" id="oth-2411.14385" aria-labelledby="oth-2411.14385">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Diagnostic Precision in Gastric Bleeding through Automated Lesion Segmentation: A Deep DuS-KFCM Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+X">Xian-Xian Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+M">Mingkun Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wei,+Y">Yuanyuan Wei</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qin,+H">Huafeng Qin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Song,+Q">Qun Song</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Fong,+S">Simon Fong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tien,+F">Feng Tien</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Luo,+W">Wei Luo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gao,+J">Juntao Gao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+Z">Zhihua Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Siu,+S">Shirley Siu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Timely and precise classification and segmentation of gastric bleeding in endoscopic imagery are pivotal for the rapid diagnosis and intervention of gastric complications, which is critical in life-saving medical procedures. Traditional methods grapple with the challenge posed by the indistinguishable intensity values of bleeding tissues adjacent to other gastric structures. Our study seeks to revolutionize this domain by introducing a novel deep learning model, the Dual Spatial Kernelized Constrained Fuzzy C-Means (Deep DuS-KFCM) clustering algorithm. This Hybrid Neuro-Fuzzy system synergizes Neural Networks with Fuzzy Logic to offer a highly precise and efficient identification of bleeding regions. Implementing a two-fold coarse-to-fine strategy for segmentation, this model initially employs the Spatial Kernelized Fuzzy C-Means (SKFCM) algorithm enhanced with spatial intensity profiles and subsequently harnesses the state-of-the-art DeepLabv3+ with ResNet50 architecture to refine the segmentation output. Through extensive experiments across mainstream gastric bleeding and red spots datasets, our Deep DuS-KFCM model demonstrated unprecedented accuracy rates of 87.95%, coupled with a specificity of 96.33%, outperforming contemporary segmentation methods. The findings underscore the model&#39;s robustness against noise and its outstanding segmentation capabilities, particularly for identifying subtle bleeding symptoms, thereby presenting a significant leap forward in medical image processing. 
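<br /><br /> The fuzzy C-means core underlying (S)KFCM can be sketched in a few lines of NumPy; the spatial and kernelized terms the paper adds are omitted here:
<pre><code class="language-python">
# Sketch: standard fuzzy C-means membership and centroid updates.
import numpy as np

def fcm(x, n_clusters=2, m=2.0, n_iter=50, eps=1e-9):
    # x: (N, D) pixel or intensity features.
    rng = np.random.default_rng(0)
    u = rng.dirichlet(np.ones(n_clusters), size=len(x))          # (N, C) memberships
    for _ in range(n_iter):
        um = u ** m
        centers = (um.T @ x) / (um.sum(axis=0)[:, None] + eps)   # (C, D)
        d = np.linalg.norm(x[:, None, :] - centers[None, :, :], axis=2) + eps  # (N, C)
        inv = d ** (-2.0 / (m - 1.0))
        u = inv / inv.sum(axis=1, keepdims=True)                 # normalize over clusters
    return u, centers
</code></pre>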
</p> </div> </dd> <dt> <a name='item109'>[109]</a> <a href ="/abs/2411.14412" title="Abstract" id="2411.14412"> arXiv:2411.14412 </a> (cross-list from quant-ph) [<a href="/pdf/2411.14412" title="Download PDF" id="pdf-2411.14412" aria-labelledby="pdf-2411.14412">pdf</a>, <a href="https://arxiv.org/html/2411.14412v1" title="View HTML" id="html-2411.14412" aria-labelledby="html-2411.14412" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14412" title="Other formats" id="oth-2411.14412" aria-labelledby="oth-2411.14412">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adversarial Poisoning Attack on Quantum Machine Learning Models </div> <div class='list-authors'><a href="https://arxiv.org/search/quant-ph?searchtype=author&amp;query=Kundu,+S">Satwik Kundu</a>, <a href="https://arxiv.org/search/quant-ph?searchtype=author&amp;query=Ghosh,+S">Swaroop Ghosh</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Quantum Physics (quant-ph)</span>; Cryptography and Security (cs.CR); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> With the growing interest in Quantum Machine Learning (QML) and the increasing availability of quantum computers through cloud providers, addressing the potential security risks associated with QML has become an urgent priority. One key concern in the QML domain is the threat of data poisoning attacks in the current quantum cloud setting. Adversarial access to training data could severely compromise the integrity and availability of QML models. Classical data poisoning techniques require significant knowledge and training to generate poisoned data, and lack noise resilience, making them ineffective for QML models in the Noisy Intermediate Scale Quantum (NISQ) era. In this work, we first propose a simple yet effective technique to measure intra-class encoder state similarity (ESS) by analyzing the outputs of encoding circuits. Leveraging this approach, we introduce a quantum indiscriminate data poisoning attack, QUID. Through extensive experiments conducted in both noiseless and noisy environments (e.g., IBM\_Brisbane&#39;s noise), across various architectures and datasets, QUID achieves up to $92\%$ accuracy degradation in model performance compared to baseline models and up to $75\%$ accuracy degradation compared to random label-flipping. We also tested QUID against state-of-the-art classical defenses, with accuracy degradation still exceeding $50\%$, demonstrating its effectiveness. This work represents the first attempt to reevaluate data poisoning attacks in the context of QML. 
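<br /><br /> The random label-flipping baseline referred to above is straightforward to sketch (the poisoning fraction and seed below are arbitrary choices, not values from the paper):
<pre><code class="language-python">
# Sketch of the random label-flipping baseline: a fraction of training labels
# is reassigned to a different class chosen uniformly at random.
import numpy as np

def flip_labels(labels, num_classes, fraction=0.3, seed=0):
    rng = np.random.default_rng(seed)
    labels = labels.copy()
    idx = rng.choice(len(labels), size=int(fraction * len(labels)), replace=False)
    offsets = rng.integers(1, num_classes, size=len(idx))   # never maps a label to itself
    labels[idx] = (labels[idx] + offsets) % num_classes
    return labels
</code></pre>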
</p> </div> </dd> <dt> <a name='item110'>[110]</a> <a href ="/abs/2411.14418" title="Abstract" id="2411.14418"> arXiv:2411.14418 </a> (cross-list from eess.IV) [<a href="/pdf/2411.14418" title="Download PDF" id="pdf-2411.14418" aria-labelledby="pdf-2411.14418">pdf</a>, <a href="https://arxiv.org/html/2411.14418v1" title="View HTML" id="html-2411.14418" aria-labelledby="html-2411.14418" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14418" title="Other formats" id="oth-2411.14418" aria-labelledby="oth-2411.14418">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multimodal 3D Brain Tumor Segmentation with Adversarial Training and Conditional Random Field </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jiang,+L">Lan Jiang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zheng,+Y">Yuchao Zheng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+M">Miao Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+H">Haiqing Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Aladwani,+F">Fatemah Aladwani</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Perelli,+A">Alessandro Perelli</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 7 figures, Annual Conference on Medical Image Understanding and Analysis (MIUA) 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Medical Image Understanding and Analysis (MIUA), Lecture Notes in Computer Science, Springer, vol. 14859, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Accurate brain tumor segmentation remains a challenging task due to structural complexity and great individual differences of gliomas. Leveraging the pre-eminent detail resilience of CRF and spatial feature extraction capacity of V-net, we propose a multimodal 3D Volume Generative Adversarial Network (3D-vGAN) for precise segmentation. The model utilizes Pseudo-3D for V-net improvement, adds conditional random field after generator and use original image as supplemental guidance. Results, using the BraTS-2018 dataset, show that 3D-vGAN outperforms classical segmentation models, including U-net, Gan, FCN and 3D V-net, reaching specificity over 99.8%. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 70 of 70 entries)</h3> <dt> <a name='item111'>[111]</a> <a href ="/abs/2003.03653" title="Abstract" id="2003.03653"> arXiv:2003.03653 </a> (replaced) [<a href="/pdf/2003.03653" title="Download PDF" id="pdf-2003.03653" aria-labelledby="pdf-2003.03653">pdf</a>, <a href="https://arxiv.org/html/2003.03653v4" title="View HTML" id="html-2003.03653" aria-labelledby="html-2003.03653" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2003.03653" title="Other formats" id="oth-2003.03653" aria-labelledby="oth-2003.03653">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SalsaNext: Fast, Uncertainty-aware Semantic Segmentation of LiDAR Point Clouds for Autonomous Driving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cortinhal,+T">Tiago Cortinhal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tzelepis,+G">George Tzelepis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Aksoy,+E+E">Eren Erdal Aksoy</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> In this paper, we introduce SalsaNext for the uncertainty-aware semantic segmentation of a full 3D LiDAR point cloud in real-time. SalsaNext is the next version of SalsaNet [1] which has an encoder-decoder architecture where the encoder unit has a set of ResNet blocks and the decoder part combines upsampled features from the residual blocks. In contrast to SalsaNet, we introduce a new context module, replace the ResNet encoder blocks with a new residual dilated convolution stack with gradually increasing receptive fields and add the pixel-shuffle layer in the decoder. Additionally, we switch from stride convolution to average pooling and also apply central dropout treatment. To directly optimize the Jaccard index, we further combine the weighted cross-entropy loss with Lovasz-Softmax loss [2]. We finally inject a Bayesian treatment to compute the epistemic and aleatoric uncertainties for each point in the cloud. We provide a thorough quantitative evaluation on the Semantic-KITTI dataset [3], which demonstrates that the proposed SalsaNext outperforms other state-of-the-art semantic segmentation networks and ranks first on the Semantic-KITTI leaderboard. We also release our source code <a href="https://github.com/TiagoCortinhal/SalsaNext" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
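<br /><br /> The combined-loss idea can be sketched as follows; note that the paper pairs weighted cross-entropy with the Lovasz-Softmax loss, whereas a simpler soft-Jaccard surrogate stands in for it in this illustration:
<pre><code class="language-python">
# Sketch: weighted cross-entropy plus a soft-Jaccard term for segmentation.
import torch
import torch.nn.functional as F

def combined_loss(logits, target, class_weights, jaccard_weight=1.0, eps=1e-6):
    # logits: (B, C, H, W); target: (B, H, W) integer labels.
    ce = F.cross_entropy(logits, target, weight=class_weights)
    probs = torch.softmax(logits, dim=1)
    one_hot = F.one_hot(target, num_classes=logits.shape[1]).permute(0, 3, 1, 2).float()
    inter = (probs * one_hot).sum(dim=(0, 2, 3))
    union = (probs + one_hot - probs * one_hot).sum(dim=(0, 2, 3))
    soft_jaccard = (inter + eps) / (union + eps)
    return ce + jaccard_weight * (1.0 - soft_jaccard.mean())
</code></pre>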
</p> </div> </dd> <dt> <a name='item112'>[112]</a> <a href ="/abs/2209.15179" title="Abstract" id="2209.15179"> arXiv:2209.15179 </a> (replaced) [<a href="/pdf/2209.15179" title="Download PDF" id="pdf-2209.15179" aria-labelledby="pdf-2209.15179">pdf</a>, <a href="https://arxiv.org/html/2209.15179v4" title="View HTML" id="html-2209.15179" aria-labelledby="html-2209.15179" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2209.15179" title="Other formats" id="oth-2209.15179" aria-labelledby="oth-2209.15179">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Physical Adversarial Attack meets Computer Vision: A Decade Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wei,+H">Hui Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tang,+H">Hao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+X">Xuemei Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhixiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+H">Hanxun Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Z">Zhubo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Satoh,+S">Shin&#39;ichi Satoh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Van+Gool,+L">Luc Van Gool</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zheng Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published at IEEE TPAMI. GitHub:<a href="https://github.com/weihui1308/PAA" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Despite the impressive achievements of Deep Neural Networks (DNNs) in computer vision, their vulnerability to adversarial attacks remains a critical concern. Extensive research has demonstrated that incorporating sophisticated perturbations into input images can lead to a catastrophic degradation in DNNs&#39; performance. This perplexing phenomenon not only exists in the digital space but also in the physical world. Consequently, it becomes imperative to evaluate the security of DNNs-based systems to ensure their safe deployment in real-world scenarios, particularly in security-sensitive applications. To facilitate a profound understanding of this topic, this paper presents a comprehensive overview of physical adversarial attacks. Firstly, we distill four general steps for launching physical adversarial attacks. Building upon this foundation, we uncover the pervasive role of artifacts carrying adversarial perturbations in the physical world. These artifacts influence each step. To denote them, we introduce a new term: adversarial medium. Then, we take the first step to systematically evaluate the performance of physical adversarial attacks, taking the adversarial medium as a first attempt. Our proposed evaluation metric, hiPAA, comprises six perspectives: Effectiveness, Stealthiness, Robustness, Practicability, Aesthetics, and Economics. We also provide comparative results across task categories, together with insightful observations and suggestions for future research directions. 
</p> </div> </dd> <dt> <a name='item113'>[113]</a> <a href ="/abs/2306.02243" title="Abstract" id="2306.02243"> arXiv:2306.02243 </a> (replaced) [<a href="/pdf/2306.02243" title="Download PDF" id="pdf-2306.02243" aria-labelledby="pdf-2306.02243">pdf</a>, <a href="https://arxiv.org/html/2306.02243v3" title="View HTML" id="html-2306.02243" aria-labelledby="html-2306.02243" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2306.02243" title="Other formats" id="oth-2306.02243" aria-labelledby="oth-2306.02243">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Retrieval-Enhanced Visual Prompt Learning for Few-shot Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rong,+J">Jintao Rong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+H">Hao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ou,+L">Linlin Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+T">Tianxiao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+X">Xinyi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yifan Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The Contrastive Language-Image Pretraining (CLIP) model has been widely used in various downstream vision tasks. The few-shot learning paradigm has been widely adopted to augment its capacity for these tasks. However, current paradigms may struggle with fine-grained classification, such as satellite image recognition, due to widening domain gaps. To address this limitation, we propose retrieval-enhanced visual prompt learning (RePrompt), which introduces retrieval mechanisms to cache and reuse the knowledge of downstream tasks. RePrompt constructs a retrieval database from either training examples or external data if available, and uses a retrieval mechanism to enhance multiple stages of a simple prompt learning baseline, thus narrowing the domain gap. During inference, our enhanced model can reference similar samples brought by retrieval to make more accurate predictions. A detailed analysis reveals that retrieval helps to improve the distribution of late features, thus, improving generalization for downstream tasks. Reprompt attains state-of-the-art performance on a wide range of vision datasets, including 11 image datasets, 3 video datasets, 1 multi-view dataset, and 4 domain generalization benchmarks. 
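<br /><br /> A stripped-down sketch of the retrieval mechanism (not RePrompt itself; the feature shapes, neighbourhood size, and blending weight are assumptions): cached training features are searched by cosine similarity and the retrieved labels are blended with the model's own prediction:
<pre><code class="language-python">
# Sketch: cosine k-nearest-neighbour retrieval over cached features, blended
# with the model's predicted class probabilities.
import numpy as np

def retrieve_blend(query_feat, cache_feats, cache_labels, model_probs, k=8, alpha=0.3):
    # cache_feats: (N, D) L2-normalized features; cache_labels: (N,) integer labels.
    q = query_feat / (np.linalg.norm(query_feat) + 1e-12)
    sims = cache_feats @ q
    top = np.argsort(-sims)[:k]
    num_classes = model_probs.shape[-1]
    retrieved = np.bincount(cache_labels[top], minlength=num_classes).astype(float)
    retrieved /= retrieved.sum()
    return (1 - alpha) * model_probs + alpha * retrieved
</code></pre>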
</p> </div> </dd> <dt> <a name='item114'>[114]</a> <a href ="/abs/2308.00090" title="Abstract" id="2308.00090"> arXiv:2308.00090 </a> (replaced) [<a href="/pdf/2308.00090" title="Download PDF" id="pdf-2308.00090" aria-labelledby="pdf-2308.00090">pdf</a>, <a href="https://arxiv.org/html/2308.00090v3" title="View HTML" id="html-2308.00090" aria-labelledby="html-2308.00090" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2308.00090" title="Other formats" id="oth-2308.00090" aria-labelledby="oth-2308.00090">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VG-SSL: Benchmarking Self-supervised Representation Learning Approaches for Visual Geo-localization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiao,+J">Jiuhong Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+G">Gao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Loianno,+G">Giuseppe Loianno</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages (including appendix, references), 7 figures, 7 tables. Accepted for WACV 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Visual Geo-localization (VG) is a critical research area for identifying geo-locations from visual inputs, particularly in autonomous navigation for robotics and vehicles. Current VG methods often learn feature extractors from geo-labeled images to create dense, geographically relevant representations. Recent advances in Self-Supervised Learning (SSL) have demonstrated its capability to achieve performance on par with supervised techniques with unlabeled images. This study presents a novel VG-SSL framework, designed for versatile integration and benchmarking of diverse SSL methods for representation learning in VG, featuring a unique geo-related pair strategy, GeoPair. Through extensive performance analysis, we adapt SSL techniques to improve VG on datasets from hand-held and car-mounted cameras used in robotics and autonomous vehicles. Our results show that contrastive learning and information maximization methods yield superior geo-specific representation quality, matching or surpassing the performance of state-of-the-art VG techniques. To our knowledge, This is the first benchmarking study of SSL in VG, highlighting its potential in enhancing geo-specific visual representations for robotics and autonomous vehicles. The code is publicly available at <a href="https://github.com/arplaboratory/VG-SSL" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item115'>[115]</a> <a href ="/abs/2308.08812" title="Abstract" id="2308.08812"> arXiv:2308.08812 </a> (replaced) [<a href="/pdf/2308.08812" title="Download PDF" id="pdf-2308.08812" aria-labelledby="pdf-2308.08812">pdf</a>, <a href="https://arxiv.org/html/2308.08812v2" title="View HTML" id="html-2308.08812" aria-labelledby="html-2308.08812" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2308.08812" title="Other formats" id="oth-2308.08812" aria-labelledby="oth-2308.08812">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Fusion of Variational Distribution Priors and Saliency Map Replay for Continual 3D Reconstruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Palit,+S">Sanchar Palit</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Biswas,+S">Sandika Biswas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> at ICVGIP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Single-image 3D reconstruction is a research challenge focused on predicting 3D object shapes from single-view images. This task requires significant data acquisition to predict both visible and occluded portions of the shape. Furthermore, learning-based methods face the difficulty of creating a comprehensive training dataset for all possible classes. To this end, we propose a continual learning-based 3D reconstruction method where our goal is to design a model using Variational Priors that can still reconstruct the previously seen classes reasonably even after training on new classes. Variational Priors represent abstract shapes and combat forgetting, whereas saliency maps preserve object attributes with less memory usage. This is vital due to resource constraints in storing extensive training data. Additionally, we introduce saliency map-based experience replay to capture global and distinct object features. Thorough experiments show competitive results compared to established methods, both quantitatively and qualitatively. 
</p> </div> </dd> <dt> <a name='item116'>[116]</a> <a href ="/abs/2309.10011" title="Abstract" id="2309.10011"> arXiv:2309.10011 </a> (replaced) [<a href="/pdf/2309.10011" title="Download PDF" id="pdf-2309.10011" aria-labelledby="pdf-2309.10011">pdf</a>, <a href="https://arxiv.org/html/2309.10011v3" title="View HTML" id="html-2309.10011" aria-labelledby="html-2309.10011" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2309.10011" title="Other formats" id="oth-2309.10011" aria-labelledby="oth-2309.10011">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Universal Photorealistic Style Transfer: A Lightweight and Adaptive Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+R">Rong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+E">Enyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+A">Andrew Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Easley,+S+J">Scott John Easley</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Image and Video Processing (eess.IV) </div> <p class='mathjax'> Photorealistic style transfer aims to apply stylization while preserving the realism and structure of input content. However, existing methods often encounter challenges such as color tone distortions, dependency on pair-wise pre-training, inefficiency with high-resolution inputs, and the need for additional constraints in video style transfer tasks. To address these issues, we propose a Universal Photorealistic Style Transfer (UPST) framework that delivers accurate photorealistic style transfer on high-resolution images and videos without relying on pre-training. Our approach incorporates a lightweight StyleNet for per-instance transfer, ensuring color tone accuracy while supporting high-resolution inputs, maintaining rapid processing speeds, and eliminating the need for pretraining. To further enhance photorealism and efficiency, we introduce instance-adaptive optimization, which features an adaptive coefficient to prioritize content image realism and employs early stopping to accelerate network convergence. Additionally, UPST enables seamless video style transfer without additional constraints due to its strong non-color information preservation ability. Experimental results show that UPST consistently produces photorealistic outputs and significantly reduces GPU memory usage, making it an effective and universal solution for various photorealistic style transfer tasks. 
</p> </div> </dd> <dt> <a name='item117'>[117]</a> <a href ="/abs/2310.06313" title="Abstract" id="2310.06313"> arXiv:2310.06313 </a> (replaced) [<a href="/pdf/2310.06313" title="Download PDF" id="pdf-2310.06313" aria-labelledby="pdf-2310.06313">pdf</a>, <a href="https://arxiv.org/html/2310.06313v4" title="View HTML" id="html-2310.06313" aria-labelledby="html-2310.06313" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.06313" title="Other formats" id="oth-2310.06313" aria-labelledby="oth-2310.06313">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Advancing Pose-Guided Image Synthesis with Progressive Conditional Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+F">Fei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ye,+H">Hu Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jun Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Cong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Han,+X">Xiao Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+W">Wei Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICLR 2024. The final version is available at OpenReview: <a href="https://openreview.net/forum?id=rHzapPnCgT" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> The Twelfth International Conference on Learning Representations, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Recent work has showcased the significant potential of diffusion models in pose-guided person image synthesis. However, owing to the inconsistency in pose between the source and target images, synthesizing an image with a distinct pose, relying exclusively on the source image and target pose information, remains a formidable challenge. This paper presents Progressive Conditional Diffusion Models (PCDMs) that incrementally bridge the gap between person images under the target and source poses through three stages. Specifically, in the first stage, we design a simple prior conditional diffusion model that predicts the global features of the target image by mining the global alignment relationship between pose coordinates and image appearance. Then, the second stage establishes a dense correspondence between the source and target images using the global features from the previous stage, and an inpainting conditional diffusion model is proposed to further align and enhance the contextual features, generating a coarse-grained person image. In the third stage, we propose a refining conditional diffusion model to utilize the coarsely generated image from the previous stage as a condition, achieving texture restoration and enhancing fine-detail consistency. The three-stage PCDMs work progressively to generate the final high-quality and high-fidelity synthesized image. 
Both qualitative and quantitative results demonstrate the consistency and photorealism of our proposed PCDMs under challenging scenarios. The code and model will be available at <a href="https://github.com/tencent-ailab/PCDMs" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item118'>[118]</a> <a href ="/abs/2311.03478" title="Abstract" id="2311.03478"> arXiv:2311.03478 </a> (replaced) [<a href="/pdf/2311.03478" title="Download PDF" id="pdf-2311.03478" aria-labelledby="pdf-2311.03478">pdf</a>, <a href="https://arxiv.org/html/2311.03478v2" title="View HTML" id="html-2311.03478" aria-labelledby="html-2311.03478" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.03478" title="Other formats" id="oth-2311.03478" aria-labelledby="oth-2311.03478">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi Loss-based Feature Fusion and Top Two Voting Ensemble Decision Strategy for Facial Expression Recognition in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+G">Guangyao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+Y">Yuanlun Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fu,+Y">Yiqin Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhaokun Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Facial expression recognition (FER) in the wild is a challenging task affected by the image quality and has attracted broad interest in computer vision. To date, no research has applied feature fusion and an ensemble strategy to FER simultaneously. Different from previous studies, this paper applies both internal feature fusion for a single model and feature fusion among multiple networks, as well as the ensemble strategy. This paper proposes one novel single model named R18+FAML, as well as one ensemble model named R18+FAML-FGA-T2V, to improve the performance of FER in the wild. Based on the structure of ResNet18 (R18), R18+FAML combines internal Feature fusion and three Attention blocks using Multiple Loss functions (FAML) to improve the diversity of the feature extraction. To improve the performance of R18+FAML, we propose a Feature fusion among networks based on the Genetic Algorithm (FGA), which can fuse the convolution kernels for feature extraction of multiple networks. On the basis of R18+FAML and FGA, we propose one ensemble strategy, i.e., Top Two Voting (T2V), to support the classification of FER, which can consider more classification information comprehensively. Combining the above strategies, R18+FAML-FGA-T2V can focus on the main expression-aware areas. Extensive experiments demonstrate that our single model R18+FAML and the ensemble model R18+FAML-FGA-T2V achieve the accuracies of $\left( 90.32, 62.17, 65.83 \right)\%$ and $\left( 91.59, 63.27, 66.63 \right)\%$ on three challenging unbalanced FER datasets RAF-DB, AffectNet-8 and AffectNet-7 respectively, both outperforming the state-of-the-art results.
</p> </div> </dd> <dt> <a name='item119'>[119]</a> <a href ="/abs/2311.15864" title="Abstract" id="2311.15864"> arXiv:2311.15864 </a> (replaced) [<a href="/pdf/2311.15864" title="Download PDF" id="pdf-2311.15864" aria-labelledby="pdf-2311.15864">pdf</a>, <a href="https://arxiv.org/html/2311.15864v4" title="View HTML" id="html-2311.15864" aria-labelledby="html-2311.15864" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.15864" title="Other formats" id="oth-2311.15864" aria-labelledby="oth-2311.15864">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InterControl: Zero-shot Human Interaction Generation by Controlling Every Joint </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhenzhi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+J">Jingbo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yixuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+D">Dahua Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dai,+B">Bo Dai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 camera ready version. TL;DR: Generate human interactions with only single-person motion data in training via joint contact pairs from LLMs </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Text-conditioned motion synthesis has made remarkable progress with the emergence of diffusion models. However, the majority of these motion diffusion models are primarily designed for a single character and overlook multi-human interactions. In our approach, we strive to explore this problem by synthesizing human motion with interactions for a group of characters of any size in a zero-shot manner. The key aspect of our approach is the adaptation of human-wise interactions as pairs of human joints that can be either in contact or separated by a desired distance. In contrast to existing methods that necessitate training motion generation models on multi-human motion datasets with a fixed number of characters, our approach inherently possesses the flexibility to model human interactions involving an arbitrary number of individuals, thereby transcending the limitations imposed by the training data. We introduce a novel controllable motion generation method, InterControl, to encourage the synthesized motions maintaining the desired distance between joint pairs. It consists of a motion controller and an inverse kinematics guidance module that realistically and accurately aligns the joints of synthesized characters to the desired location. Furthermore, we demonstrate that the distance between joint pairs for human-wise interactions can be generated using an off-the-shelf Large Language Model (LLM). Experimental results highlight the capability of our framework to generate interactions with multiple human characters and its potential to work with off-the-shelf physics-based character simulators. 
Code is available at <a href="https://github.com/zhenzhiwang/intercontrol" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item120'>[120]</a> <a href ="/abs/2312.00236" title="Abstract" id="2312.00236"> arXiv:2312.00236 </a> (replaced) [<a href="/pdf/2312.00236" title="Download PDF" id="pdf-2312.00236" aria-labelledby="pdf-2312.00236">pdf</a>, <a href="https://arxiv.org/html/2312.00236v3" title="View HTML" id="html-2312.00236" aria-labelledby="html-2312.00236" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2312.00236" title="Other formats" id="oth-2312.00236" aria-labelledby="oth-2312.00236">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Brainformer: Mimic Human Visual Brain Functions to Machine Vision Models via fMRI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+X">Xuan-Bac Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+X">Xin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sinha,+P">Pawan Sinha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Khan,+S+U">Samee U. Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luu,+K">Khoa Luu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Human perception plays a vital role in forming beliefs and understanding reality. A deeper understanding of brain functionality will lead to the development of novel deep neural networks. In this work, we introduce a novel framework named Brainformer, a straightforward yet effective Transformer-based framework, to analyze Functional Magnetic Resonance Imaging (fMRI) patterns in the human perception system from a machine-learning perspective. Specifically, we present the Multi-scale fMRI Transformer to explore brain activity patterns through fMRI signals. This architecture includes a simple yet efficient module for high-dimensional fMRI signal encoding and incorporates a novel embedding technique called 3D Voxels Embedding. Secondly, drawing inspiration from the functionality of the brain&#39;s Region of Interest, we introduce a novel loss function called Brain fMRI Guidance Loss. This loss function mimics brain activity patterns from these regions in the deep neural network using fMRI data. This work introduces a prospective approach to transferring knowledge from human perception to neural networks. Our experiments demonstrate that leveraging fMRI information allows the machine vision model to achieve results comparable to State-of-the-Art methods in various image recognition tasks. 
</p> </div> </dd> <dt> <a name='item121'>[121]</a> <a href ="/abs/2312.02124" title="Abstract" id="2312.02124"> arXiv:2312.02124 </a> (replaced) [<a href="/pdf/2312.02124" title="Download PDF" id="pdf-2312.02124" aria-labelledby="pdf-2312.02124">pdf</a>, <a href="https://arxiv.org/html/2312.02124v2" title="View HTML" id="html-2312.02124" aria-labelledby="html-2312.02124" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2312.02124" title="Other formats" id="oth-2312.02124" aria-labelledby="oth-2312.02124">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VerA: Versatile Anonymization Applicable to Clinical Facial Photographs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Helou,+M+E">Majed El Helou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cetin,+D">Doruk Cetin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Stamenkovic,+P">Petar Stamenkovic</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huber,+N+B">Niko Benjamin Huber</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Z%C3%BCnd,+F">Fabio Zünd</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> accepted to WACV 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Graphics (cs.GR); Machine Learning (cs.LG) </div> <p class='mathjax'> The demand for privacy in facial image dissemination is gaining ground internationally, echoed by the proliferation of regulations such as GDPR, DPDPA, CCPA, PIPL, and APPI. While recent advances in anonymization surpass pixelation or blur methods, additional constraints to the task pose challenges. Largely unaddressed by current anonymization methods are clinical images and pairs of before-and-after clinical images illustrating facial medical interventions, e.g., facial surgeries or dental procedures. We present VerA, the first Versatile Anonymization framework that solves two challenges in clinical applications: A) it preserves selected semantic areas (e.g., mouth region) to show medical intervention results, that is, anonymization is only applied to the areas outside the preserved area; and B) it produces anonymized images with consistent personal identity across multiple photographs, which is crucial for anonymizing photographs of the same person taken before and after a clinical intervention. We validate our results on both single and paired anonymization of clinical images through extensive quantitative and qualitative evaluation. We also demonstrate that VerA reaches the state of the art on established anonymization tasks, in terms of photorealism and de-identification.
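For illustration only, a minimal sketch of the region-preserving composition described in point A) above; array shapes and the soft mask are assumptions, not the actual VerA interface.
<pre>
# Sketch only: keep the preserved semantic area from the original clinical
# photograph and take everything else from the anonymized image.
import numpy as np

def compose_preserving_region(original, anonymized, preserve_mask):
    # original, anonymized: (H, W, 3) float arrays in [0, 1]
    # preserve_mask: (H, W) float array in [0, 1], 1 inside the preserved area
    m = preserve_mask[..., None]
    return m * original + (1.0 - m) * anonymized
</pre>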
</p> </div> </dd> <dt> <a name='item122'>[122]</a> <a href ="/abs/2401.13721" title="Abstract" id="2401.13721"> arXiv:2401.13721 </a> (replaced) [<a href="/pdf/2401.13721" title="Download PDF" id="pdf-2401.13721" aria-labelledby="pdf-2401.13721">pdf</a>, <a href="https://arxiv.org/html/2401.13721v3" title="View HTML" id="html-2401.13721" aria-labelledby="html-2401.13721" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2401.13721" title="Other formats" id="oth-2401.13721" aria-labelledby="oth-2401.13721">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Guided Alignment for Unsupervised Domain Adaptation in Regression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nejjar,+I">Ismail Nejjar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Frusque,+G">Gaetan Frusque</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Forest,+F">Florent Forest</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fink,+O">Olga Fink</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Unsupervised Domain Adaptation for Regression (UDAR) aims to adapt models from a labeled source domain to an unlabeled target domain for regression tasks. Traditional feature alignment methods, successful in classification, often prove ineffective for regression due to the correlated nature of regression features. To address this challenge, we propose Uncertainty-Guided Alignment (UGA), a novel method that integrates predictive uncertainty into the feature alignment process. UGA employs Evidential Deep Learning to predict both target values and their associated uncertainties. This uncertainty information guides the alignment process and fuses information within the embedding space, effectively mitigating issues such as feature collapse in out-of-distribution scenarios. We evaluate UGA on two computer vision benchmarks and a real-world battery state-of-charge prediction across different manufacturers and operating temperatures. Across 52 transfer tasks, UGA on average outperforms existing state-of-the-art methods. Our approach not only improves adaptation performance but also provides well-calibrated uncertainty estimates. 
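For intuition only, a minimal sketch of an evidential regression head of the kind such a method could rely on, using the common Normal-Inverse-Gamma parameterization; the parameterization and the way uncertainty enters the alignment are assumptions here, not the exact UGA formulation.
<pre>
# Hypothetical sketch: a head that predicts a target value together with
# aleatoric and epistemic uncertainty estimates. The epistemic term could, for
# example, weight the feature alignment loss; this is illustrative only.
import torch.nn as nn
import torch.nn.functional as F

class EvidentialHead(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.fc = nn.Linear(in_dim, 4)  # gamma, nu, alpha, beta

    def forward(self, feats):
        gamma, nu, alpha, beta = self.fc(feats).chunk(4, dim=-1)
        nu = F.softplus(nu)               # nu positive
        alpha = F.softplus(alpha) + 1.0   # alpha greater than 1
        beta = F.softplus(beta)           # beta positive
        aleatoric = beta / (alpha - 1.0)            # expected data noise
        epistemic = beta / (nu * (alpha - 1.0))     # variance of the predicted mean
        return gamma, aleatoric, epistemic
</pre>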
</p> </div> </dd> <dt> <a name='item123'>[123]</a> <a href ="/abs/2402.00712" title="Abstract" id="2402.00712"> arXiv:2402.00712 </a> (replaced) [<a href="/pdf/2402.00712" title="Download PDF" id="pdf-2402.00712" aria-labelledby="pdf-2402.00712">pdf</a>, <a href="https://arxiv.org/html/2402.00712v5" title="View HTML" id="html-2402.00712" aria-labelledby="html-2402.00712" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.00712" title="Other formats" id="oth-2402.00712" aria-labelledby="oth-2402.00712">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ChaosBench: A Multi-Channel, Physics-Based Benchmark for Subseasonal-to-Seasonal Climate Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nathaniel,+J">Juan Nathaniel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qu,+Y">Yongquan Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+T">Tung Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+S">Sungduk Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Busecke,+J">Julius Busecke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Grover,+A">Aditya Grover</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gentine,+P">Pierre Gentine</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 D&amp;B Track (Oral) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> Accurate prediction of climate at the subseasonal-to-seasonal (S2S) scale is crucial for disaster preparedness and robust decision making amidst climate change. Yet, forecasting beyond the weather timescale is challenging because it deals with problems beyond initial conditions, including boundary interaction, the butterfly effect, and our inherent lack of physical understanding. At present, existing benchmarks tend to have a shorter forecasting range of up to 15 days, do not include a wide range of operational baselines, and lack physics-based constraints for explainability. Thus, we propose ChaosBench, a challenging benchmark to extend the predictability range of data-driven weather emulators to the S2S timescale. First, ChaosBench comprises variables beyond the typical surface-atmospheric ERA5, also including ocean, ice, and land reanalysis products that span over 45 years, to allow for full Earth system emulation that respects boundary conditions. We also propose physics-based metrics, in addition to deterministic and probabilistic ones, to ensure a physically consistent ensemble that accounts for the butterfly effect. Furthermore, we evaluate a diverse set of physics-based forecasts from four national weather agencies as baselines to data-driven counterparts such as ViT/ClimaX, PanguWeather, GraphCast, and FourCastNetV2. Overall, we find that methods originally developed for weather-scale applications fail on the S2S task: their performance simply collapses to an unskilled climatology. Nonetheless, we outline and demonstrate several strategies that can extend the predictability range of existing weather emulators, including the use of ensembles, robust control of error propagation, and the use of physics-informed models.
Our benchmark, datasets, and instructions are available at <a href="https://leap-stc.github.io/ChaosBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item124'>[124]</a> <a href ="/abs/2402.01002" title="Abstract" id="2402.01002"> arXiv:2402.01002 </a> (replaced) [<a href="/pdf/2402.01002" title="Download PDF" id="pdf-2402.01002" aria-labelledby="pdf-2402.01002">pdf</a>, <a href="https://arxiv.org/html/2402.01002v3" title="View HTML" id="html-2402.01002" aria-labelledby="html-2402.01002" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.01002" title="Other formats" id="oth-2402.01002" aria-labelledby="oth-2402.01002">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AI-generated faces influence gender stereotypes and racial homogenization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=AlDahoul,+N">Nouar AlDahoul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahwan,+T">Talal Rahwan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zaki,+Y">Yasir Zaki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 47 pages, 19 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Text-to-image generative AI models such as Stable Diffusion are used daily by millions worldwide. However, the extent to which these models exhibit racial and gender stereotypes is not yet fully understood. Here, we document significant biases in Stable Diffusion across six races, two genders, 32 professions, and eight attributes. Additionally, we examine the degree to which Stable Diffusion depicts individuals of the same race as being similar to one another. This analysis reveals significant racial homogenization, e.g., depicting nearly all Middle Eastern men as bearded, brown-skinned, and wearing traditional attire. We then propose debiasing solutions that allow users to specify the desired distributions of race and gender when generating images while minimizing racial homogenization. Finally, using a preregistered survey experiment, we find evidence that being presented with inclusive AI-generated faces reduces people&#39;s racial and gender biases, while being presented with non-inclusive ones increases such biases, regardless of whether the images are labeled as AI-generated. Taken together, our findings emphasize the need to address biases and stereotypes in text-to-image models. 
</p> </div> </dd> <dt> <a name='item125'>[125]</a> <a href ="/abs/2402.19160" title="Abstract" id="2402.19160"> arXiv:2402.19160 </a> (replaced) [<a href="/pdf/2402.19160" title="Download PDF" id="pdf-2402.19160" aria-labelledby="pdf-2402.19160">pdf</a>, <a href="https://arxiv.org/html/2402.19160v4" title="View HTML" id="html-2402.19160" aria-labelledby="html-2402.19160" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.19160" title="Other formats" id="oth-2402.19160" aria-labelledby="oth-2402.19160">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Effective Message Hiding with Order-Preserving Mechanisms </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+G">Gao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xuchong,+Q">Qiu Xuchong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zihan,+Y">Ye Zihan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> BMVC 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Message hiding, a technique that conceals secret message bits within a cover image, aims to achieve an optimal balance among message capacity, recovery accuracy, and imperceptibility. While convolutional neural networks have notably improved message capacity and imperceptibility, achieving high recovery accuracy remains challenging. This challenge arises because convolutional operations struggle to preserve the sequential order of message bits and effectively address the discrepancy between these two modalities. To address this, we propose StegaFormer, an innovative MLP-based framework designed to preserve bit order and enable global fusion between modalities. Specifically, StegaFormer incorporates three crucial components: Order-Preserving Message Encoder (OPME), Decoder (OPMD) and Global Message-Image Fusion (GMIF). OPME and OPMD aim to preserve the order of message bits by segmenting the entire sequence into equal-length segments and incorporating sequential information during encoding and decoding. Meanwhile, GMIF employs a cross-modality fusion mechanism to effectively fuse the features from the two uncorrelated modalities. Experimental results on the COCO and DIV2K datasets demonstrate that StegaFormer surpasses existing state-of-the-art methods in terms of recovery accuracy, message capacity, and imperceptibility. We will make our code publicly available. 
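As a hedged sketch of the order-preserving encoding idea only (the bit string is split into equal-length segments and each segment carries its position), with layer sizes and names that are illustrative assumptions rather than the released code:
<pre>
# Hypothetical order-preserving message encoder: equal-length segments plus a
# learned per-segment positional term so the decoder can recover bits in order.
import torch
import torch.nn as nn

class OrderPreservingEncoder(nn.Module):
    def __init__(self, num_segments=16, seg_len=64, dim=256):
        super().__init__()
        self.num_segments, self.seg_len = num_segments, seg_len
        self.proj = nn.Sequential(nn.Linear(seg_len, dim), nn.GELU(), nn.Linear(dim, dim))
        self.pos = nn.Parameter(torch.zeros(num_segments, dim))  # sequential information

    def forward(self, bits):
        # bits: (B, num_segments * seg_len) tensor of 0/1 message bits
        segs = bits.view(-1, self.num_segments, self.seg_len).float()
        return self.proj(segs) + self.pos  # (B, num_segments, dim)
</pre>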
</p> </div> </dd> <dt> <a name='item126'>[126]</a> <a href ="/abs/2403.09055" title="Abstract" id="2403.09055"> arXiv:2403.09055 </a> (replaced) [<a href="/pdf/2403.09055" title="Download PDF" id="pdf-2403.09055" aria-labelledby="pdf-2403.09055">pdf</a>, <a href="https://arxiv.org/html/2403.09055v3" title="View HTML" id="html-2403.09055" aria-labelledby="html-2403.09055" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.09055" title="Other formats" id="oth-2403.09055" aria-labelledby="oth-2403.09055">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SemanticDraw: Towards Real-Time Interactive Content Creation from Image Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+J">Jaerin Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jung,+D+S">Daniel Sungho Jung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+K">Kanggeon Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+K+M">Kyoung Mu Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 15 figures. v3: added tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We introduce SemanticDraw, a new paradigm of interactive content creation where high-quality images are generated in near real-time from given multiple hand-drawn regions, each encoding prescribed semantic meaning. In order to maximize the productivity of content creators and to fully realize their artistic imagination, it requires both quick interactive interfaces and fine-grained regional controls in their tools. Despite astonishing generation quality from recent diffusion models, we find that existing approaches for regional controllability are very slow (52 seconds for $512 \times 512$ image) while not compatible with acceleration methods such as LCM, blocking their huge potential in interactive content creation. From this observation, we build our solution for interactive content creation in two steps: (1) we establish compatibility between region-based controls and acceleration techniques for diffusion models, maintaining high fidelity of multi-prompt image generation with $\times 10$ reduced number of inference steps, (2) we increase the generation throughput with our new multi-prompt stream batch pipeline, enabling low-latency generation from multiple, region-based text prompts on a single RTX 2080 Ti GPU. Our proposed framework is generalizable to any existing diffusion models and acceleration schedulers, allowing sub-second (0.64 seconds) image content creation application upon well-established image diffusion models. Our project page is: <a href="https://jaerinlee.com/research/semantic-draw" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item127'>[127]</a> <a href ="/abs/2403.19797" title="Abstract" id="2403.19797"> arXiv:2403.19797 </a> (replaced) [<a href="/pdf/2403.19797" title="Download PDF" id="pdf-2403.19797" aria-labelledby="pdf-2403.19797">pdf</a>, <a href="https://arxiv.org/html/2403.19797v4" title="View HTML" id="html-2403.19797" aria-labelledby="html-2403.19797" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.19797" title="Other formats" id="oth-2403.19797" aria-labelledby="oth-2403.19797">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient 3D Instance Mapping and Localization with Neural Fields </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tang,+G">George Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jatavallabhula,+K+M">Krishna Murthy Jatavallabhula</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Torralba,+A">Antonio Torralba</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> We tackle the problem of learning an implicit scene representation for 3D instance segmentation from a sequence of posed RGB images. Towards this, we introduce 3DIML, a novel framework that efficiently learns a neural label field which can render 3D instance segmentation masks from novel viewpoints. Opposed to prior art that optimizes a neural field in a self-supervised manner, requiring complicated training procedures and loss function design, 3DIML leverages a two-phase process. The first phase, InstanceMap, takes as input 2D segmentation masks of the image sequence generated by a frontend instance segmentation model, and associates corresponding masks across images to 3D labels. These almost 3D-consistent pseudolabel masks are then used in the second phase, InstanceLift, to supervise the training of a neural label field, which interpolates regions missed by InstanceMap and resolves ambiguities. Additionally, we introduce InstanceLoc, which enables near realtime localization of instance masks given a trained neural label field. We evaluate 3DIML on sequences from the Replica and ScanNet datasets and demonstrate its effectiveness under mild assumptions for the image sequences. We achieve a large practical speedup over existing implicit scene representation methods with comparable quality, showcasing its potential to facilitate faster and more effective 3D scene understanding. 
</p> </div> </dd> <dt> <a name='item128'>[128]</a> <a href ="/abs/2403.19912" title="Abstract" id="2403.19912"> arXiv:2403.19912 </a> (replaced) [<a href="/pdf/2403.19912" title="Download PDF" id="pdf-2403.19912" aria-labelledby="pdf-2403.19912">pdf</a>, <a href="https://arxiv.org/html/2403.19912v2" title="View HTML" id="html-2403.19912" aria-labelledby="html-2403.19912" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.19912" title="Other formats" id="oth-2403.19912" aria-labelledby="oth-2403.19912">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automated Identification and Segmentation of Hi Sources in CRAFTS Using Deep Learning Method </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+Z">Zihao Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+H">Huaxi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Quan,+D">Donghui Quan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+D">Di Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Y">Yinghui Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ni,+S">Shulei Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yunchuan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Y">Yun Zheng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Astrophysics of Galaxies (astro-ph.GA); Instrumentation and Methods for Astrophysics (astro-ph.IM) </div> <p class='mathjax'> Identifying neutral hydrogen (\hi) galaxies from observational data is a significant challenge in \hi\ galaxy surveys. With the advancement of observational technology, especially with the advent of large-scale telescope projects such as FAST and SKA, the significant increase in data volume presents new challenges for the efficiency and accuracy of data processing. To address this challenge, in this study, we present a machine learning-based method for extracting \hi\ sources from the three-dimensional (3D) spectral data obtained from the Commensal Radio Astronomy FAST Survey (CRAFTS). We have carefully assembled a specialized dataset, HISF, rich in \hi\ sources, specifically designed to enhance the detection process. Our model, Unet-LK, utilizes the advanced 3D-Unet segmentation architecture and employs an elongated convolution kernel to effectively capture the intricate structures of \hi\ sources. This strategy ensures a reliable identification and segmentation of \hi\ sources, achieving notable performance metrics with a recall rate of 91.6\% and an accuracy of 95.7\%. These results substantiate the robustness of our dataset and the effectiveness of our proposed network architecture in the precise identification of \hi\ sources. Our code and dataset are publicly available at \url{<a href="https://github.com/fishszh/HISF" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}.
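For illustration, an elongated 3D convolution of the kind described might look as follows; the concrete kernel size and channel counts are assumptions, not the values used in Unet-LK.
<pre>
# Sketch of an elongated 3D convolution: the kernel is longer along the
# spectral axis than along the two spatial axes of the (frequency, y, x) cube.
import torch.nn as nn

elongated_conv = nn.Conv3d(
    in_channels=1, out_channels=16,
    kernel_size=(7, 3, 3),   # (spectral, y, x): elongated along frequency
    padding=(3, 1, 1),       # keep the cube size unchanged
)
</pre>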
</p> </div> </dd> <dt> <a name='item129'>[129]</a> <a href ="/abs/2404.04856" title="Abstract" id="2404.04856"> arXiv:2404.04856 </a> (replaced) [<a href="/pdf/2404.04856" title="Download PDF" id="pdf-2404.04856" aria-labelledby="pdf-2404.04856">pdf</a>, <a href="https://arxiv.org/html/2404.04856v2" title="View HTML" id="html-2404.04856" aria-labelledby="html-2404.04856" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.04856" title="Other formats" id="oth-2404.04856" aria-labelledby="oth-2404.04856">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Msmsfnet: a multi-stream and multi-scale fusion net for edge detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+C">Chenguang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Chisheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+F">Feifei Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiao,+X">Xiayang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Su,+X">Xin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+C">Chuanhua Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+D">Dejin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Q">Qingquan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Edge detection is a long-standing problem in computer vision. Recent deep learning based algorithms achieve state-of-the-art performance in publicly available datasets. Despite their efficiency, their performance, however, relies heavily on the pre-trained weights of the backbone network on the ImageNet dataset. This significantly limits the design space of deep learning based edge detectors. Whenever we want to devise a new model, we have to train this new model on the ImageNet dataset first, and then fine-tune the model using the edge detection datasets. The comparison would be unfair otherwise. However, it is usually not feasible for many researchers to train a model on the ImageNet dataset due to the limited computation resources. Besides, if these methods need to be trained to detect edges in a different kind of data, Synthetic Aperture Radar (SAR) images for instance, the pre-trained weights on the ImageNet dataset are unlikely to improve the edge detection accuracy due to the strong differences in the statistics between optical and SAR images. In the meantime, no dataset for SAR image processing matches the size of the ImageNet dataset. In this work, we study the performance achievable by existing methods in publicly available datasets when they are trained from scratch, and devise a new network architecture, the multi-stream and multi-scale fusion net (msmsfnet), for edge detection. We show in our experiments that by training all models from scratch to ensure the fairness of comparison, our model outperforms state-of-the-art deep learning based edge detectors in three publicly available datasets. 
The efficiency of our model is further demonstrated by the experiments for edge detection in SAR images, which serves as an important evidence showing the meaningfulness of this work as no useful pre-trained weight is available for edge detection in SAR images. </p> </div> </dd> <dt> <a name='item130'>[130]</a> <a href ="/abs/2404.19513" title="Abstract" id="2404.19513"> arXiv:2404.19513 </a> (replaced) [<a href="/pdf/2404.19513" title="Download PDF" id="pdf-2404.19513" aria-labelledby="pdf-2404.19513">pdf</a>, <a href="https://arxiv.org/html/2404.19513v4" title="View HTML" id="html-2404.19513" aria-labelledby="html-2404.19513" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.19513" title="Other formats" id="oth-2404.19513" aria-labelledby="oth-2404.19513">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Smartphone-Based Method for Assessing Tomato Nutrient Status through Trichome Density Measurement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ueda,+S">Sho Ueda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ye,+X">Xujun Ye</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Early detection of fertilizer-induced stress in tomato plants is crucial for optimizing crop yield through timely management interventions. While conventional optical methods struggle to detect fertilizer stress in young leaves, these leaves contain valuable diagnostic information through their microscopic hair-like structures, particularly trichomes, which existing approaches have overlooked. This study introduces a smartphone-based noninvasive technique that leverages mobile computing and digital imaging capabilities to quantify trichome density on young leaves with superior detection latency. Our method uniquely combines augmented reality technology with image processing algorithms to analyze trichomes transferred onto specialized measurement paper. A robust automated pipeline processes these images through region extraction, perspective transformation, and illumination correction to precisely quantify trichome density. Validation experiments on hydroponically grown tomatoes under varying fertilizer conditions demonstrated the method&#39;s effectiveness. Leave-one-out cross-validation revealed strong predictive performance with the area under the precision-recall curve (PR-AUC: 0.82) and area under the receiver operating characteristic curve (ROC-AUC: 0.64), while the predicted and observed trichome densities exhibited high correlation ($r = 0.79$). This innovative approach transforms smartphones into precise diagnostic tools for plant nutrition assessment, offering a practical, cost-effective solution for precision agriculture. 
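A generic sketch of the three preprocessing steps named above using standard OpenCV calls; the corner input, blur scale, and output size are placeholders rather than the parameters used in the paper.
<pre>
# Sketch only: perspective rectification of the measurement paper followed by a
# simple illumination correction (divide by a heavily blurred background copy).
import cv2
import numpy as np

def rectify_and_correct(img_bgr, corners_px, out_size=(800, 800)):
    # corners_px: four (x, y) corners of the extracted paper region, in order.
    dst = np.float32([[0, 0], [out_size[0], 0],
                      [out_size[0], out_size[1]], [0, out_size[1]]])
    H = cv2.getPerspectiveTransform(np.float32(corners_px), dst)
    warped = cv2.warpPerspective(img_bgr, H, out_size)

    gray = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY).astype(np.float32)
    background = cv2.GaussianBlur(gray, (0, 0), sigmaX=51)   # smooth lighting estimate
    flat = cv2.normalize(gray / (background + 1e-6), None, 0, 255, cv2.NORM_MINMAX)
    return flat.astype(np.uint8)
</pre>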
</p> </div> </dd> <dt> <a name='item131'>[131]</a> <a href ="/abs/2405.13337" title="Abstract" id="2405.13337"> arXiv:2405.13337 </a> (replaced) [<a href="/pdf/2405.13337" title="Download PDF" id="pdf-2405.13337" aria-labelledby="pdf-2405.13337">pdf</a>, <a href="https://arxiv.org/html/2405.13337v2" title="View HTML" id="html-2405.13337" aria-labelledby="html-2405.13337" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.13337" title="Other formats" id="oth-2405.13337" aria-labelledby="oth-2405.13337">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Semantic Equitable Clustering: A Simple and Effective Strategy for Clustering Vision Tokens </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fan,+Q">Qihang Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+H">Huaibo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+M">Mingrui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+R">Ran He</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> The Vision Transformer (ViT) has gained prominence for its superior relational modeling prowess. However, its global attention mechanism&#39;s quadratic complexity poses substantial computational burdens. A common remedy spatially groups tokens for self-attention, reducing computational requirements. Nonetheless, this strategy neglects semantic information in tokens, possibly scattering semantically-linked tokens across distinct groups, thus compromising the efficacy of self-attention intended for modeling inter-token dependencies. Motivated by these insights, we introduce a fast and balanced clustering method, named \textbf{S}emantic \textbf{E}quitable \textbf{C}lustering (SEC). SEC clusters tokens based on their global semantic relevance in an efficient, straightforward manner. In contrast to traditional clustering methods requiring multiple iterations, our method achieves token clustering in a single pass. Additionally, SEC regulates the number of tokens per cluster, ensuring a balanced distribution for effective parallel processing on current computational platforms without necessitating further optimization. Capitalizing on SEC, we propose a versatile vision backbone, SECViT. Comprehensive experiments in image classification, object detection, instance segmentation, and semantic segmentation validate the effectiveness of SECViT. Moreover, SEC can be conveniently and swiftly applied to multimodal large language models (MLLM), such as LLaVA, to serve as a vision language connector, effectively improving the model&#39;s efficiency while maintaining unchanged or better performance.
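A hedged sketch of single-pass, size-balanced token grouping driven by a global relevance score; it illustrates only the one-pass, equal-sized-clusters property, and the relevance criterion here (similarity to the mean token) is an assumption rather than the exact SEC rule.
<pre>
# Illustrative only: score tokens against a global reference, sort once, and
# split the sorted order into equal-sized clusters (no iterative refinement).
import torch

def balanced_semantic_clusters(tokens, num_clusters):
    # tokens: (B, N, C); assumes N is divisible by num_clusters.
    B, N, C = tokens.shape
    global_token = tokens.mean(dim=1, keepdim=True)                       # (B, 1, C)
    relevance = torch.cosine_similarity(tokens, global_token.expand_as(tokens), dim=-1)
    order = relevance.argsort(dim=-1, descending=True)                    # single sort
    clusters = order.view(B, num_clusters, N // num_clusters)             # equal-sized groups
    return clusters   # token indices per cluster
</pre>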
</p> </div> </dd> <dt> <a name='item132'>[132]</a> <a href ="/abs/2405.14342" title="Abstract" id="2405.14342"> arXiv:2405.14342 </a> (replaced) [<a href="/pdf/2405.14342" title="Download PDF" id="pdf-2405.14342" aria-labelledby="pdf-2405.14342">pdf</a>, <a href="https://arxiv.org/html/2405.14342v3" title="View HTML" id="html-2405.14342" aria-labelledby="html-2405.14342" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.14342" title="Other formats" id="oth-2405.14342" aria-labelledby="oth-2405.14342">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoGs: Large Scale Road Surface Reconstruction with Meshgrid Gaussian </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+Z">Zhiheng Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+W">Wenhua Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deng,+T">Tianchen Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Hesheng Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Road surface reconstruction plays a crucial role in autonomous driving, which can be used for road lane perception and autolabeling. Recently, mesh-based road surface reconstruction algorithms have shown promising reconstruction results. However, these mesh-based methods suffer from slow speed and poor reconstruction quality. To address these limitations, we propose a novel large-scale road surface reconstruction approach with meshgrid Gaussian, named RoGs. Specifically, we model the road surface by placing Gaussian surfels in the vertices of a uniformly distributed square mesh, where each surfel stores color, semantic, and geometric information. This square mesh-based layout covers the entire road with fewer Gaussian surfels and reduces the overlap between Gaussian surfels during training. In addition, because the road surface has no thickness, 2D Gaussian surfel is more consistent with the physical reality of the road surface than 3D Gaussian sphere. Then, unlike previous initialization methods that rely on point clouds, we introduce a vehicle pose-based initialization method to initialize the height and rotation of the Gaussian surfel. Thanks to this meshgrid Gaussian modeling and pose-based initialization, our method achieves significant speedups while improving reconstruction quality. We obtain excellent results in reconstruction of road surfaces in a variety of challenging real-world scenes. 
</p> </div> </dd> <dt> <a name='item133'>[133]</a> <a href ="/abs/2405.16200" title="Abstract" id="2405.16200"> arXiv:2405.16200 </a> (replaced) [<a href="/pdf/2405.16200" title="Download PDF" id="pdf-2405.16200" aria-labelledby="pdf-2405.16200">pdf</a>, <a href="https://arxiv.org/html/2405.16200v2" title="View HTML" id="html-2405.16200" aria-labelledby="html-2405.16200" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.16200" title="Other formats" id="oth-2405.16200" aria-labelledby="oth-2405.16200">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FlightPatchNet: Multi-Scale Patch Network with Differential Coding for Flight Trajectory Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+L">Lan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+X">Xuebin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chu,+R">Ruijuan Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+G">Guangyi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yingchun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+L">Linyu Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Accurate multi-step flight trajectory prediction plays an important role in Air Traffic Control, which can ensure the safety of air transportation. Two main issues limit the flight trajectory prediction performance of existing works. The first issue is the negative impact on prediction accuracy caused by the significant differences in data range. The second issue is that real-world flight trajectories involve underlying temporal dependencies, and existing methods fail to reveal the hidden complex temporal variations and only extract features from one single time scale. To address the above issues, we propose FlightPatchNet, a multi-scale patch network with differential coding for flight trajectory prediction. Specifically, FlightPatchNet first utilizes the differential coding to encode the original values of longitude and latitude into first-order differences and generates embeddings for all variables at each time step. Then, a global temporal attention is introduced to explore the dependencies between different time steps. To fully explore the diverse temporal patterns in flight trajectories, a multi-scale patch network is delicately designed to serve as the backbone. The multi-scale patch network exploits stacked patch mixer blocks to capture inter- and intra-patch dependencies under different time scales, and further integrates multi-scale temporal features across different scales and variables. Finally, FlightPatchNet ensembles multiple predictors to make direct multi-step prediction. Extensive experiments on ADS-B datasets demonstrate that our model outperforms the competitive baselines. 
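For illustration only, a minimal sketch of the differential coding step (the variable layout, with longitude and latitude in the first two channels, is an assumption rather than the released code):
<pre>
# Replace longitude and latitude with their first-order differences before
# embedding, which narrows the value range across trajectories.
import torch

def differential_code(traj):
    # traj: (B, T, V) with traj[..., 0] = longitude, traj[..., 1] = latitude (assumed)
    coded = traj.clone()
    coded[:, 1:, :2] = traj[:, 1:, :2] - traj[:, :-1, :2]   # first-order differences
    coded[:, 0, :2] = 0.0                                    # no previous step at t = 0
    return coded
</pre>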
</p> </div> </dd> <dt> <a name='item134'>[134]</a> <a href ="/abs/2405.17158" title="Abstract" id="2405.17158"> arXiv:2405.17158 </a> (replaced) [<a href="/pdf/2405.17158" title="Download PDF" id="pdf-2405.17158" aria-labelledby="pdf-2405.17158">pdf</a>, <a href="https://arxiv.org/html/2405.17158v4" title="View HTML" id="html-2405.17158" aria-labelledby="html-2405.17158" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.17158" title="Other formats" id="oth-2405.17158" aria-labelledby="oth-2405.17158">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PatchScaler: An Efficient Patch-Independent Diffusion Model for Image Super-Resolution </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+H">Hang Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pan,+J">Jinshan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+Q">Qingji Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+K">Kai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+R">Rongxiang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fu,+L">Lean Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+F">Fei Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> While diffusion models significantly improve the perceptual quality of super-resolved images, they usually require a large number of sampling steps, resulting in high computational costs and long inference times. Recent efforts have explored reasonable acceleration schemes by reducing the number of sampling steps. However, these approaches treat all regions of the image equally, overlooking the fact that regions with varying levels of reconstruction difficulty require different sampling steps. To address this limitation, we propose PatchScaler, an efficient patch-independent diffusion pipeline for single image super-resolution. Specifically, PatchScaler introduces a Patch-adaptive Group Sampling (PGS) strategy that groups feature patches by quantifying their reconstruction difficulty and establishes shortcut paths with different sampling configurations for each group. To further optimize the patch-level reconstruction process of PGS, we propose a texture prompt that provides rich texture conditional information to the diffusion model. The texture prompt adaptively retrieves texture priors for the target patch from a common reference texture memory. Extensive experiments show that our PatchScaler achieves superior performance in both quantitative and qualitative evaluations, while significantly speeding up inference. Our code will be available at \url{<a href="https://github.com/yongliuy/PatchScaler" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
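A hypothetical sketch of the patch-adaptive grouping idea: feature patches are scored by an estimated reconstruction difficulty and routed to groups with different numbers of sampling steps; the quantile split and step counts below are placeholders, not the PatchScaler configuration.
<pre>
# Illustrative grouping of patches by difficulty into easy / medium / hard
# groups, each assigned a different sampling budget.
import torch

def group_patches_by_difficulty(difficulty, step_options=(5, 10, 20)):
    # difficulty: (num_patches,) float score per feature patch, higher = harder.
    edges = torch.quantile(difficulty, torch.tensor([1 / 3, 2 / 3]))
    group = torch.bucketize(difficulty, edges)          # 0 = easy, 1 = medium, 2 = hard
    steps = torch.tensor(step_options)[group]           # sampling steps per patch
    return group, steps
</pre>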
</p> </div> </dd> <dt> <a name='item135'>[135]</a> <a href ="/abs/2405.18299" title="Abstract" id="2405.18299"> arXiv:2405.18299 </a> (replaced) [<a href="/pdf/2405.18299" title="Download PDF" id="pdf-2405.18299" aria-labelledby="pdf-2405.18299">pdf</a>, <a href="https://arxiv.org/html/2405.18299v4" title="View HTML" id="html-2405.18299" aria-labelledby="html-2405.18299" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.18299" title="Other formats" id="oth-2405.18299" aria-labelledby="oth-2405.18299">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Deep Learning Innovations for Underwater Waste Detection: An In-Depth Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Walia,+J+S">Jaskaran Singh Walia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=K,+P+L">Pavithra L K</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> Addressing the issue of submerged underwater trash is crucial for safeguarding aquatic ecosystems and preserving marine life. While identifying debris present on the surface of water bodies is straightforward, assessing the underwater submerged waste is a challenge due to the image distortions caused by factors such as light refraction, absorption, suspended particles, color shifts, and occlusion. This paper conducts a comprehensive review of state-of-the-art architectures and on the existing datasets to establish a baseline for submerged waste and trash detection. The primary goal remains to establish the benchmark of the object localization techniques to be leveraged by advanced underwater sensors and autonomous underwater vehicles. The ultimate objective is to explore the underwater environment, to identify, and remove underwater debris. The absence of benchmarks (dataset or algorithm) in many researches emphasizes the need for a more robust algorithmic solution. Through this research, we aim to give performance comparative analysis of various underwater trash detection algorithms. 
</p> </div> </dd> <dt> <a name='item136'>[136]</a> <a href ="/abs/2406.00777" title="Abstract" id="2406.00777"> arXiv:2406.00777 </a> (replaced) [<a href="/pdf/2406.00777" title="Download PDF" id="pdf-2406.00777" aria-labelledby="pdf-2406.00777">pdf</a>, <a href="https://arxiv.org/html/2406.00777v2" title="View HTML" id="html-2406.00777" aria-labelledby="html-2406.00777" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.00777" title="Other formats" id="oth-2406.00777" aria-labelledby="oth-2406.00777">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Diffusion Features to Bridge Domain Gap for Semantic Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ji,+Y">Yuxiang Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+B">Boyong He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qu,+C">Chenyuan Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tan,+Z">Zhuoyue Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qin,+C">Chuan Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+L">Liaoni Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The code is released at <a href="https://github.com/Yux1angJi/DIFF" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Pre-trained diffusion models have demonstrated remarkable proficiency in synthesizing images across a wide range of scenarios with customizable prompts, indicating their effective capacity to capture universal features. Motivated by this, our study delves into the utilization of the implicit knowledge embedded within diffusion models to address challenges in cross-domain semantic segmentation. This paper investigates the approach that leverages the sampling and fusion techniques to harness the features of diffusion models efficiently. We propose DIffusion Feature Fusion (DIFF) as a backbone use for extracting and integrating effective semantic representations through the diffusion process. By leveraging the strength of text-to-image generation capability, we introduce a new training framework designed to implicitly learn posterior knowledge from it. Through rigorous evaluation in the contexts of domain generalization semantic segmentation, we establish that our methodology surpasses preceding approaches in mitigating discrepancies across distinct domains and attains the state-of-the-art (SOTA) benchmark. 
</p> </div> </dd> <dt> <a name='item137'>[137]</a> <a href ="/abs/2406.07472" title="Abstract" id="2406.07472"> arXiv:2406.07472 </a> (replaced) [<a href="/pdf/2406.07472" title="Download PDF" id="pdf-2406.07472" aria-labelledby="pdf-2406.07472">pdf</a>, <a href="/format/2406.07472" title="Other formats" id="oth-2406.07472" aria-labelledby="oth-2406.07472">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 4Real: Towards Photorealistic 4D Scene Generation via Video Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+H">Heng Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Chaoyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhuang,+P">Peiye Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Menapace,+W">Willi Menapace</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Siarohin,+A">Aliaksandr Siarohin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+J">Junli Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jeni,+L+A">Laszlo A Jeni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tulyakov,+S">Sergey Tulyakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+H">Hsin-Ying Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Existing dynamic scene generation methods mostly rely on distilling knowledge from pre-trained 3D generative models, which are typically fine-tuned on synthetic object datasets. As a result, the generated scenes are often object-centric and lack photorealism. To address these limitations, we introduce a novel pipeline designed for photorealistic text-to-4D scene generation, discarding the dependency on multi-view generative models and instead fully utilizing video generative models trained on diverse real-world datasets. Our method begins by generating a reference video using the video generation model. We then learn the canonical 3D representation of the video using a freeze-time video, delicately generated from the reference video. To handle inconsistencies in the freeze-time video, we jointly learn a per-frame deformation to model these imperfections. We then learn the temporal deformation based on the canonical representation to capture dynamic interactions in the reference video. The pipeline facilitates the generation of dynamic scenes with enhanced photorealism and structural integrity, viewable from multiple perspectives, thereby setting a new standard in 4D scene generation. 
</p> </div> </dd> <dt> <a name='item138'>[138]</a> <a href ="/abs/2406.08222" title="Abstract" id="2406.08222"> arXiv:2406.08222 </a> (replaced) [<a href="/pdf/2406.08222" title="Download PDF" id="pdf-2406.08222" aria-labelledby="pdf-2406.08222">pdf</a>, <a href="/format/2406.08222" title="Other formats" id="oth-2406.08222" aria-labelledby="oth-2406.08222">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Sociotechnical Lens for Evaluating Computer Vision Models: A Case Study on Detecting and Reasoning about Gender and Emotion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luo,+S">Sha Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+S+J">Sang Jung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Duan,+Z">Zening Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+K">Kaiping Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> In the evolving landscape of computer vision (CV) technologies, the automatic detection and interpretation of gender and emotion in images is a critical area of study. This paper investigates social biases in CV models, emphasizing the limitations of traditional evaluation metrics such as precision, recall, and accuracy. These metrics often fall short in capturing the complexities of gender and emotion, which are fluid and culturally nuanced constructs. Our study proposes a sociotechnical framework for evaluating CV models, incorporating both technical performance measures and considerations of social fairness. Using a dataset of 5,570 images related to vaccination and climate change, we empirically compared the performance of various CV models, including traditional models like DeepFace and FER, and generative models like GPT-4 Vision. Our analysis involved manually validating the gender and emotional expressions in a subset of images to serve as benchmarks. Our findings reveal that while GPT-4 Vision outperforms other models in technical accuracy for gender classification, it exhibits discriminatory biases, particularly in response to transgender and non-binary personas. Furthermore, the model&#39;s emotion detection skew heavily towards positive emotions, with a notable bias towards associating female images with happiness, especially when prompted by male personas. These findings underscore the necessity of developing more comprehensive evaluation criteria that address both validity and discriminatory biases in CV models. Our proposed framework provides guidelines for researchers to critically assess CV tools, ensuring their application in communication research is both ethical and effective. The significant contribution of this study lies in its emphasis on a sociotechnical approach, advocating for CV technologies that support social good and mitigate biases rather than perpetuate them. 
</p> </div> </dd> <dt> <a name='item139'>[139]</a> <a href ="/abs/2406.08298" title="Abstract" id="2406.08298"> arXiv:2406.08298 </a> (replaced) [<a href="/pdf/2406.08298" title="Download PDF" id="pdf-2406.08298" aria-labelledby="pdf-2406.08298">pdf</a>, <a href="https://arxiv.org/html/2406.08298v5" title="View HTML" id="html-2406.08298" aria-labelledby="html-2406.08298" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.08298" title="Other formats" id="oth-2406.08298" aria-labelledby="oth-2406.08298">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AdaNCA: Neural Cellular Automata As Adaptors For More Robust Vision Transformer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+Y">Yitao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+T">Tong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=S%C3%BCsstrunk,+S">Sabine Süsstrunk</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 pages, 12 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Vision Transformers (ViTs) demonstrate remarkable performance in image classification through visual-token interaction learning, particularly when equipped with local information via region attention or convolutions. Although such architectures improve the feature aggregation from different granularities, they often fail to contribute to the robustness of the networks. Neural Cellular Automata (NCA) enables the modeling of global visual-token representations through local interactions, with its training strategies and architecture design conferring strong generalization ability and robustness against noisy input. In this paper, we propose Adaptor Neural Cellular Automata (AdaNCA) for Vision Transformers that uses NCA as plug-and-play adaptors between ViT layers, thus enhancing ViT&#39;s performance and robustness against adversarial samples as well as out-of-distribution inputs. To overcome the large computational overhead of standard NCAs, we propose Dynamic Interaction for more efficient interaction learning. Using our analysis of AdaNCA placement and robustness improvement, we also develop an algorithm for identifying the most effective insertion points for AdaNCA. With less than a 3% increase in parameters, AdaNCA contributes to more than 10% absolute improvement in accuracy under adversarial attacks on the ImageNet1K benchmark. Moreover, we demonstrate with extensive evaluations across eight robustness benchmarks and four ViT architectures that AdaNCA, as a plug-and-play module, consistently improves the robustness of ViTs. 
</p> </div> </dd> <dt> <a name='item140'>[140]</a> <a href ="/abs/2406.10079" title="Abstract" id="2406.10079"> arXiv:2406.10079 </a> (replaced) [<a href="/pdf/2406.10079" title="Download PDF" id="pdf-2406.10079" aria-labelledby="pdf-2406.10079">pdf</a>, <a href="https://arxiv.org/html/2406.10079v3" title="View HTML" id="html-2406.10079" aria-labelledby="html-2406.10079" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.10079" title="Other formats" id="oth-2406.10079" aria-labelledby="oth-2406.10079">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Localizing Events in Videos with Multimodal Queries </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+G">Gengyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fok,+M+L+A">Mang Ling Ada Fok</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+J">Jialu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xia,+Y">Yan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cremers,+D">Daniel Cremers</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Torr,+P">Philip Torr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tresp,+V">Volker Tresp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gu,+J">Jindong Gu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages (including references and appendix); for the project homepage, see <a href="https://icq-benchmark.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Localizing events in videos based on semantic queries is a pivotal task in video understanding, with the growing significance of user-oriented applications like video search. Yet, current research predominantly relies on natural language queries (NLQs), overlooking the potential of using multimodal queries (MQs) that integrate images to more flexibly represent semantic queries -- especially when it is difficult to express non-verbal or unfamiliar concepts in words. To bridge this gap, we introduce ICQ, a new benchmark designed for localizing events in videos with MQs, alongside an evaluation dataset ICQ-Highlight. To accommodate and evaluate existing video localization models for this new task, we propose 3 Multimodal Query Adaptation methods and a novel Surrogate Fine-tuning on pseudo-MQs strategy. ICQ systematically benchmarks 12 state-of-the-art backbone models, spanning from specialized video localization models to Video LLMs, across diverse application domains. Our experiments highlight the high potential of MQs in real-world applications. We believe this benchmark is a first step toward advancing MQs in video event localization. 
</p> </div> </dd> <dt> <a name='item141'>[141]</a> <a href ="/abs/2407.05771" title="Abstract" id="2407.05771"> arXiv:2407.05771 </a> (replaced) [<a href="/pdf/2407.05771" title="Download PDF" id="pdf-2407.05771" aria-labelledby="pdf-2407.05771">pdf</a>, <a href="https://arxiv.org/html/2407.05771v3" title="View HTML" id="html-2407.05771" aria-labelledby="html-2407.05771" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.05771" title="Other formats" id="oth-2407.05771" aria-labelledby="oth-2407.05771">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-times Monte Carlo Rendering for Inter-reflection Reconstruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+T">Tengjie Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zhuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+J">Jingnan Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+Y">Yichao Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+X">Xiaokang Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 6 figures, Accepted by NeurIPS 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Inverse rendering methods have achieved remarkable performance in reconstructing high-fidelity 3D objects with disentangled geometries, materials, and environmental light. However, they still face huge challenges in reflective surface reconstruction. Although recent methods model the light trace to learn specularity, ignoring indirect illumination makes it hard to handle inter-reflections among multiple smooth objects. In this work, we propose Ref-MC2, which introduces multi-time Monte Carlo sampling to comprehensively compute the environmental illumination while also accounting for the reflective light from object surfaces. To address the computational challenge as the number of Monte Carlo sampling passes grows, we propose a specularity-adaptive sampling strategy, significantly reducing the computational complexity. Beyond the computational cost, higher geometric accuracy is also required because geometric errors accumulate multiple times. Therefore, we further introduce a reflection-aware surface model to initialize the geometry and refine it during inverse rendering. We construct a challenging dataset containing scenes with multiple objects and inter-reflections. Experiments show that our method outperforms other inverse rendering methods on various object groups. We also show downstream applications, e.g., relighting and material editing, to illustrate the disentanglement ability of our method. 
</p> </div> </dd> <dt> <a name='item142'>[142]</a> <a href ="/abs/2407.07315" title="Abstract" id="2407.07315"> arXiv:2407.07315 </a> (replaced) [<a href="/pdf/2407.07315" title="Download PDF" id="pdf-2407.07315" aria-labelledby="pdf-2407.07315">pdf</a>, <a href="https://arxiv.org/html/2407.07315v2" title="View HTML" id="html-2407.07315" aria-labelledby="html-2407.07315" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.07315" title="Other formats" id="oth-2407.07315" aria-labelledby="oth-2407.07315">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CosmoCLIP: Generalizing Large Vision-Language Models for Astronomical Imaging </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Imam,+R">Raza Imam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Alam,+M+T">Mohammed Talha Alam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahman,+U">Umaima Rahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guizani,+M">Mohsen Guizani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Karray,+F">Fakhri Karray</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at SPAICE Conference, ECSAT, UK, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Existing vision-text contrastive learning models enhance representation transferability and support zero-shot prediction by matching paired image and caption embeddings while pushing unrelated pairs apart. However, astronomical image-label datasets are significantly smaller than the general image and label datasets available from the internet. We introduce CosmoCLIP, an astronomical image-text contrastive learning framework precisely fine-tuned on the pre-trained CLIP model using SpaceNet and BLIP-based captions. SpaceNet, attained via FLARE, constitutes ~13k optimally distributed images, while BLIP acts as a rich knowledge extractor. The rich semantics derived from the SpaceNet images and BLIP descriptions, when learned contrastively, enable CosmoCLIP to achieve superior generalization across various in-domain and out-of-domain tasks. Our results demonstrate that CosmoCLIP is a straightforward yet powerful framework, significantly outperforming CLIP in zero-shot classification and image-text retrieval tasks. 
</p> </div> </dd> <dt> <a name='item143'>[143]</a> <a href ="/abs/2407.11424" title="Abstract" id="2407.11424"> arXiv:2407.11424 </a> (replaced) [<a href="/pdf/2407.11424" title="Download PDF" id="pdf-2407.11424" aria-labelledby="pdf-2407.11424">pdf</a>, <a href="https://arxiv.org/html/2407.11424v2" title="View HTML" id="html-2407.11424" aria-labelledby="html-2407.11424" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.11424" title="Other formats" id="oth-2407.11424" aria-labelledby="oth-2407.11424">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Model Inversion Attacks Through Target-Specific Conditional Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+O">Ouxiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hao,+Y">Yanbin Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhicai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+B">Bin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zaixi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+F">Fuli Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Model inversion attacks (MIAs) aim to reconstruct private images from a target classifier&#39;s training set, thereby raising privacy concerns in AI applications. Previous GAN-based MIAs tend to suffer from inferior generative fidelity due to GAN&#39;s inherent flaws and biased optimization within latent space. To alleviate these issues, leveraging diffusion models&#39; remarkable synthesis capabilities, we propose Diffusion-based Model Inversion (Diff-MI) attacks. Specifically, we introduce a novel target-specific conditional diffusion model (CDM) to purposely approximate the target classifier&#39;s private distribution and achieve a superior accuracy-fidelity balance. Our method involves a two-step learning paradigm. Step-1 incorporates the target classifier into the entire CDM learning in a pretrain-then-finetune fashion, creating pseudo-labels as model conditions in pretraining and adjusting specified layers with image predictions in fine-tuning. Step-2 presents an iterative image reconstruction method, further enhancing the attack performance through a combination of diffusion priors and target knowledge. Additionally, we propose an improved max-margin loss that replaces the hard max with top-k maxes, fully leveraging feature information and soft labels from the target classifier. Extensive experiments demonstrate that Diff-MI significantly improves generative fidelity with an average decrease of 20\% in FID while maintaining competitive attack accuracy compared to state-of-the-art methods across various datasets and models. Our code is available at: \url{<a href="https://github.com/Ouxiang-Li/Diff-MI" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
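As a rough, generic illustration of replacing the hard max with top-k maxes in a margin-style loss (a sketch under assumed tensor shapes; the names and the exact form are placeholders, not the loss used by Diff-MI):
<pre><code class="language-python">
import torch

def topk_margin_loss(logits, target, k=5):
    # Generic top-k variant of a max-margin loss: instead of penalizing only the
    # single largest non-target logit, average over the k largest ones so the
    # gradient draws on more class information. Illustrative sketch only.
    batch = torch.arange(logits.size(0), device=logits.device)
    target_logit = logits[batch, target]              # logit of the intended class
    competitors = logits.clone()
    competitors[batch, target] = float("-inf")        # exclude the target class itself
    topk_vals, _ = competitors.topk(k, dim=1)         # k strongest competing logits
    # push the target logit above each of its top-k competitors
    return (topk_vals - target_logit.unsqueeze(1)).mean()
</code></pre>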
</p> </div> </dd> <dt> <a name='item144'>[144]</a> <a href ="/abs/2407.17438" title="Abstract" id="2407.17438"> arXiv:2407.17438 </a> (replaced) [<a href="/pdf/2407.17438" title="Download PDF" id="pdf-2407.17438" aria-labelledby="pdf-2407.17438">pdf</a>, <a href="https://arxiv.org/html/2407.17438v3" title="View HTML" id="html-2407.17438" aria-labelledby="html-2407.17438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.17438" title="Other formats" id="oth-2407.17438" aria-labelledby="oth-2407.17438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HumanVid: Demystifying Training Data for Camera-controllable Human Image Animation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhenzhi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yixuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zeng,+Y">Yanhong Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fang,+Y">Youqing Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+Y">Yuwei Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+W">Wenran Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tan,+J">Jing Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+K">Kai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+T">Tianfan Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dai,+B">Bo Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+D">Dahua Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NeurIPS D&amp;B Track 2024 camera ready version, TL;DR: the first large-scale dataset for camera controllable human image animation task, and a baseline method </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Human image animation involves generating videos from a character photo, allowing user control and unlocking the potential for video and movie production. While recent approaches yield impressive results using high-quality training data, the inaccessibility of these datasets hampers fair and transparent benchmarking. Moreover, these approaches prioritize 2D human motion and overlook the significance of camera motions in videos, leading to limited control and unstable video generation. To demystify the training data, we present HumanVid, the first large-scale high-quality dataset tailored for human image animation, which combines crafted real-world and synthetic data. For the real-world data, we compile a vast collection of real-world videos from the internet. We developed and applied careful filtering rules to ensure video quality, resulting in a curated collection of 20K high-resolution (1080P) human-centric videos. Human and camera motion annotation is accomplished using a 2D pose estimator and a SLAM-based method. To expand our synthetic dataset, we collected 10K 3D avatar assets and leveraged existing assets of body shapes, skin textures and clothings. 
Notably, we introduce a rule-based camera trajectory generation method, enabling the synthetic pipeline to incorporate diverse and precise camera motion annotation, which can rarely be found in real-world data. To verify the effectiveness of HumanVid, we establish a baseline model named CamAnimate, short for Camera-controllable Human Animation, that considers both human and camera motions as conditions. Through extensive experimentation, we demonstrate that such simple baseline training on our HumanVid achieves state-of-the-art performance in controlling both human pose and camera motions, setting a new benchmark. Demo, data and code could be found in the project website: <a href="https://humanvid.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item145'>[145]</a> <a href ="/abs/2408.00754" title="Abstract" id="2408.00754"> arXiv:2408.00754 </a> (replaced) [<a href="/pdf/2408.00754" title="Download PDF" id="pdf-2408.00754" aria-labelledby="pdf-2408.00754">pdf</a>, <a href="https://arxiv.org/html/2408.00754v2" title="View HTML" id="html-2408.00754" aria-labelledby="html-2408.00754" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.00754" title="Other formats" id="oth-2408.00754" aria-labelledby="oth-2408.00754">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Coarse Correspondences Boost Spatial-Temporal Reasoning in Multimodal Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+B">Benlin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dong,+Y">Yuhao Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yiqin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+Z">Zixian Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tang,+Y">Yansong Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tang,+L">Luming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rao,+Y">Yongming Rao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+W">Wei-Chiu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Krishna,+R">Ranjay Krishna</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> project page: <a href="https://coarse-correspondence.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Multimodal language models (MLLMs) are increasingly being applied in real-world environments, necessitating their ability to interpret 3D spaces and comprehend temporal dynamics. Current methods often rely on specialized architectural designs or task-specific fine-tuning to achieve this. We introduce Coarse Correspondences, a simple lightweight method that enhances MLLMs&#39; spatial-temporal reasoning with 2D images as input, without modifying the architecture or requiring task-specific fine-tuning. Our method uses a lightweight tracking model to identify primary object correspondences between frames in a video or across different image viewpoints, and then conveys this information to MLLMs through visual prompting. 
We demonstrate that this simple training-free approach brings substantial gains to GPT4-V/O consistently on four benchmarks that require spatial-temporal reasoning, including +20.5\% improvement on ScanQA, +9.7\% on OpenEQA&#39;s episodic memory subset, +6.0\% on the long-form video benchmark EgoSchema, and +11\% on the R2R navigation benchmark. Additionally, we show that Coarse Correspondences can also enhance open-source MLLMs&#39; spatial reasoning (by +6.9\% on ScanQA) when applied in both training and inference and that the improvement can generalize to unseen datasets such as SQA3D (+3.1\%). Taken together, we show that Coarse Correspondences effectively and efficiently boosts models&#39; performance on downstream tasks requiring spatial-temporal reasoning. </p> </div> </dd> <dt> <a name='item146'>[146]</a> <a href ="/abs/2408.02555" title="Abstract" id="2408.02555"> arXiv:2408.02555 </a> (replaced) [<a href="/pdf/2408.02555" title="Download PDF" id="pdf-2408.02555" aria-labelledby="pdf-2408.02555">pdf</a>, <a href="https://arxiv.org/html/2408.02555v2" title="View HTML" id="html-2408.02555" aria-labelledby="html-2408.02555" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.02555" title="Other formats" id="oth-2408.02555" aria-labelledby="oth-2408.02555">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MeshAnything V2: Artist-Created Mesh Generation With Adjacent Mesh Tokenization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yiwen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yikai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luo,+Y">Yihao Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhengyi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zilong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+J">Jun Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+C">Chi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+G">Guosheng Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Page: <a href="https://buaacyw.github.io/meshanything-v2/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> Github: <a href="https://github.com/buaacyw/MeshAnythingV2" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Graphics (cs.GR) </div> <p class='mathjax'> Meshes are the de facto 3D representation in the industry but are labor-intensive to produce. Recently, a line of research has focused on autoregressively generating meshes. This approach processes meshes into a sequence composed of vertices and then generates them vertex by vertex, similar to how a language model generates text. These methods have achieved some success but still struggle to generate complex meshes. One primary reason for this limitation is their inefficient tokenization methods. To address this issue, we introduce MeshAnything V2, an advanced mesh generation model designed to create Artist-Created Meshes that align precisely with specified shapes. 
A key innovation behind MeshAnything V2 is our novel Adjacent Mesh Tokenization (AMT) method. Unlike traditional approaches that represent each face using three vertices, AMT optimizes this by employing a single vertex wherever feasible, effectively reducing the token sequence length by about half on average. This not only streamlines the tokenization process but also results in more compact and well-structured sequences, enhancing the efficiency of mesh generation. With these improvements, MeshAnything V2 effectively doubles the face limit compared to previous models, delivering superior performance without increasing computational costs. We will make our code and models publicly available. Project Page: <a href="https://buaacyw.github.io/meshanything-v2/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item147'>[147]</a> <a href ="/abs/2408.10072" title="Abstract" id="2408.10072"> arXiv:2408.10072 </a> (replaced) [<a href="/pdf/2408.10072" title="Download PDF" id="pdf-2408.10072" aria-labelledby="pdf-2408.10072">pdf</a>, <a href="https://arxiv.org/html/2408.10072v2" title="View HTML" id="html-2408.10072" aria-labelledby="html-2408.10072" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.10072" title="Other formats" id="oth-2408.10072" aria-labelledby="oth-2408.10072">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FFAA: Multimodal Large Language Model based Explainable Open-World Face Forgery Analysis Assistant </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+Z">Zhengchao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xia,+B">Bin Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Z">Zicheng Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mou,+Z">Zhun Mou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+W">Wenming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+J">Jiaya Jia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 21 figures; project page: <a href="https://ffaa-vl.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The rapid advancement of deepfake technologies has sparked widespread public concern, particularly as face forgery poses a serious threat to public information security. However, the unknown and diverse forgery techniques, varied facial features and complex environmental factors pose significant challenges for face forgery analysis. Existing datasets lack descriptive annotations of these aspects, making it difficult for models to distinguish between real and forged faces using only visual information amid various confounding factors. In addition, existing methods fail to yield user-friendly and explainable results, hindering the understanding of the model&#39;s decision-making process. To address these challenges, we introduce a novel Open-World Face Forgery Analysis VQA (OW-FFA-VQA) task and its corresponding benchmark. 
To tackle this task, we first establish a dataset featuring a diverse collection of real and forged face images with essential descriptions and reliable forgery reasoning. Based on this dataset, we introduce FFAA: Face Forgery Analysis Assistant, consisting of a fine-tuned Multimodal Large Language Model (MLLM) and a Multi-answer Intelligent Decision System (MIDS). By integrating hypothetical prompts with MIDS, the impact of fuzzy classification boundaries is effectively mitigated, enhancing model robustness. Extensive experiments demonstrate that our method not only provides user-friendly and explainable results but also significantly boosts accuracy and robustness compared to previous methods. </p> </div> </dd> <dt> <a name='item148'>[148]</a> <a href ="/abs/2408.16266" title="Abstract" id="2408.16266"> arXiv:2408.16266 </a> (replaced) [<a href="/pdf/2408.16266" title="Download PDF" id="pdf-2408.16266" aria-labelledby="pdf-2408.16266">pdf</a>, <a href="https://arxiv.org/html/2408.16266v2" title="View HTML" id="html-2408.16266" aria-labelledby="html-2408.16266" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.16266" title="Other formats" id="oth-2408.16266" aria-labelledby="oth-2408.16266">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Inversion Circle Interpolation: Diffusion-based Image Augmentation for Data-scarce Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yanghao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+L">Long Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Data Augmentation (DA), i.e., synthesizing faithful and diverse samples to expand the original training set, is a prevalent and effective strategy to improve the performance of various data-scarce tasks. With the powerful image generation ability, diffusion-based DA has shown strong performance gains on different image classification benchmarks. In this paper, we analyze today&#39;s diffusion-based DA methods and argue that they cannot account for both faithfulness and diversity, two critical factors for generating high-quality samples and boosting classification performance. To this end, we propose a novel Diffusion-based DA method: Diff-II. Specifically, it consists of three steps: 1) Category concepts learning: Learning concept embeddings for each category. 2) Inversion interpolation: Calculating the inversion for each image, and conducting circle interpolation for two randomly sampled inversions from the same category. 3) Two-stage denoising: Using different prompts to generate synthesized images in a coarse-to-fine manner. Extensive experiments on various data-scarce image classification tasks (e.g., few-shot, long-tailed, and out-of-distribution classification) have demonstrated its effectiveness over state-of-the-art diffusion-based DA methods. 
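As a generic sketch of what interpolating along an arc between two inversion latents can look like (standard spherical interpolation; the exact circle interpolation used by Diff-II may be parameterized differently, and the function name is a placeholder):
<pre><code class="language-python">
import torch

def slerp(z0, z1, alpha):
    # Spherical ("circle") interpolation between two inversion latents z0 and z1.
    # alpha in [0, 1] moves along the arc joining them; illustrative sketch only.
    a, b = z0.flatten(), z1.flatten()
    cos_theta = torch.dot(a, b) / (a.norm() * b.norm())
    theta = torch.acos(cos_theta.clamp(-1.0, 1.0))    # angle between the two latents
    sin_theta = torch.sin(theta).clamp_min(1e-8)      # guard against identical latents
    w0 = torch.sin((1.0 - alpha) * theta) / sin_theta
    w1 = torch.sin(alpha * theta) / sin_theta
    return w0 * z0 + w1 * z1                          # stays on the arc between z0 and z1
</code></pre>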
</p> </div> </dd> <dt> <a name='item149'>[149]</a> <a href ="/abs/2409.11340" title="Abstract" id="2409.11340"> arXiv:2409.11340 </a> (replaced) [<a href="/pdf/2409.11340" title="Download PDF" id="pdf-2409.11340" aria-labelledby="pdf-2409.11340">pdf</a>, <a href="https://arxiv.org/html/2409.11340v2" title="View HTML" id="html-2409.11340" aria-labelledby="html-2409.11340" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.11340" title="Other formats" id="oth-2409.11340" aria-labelledby="oth-2409.11340">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OmniGen: Unified Image Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiao,+S">Shitao Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Y">Yueze Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+J">Junjie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+H">Huaying Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xing,+X">Xingrun Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+R">Ruiran Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+C">Chaofan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shuting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+T">Tiejun Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zheng Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Update the paper for OmniGen-v1 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The emergence of Large Language Models (LLMs) has unified language generation tasks and revolutionized human-machine interaction. However, in the realm of image generation, a unified model capable of handling various tasks within a single framework remains largely unexplored. In this work, we introduce OmniGen, a new diffusion model for unified image generation. OmniGen is characterized by the following features: 1) Unification: OmniGen not only demonstrates text-to-image generation capabilities but also inherently supports various downstream tasks, such as image editing, subject-driven generation, and visual-conditional generation. 2) Simplicity: The architecture of OmniGen is highly simplified, eliminating the need for additional plugins. Moreover, compared to existing diffusion models, it is more user-friendly and can complete complex tasks end-to-end through instructions without the need for extra intermediate steps, greatly simplifying the image generation workflow. 3) Knowledge Transfer: Benefiting from learning in a unified format, OmniGen effectively transfers knowledge across different tasks, manages unseen tasks and domains, and exhibits novel capabilities. We also explore the model&#39;s reasoning capabilities and potential applications of the chain-of-thought mechanism. This work represents the first attempt at a general-purpose image generation model, and we will release our resources at <a href="https://github.com/VectorSpaceLab/OmniGen" rel="external noopener nofollow" class="link-external link-https">this https URL</a> to foster future advancements. 
</p> </div> </dd> <dt> <a name='item150'>[150]</a> <a href ="/abs/2409.13978" title="Abstract" id="2409.13978"> arXiv:2409.13978 </a> (replaced) [<a href="/pdf/2409.13978" title="Download PDF" id="pdf-2409.13978" aria-labelledby="pdf-2409.13978">pdf</a>, <a href="https://arxiv.org/html/2409.13978v3" title="View HTML" id="html-2409.13978" aria-labelledby="html-2409.13978" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13978" title="Other formats" id="oth-2409.13978" aria-labelledby="oth-2409.13978">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FracGM: A Fast Fractional Programming Technique for Geman-McClure Robust Estimator </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+B">Bang-Shien Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Y">Yu-Kai Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+J">Jian-Yu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+C">Chih-Wei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chern,+J">Jann-Long Chern</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+C">Ching-Cherng Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 6 figures </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE Robotics and Automation Letters, 9(12), 11666-11673, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO); Optimization and Control (math.OC) </div> <p class='mathjax'> Robust estimation is essential in computer vision, robotics, and navigation, aiming to minimize the impact of outlier measurements for improved accuracy. We present a fast algorithm for Geman-McClure robust estimation, FracGM, leveraging fractional programming techniques. This solver reformulates the original non-convex fractional problem to a convex dual problem and a linear equation system, iteratively solving them in an alternating optimization pattern. Compared to graduated non-convexity approaches, this strategy exhibits a faster convergence rate and better outlier rejection capability. In addition, the global optimality of the proposed solver can be guaranteed under given conditions. We demonstrate the proposed FracGM solver with Wahba&#39;s rotation problem and 3-D point-cloud registration along with relaxation pre-processing and projection post-processing. Compared to state-of-the-art algorithms, when the outlier rates increase from 20% to 80%, FracGM shows 53% and 88% lower rotation and translation increases. In real-world scenarios, FracGM achieves better results in 13 out of 18 outcomes, while having a 19.43% improvement in the computation time. 
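For reference, the Geman-McClure cost that FracGM targets induces the familiar outlier down-weighting sketched below; note that FracGM itself solves the problem through a fractional-programming reformulation (convex dual plus a linear system), not the plain reweighting shown here, and the scale parameter c is a placeholder:
<pre><code class="language-python">
import numpy as np

def geman_mcclure_weights(residuals, c=1.0):
    # Per-residual weights induced by the Geman-McClure cost
    # rho(r) = c^2 * r^2 / (c^2 + r^2): large residuals (outliers) receive
    # rapidly vanishing weight. Illustration only; FracGM minimizes this cost
    # via fractional programming rather than this IRLS-style weighting.
    r2 = np.square(residuals)
    return c ** 4 / np.square(c ** 2 + r2)
</code></pre>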
</p> </div> </dd> <dt> <a name='item151'>[151]</a> <a href ="/abs/2410.02592" title="Abstract" id="2410.02592"> arXiv:2410.02592 </a> (replaced) [<a href="/pdf/2410.02592" title="Download PDF" id="pdf-2410.02592" aria-labelledby="pdf-2410.02592">pdf</a>, <a href="https://arxiv.org/html/2410.02592v4" title="View HTML" id="html-2410.02592" aria-labelledby="html-2410.02592" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02592" title="Other formats" id="oth-2410.02592" aria-labelledby="oth-2410.02592">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of Both Driver and Passengers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fang,+Z">Zihan Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Z">Zheng Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+S">Senkang Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+H">Hangcheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deng,+Y">Yiqin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xianhao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fang,+Y">Yuguang Fang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 17 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Systems and Control (eess.SY) </div> <p class='mathjax'> Recently, in-car monitoring has emerged as a promising technology for detecting early-stage abnormal status of the driver and providing timely alerts to prevent traffic accidents. Although training models with multimodal data enhances the reliability of abnormal status detection, the scarcity of labeled data and the imbalance of class distribution impede the extraction of critical abnormal state features, significantly deteriorating training performance. Furthermore, missing modalities due to environment and hardware limitations further exacerbate the challenge of abnormal status identification. More importantly, monitoring abnormal health conditions of passengers, particularly in elderly care, is of paramount importance but remains underexplored. To address these challenges, we introduce our IC3M, an efficient camera-rotation-based multimodal framework for monitoring both driver and passengers in a car. Our IC3M comprises two key modules: an adaptive threshold pseudo-labeling strategy and a missing modality reconstruction. The former customizes pseudo-labeling thresholds for different classes based on the class distribution, generating class-balanced pseudo labels to guide model training effectively, while the latter leverages crossmodality relationships learned from limited labels to accurately recover missing modalities by distribution transferring from available modalities. Extensive experimental results demonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy, precision, and recall while exhibiting superior robustness under limited labeled data and severe missing modality. 
</p> </div> </dd> <dt> <a name='item152'>[152]</a> <a href ="/abs/2410.07753" title="Abstract" id="2410.07753"> arXiv:2410.07753 </a> (replaced) [<a href="/pdf/2410.07753" title="Download PDF" id="pdf-2410.07753" aria-labelledby="pdf-2410.07753">pdf</a>, <a href="https://arxiv.org/html/2410.07753v2" title="View HTML" id="html-2410.07753" aria-labelledby="html-2410.07753" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.07753" title="Other formats" id="oth-2410.07753" aria-labelledby="oth-2410.07753">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Data Augmentation for Surgical Scene Segmentation with Anatomy-Aware Diffusion Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Venkatesh,+D+K">Danush Kumar Venkatesh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rivoir,+D">Dominik Rivoir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pfeiffer,+M">Micha Pfeiffer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kolbinger,+F">Fiona Kolbinger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Speidel,+S">Stefanie Speidel</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at WACV 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> In computer-assisted surgery, automatically recognizing anatomical organs is crucial for understanding the surgical scene and providing intraoperative assistance. While machine learning models can identify such structures, their deployment is hindered by the need for labeled, diverse surgical datasets with anatomical annotations. Labeling multiple classes (i.e., organs) in a surgical scene is time-intensive, requiring medical experts. Although synthetically generated images can enhance segmentation performance, maintaining both organ structure and texture during generation is challenging. We introduce a multi-stage approach using diffusion models to generate multi-class surgical datasets with annotations. Our framework improves anatomy awareness by training organ specific models with an inpainting objective guided by binary segmentation masks. The organs are generated with an inference pipeline using pre-trained ControlNet to maintain the organ structure. The synthetic multi-class datasets are constructed through an image composition step, ensuring structural and textural consistency. This versatile approach allows the generation of multi-class datasets from real binary datasets and simulated surgical masks. We thoroughly evaluate the generated datasets on image quality and downstream segmentation, achieving a $15\%$ improvement in segmentation scores when combined with real images. 
The code is available at <a href="https://gitlab.com/nct_tso_public/muli-class-image-synthesis" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item153'>[153]</a> <a href ="/abs/2410.09747" title="Abstract" id="2410.09747"> arXiv:2410.09747 </a> (replaced) [<a href="/pdf/2410.09747" title="Download PDF" id="pdf-2410.09747" aria-labelledby="pdf-2410.09747">pdf</a>, <a href="https://arxiv.org/html/2410.09747v3" title="View HTML" id="html-2410.09747" aria-labelledby="html-2410.09747" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.09747" title="Other formats" id="oth-2410.09747" aria-labelledby="oth-2410.09747">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> t-READi: Transformer-Powered Robust and Efficient Multimodal Inference for Autonomous Driving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+P">Pengfei Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qian,+Y">Yuhang Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+T">Tianyue Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+A">Ang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zhe Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Y">Yue Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cheng,+X">Xiuzhen Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luo,+J">Jun Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 16 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Distributed, Parallel, and Cluster Computing (cs.DC); Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> Given the wide adoption of multimodal sensors (e.g., camera, lidar, radar) by autonomous vehicles (AVs), deep analytics to fuse their outputs for robust perception become imperative. However, existing fusion methods often make two assumptions rarely holding in practice: i) similar data distributions for all inputs and ii) constant availability for all sensors. For example, because lidars have various resolutions and radars may fail, such variability often results in significant performance degradation in fusion. To this end, we present t-READi, an adaptive inference system that accommodates the variability of multimodal sensory data and thus enables robust and efficient perception. t-READi identifies variation-sensitive yet structure-specific model parameters; it then adapts only these parameters while keeping the rest intact. t-READi also leverages a cross-modality contrastive learning method to compensate for the loss from missing modalities. Both functions are implemented to maintain compatibility with existing multimodal deep fusion methods. Extensive experiments demonstrate that, compared with status quo approaches, t-READi not only improves the average inference accuracy by more than 6% but also reduces the inference latency by almost 15x, at the cost of only 5% extra memory overhead in the worst case under realistic data and modal variations. 
</p> </div> </dd> <dt> <a name='item154'>[154]</a> <a href ="/abs/2410.14729" title="Abstract" id="2410.14729"> arXiv:2410.14729 </a> (replaced) [<a href="/pdf/2410.14729" title="Download PDF" id="pdf-2410.14729" aria-labelledby="pdf-2410.14729">pdf</a>, <a href="https://arxiv.org/html/2410.14729v2" title="View HTML" id="html-2410.14729" aria-labelledby="html-2410.14729" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14729" title="Other formats" id="oth-2410.14729" aria-labelledby="oth-2410.14729">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Less More? Exploring Token Condensation as Training-free Adaptation for CLIP </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zixin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+D">Dong Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Sen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+Z">Zi Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Luo,+Y">Yadan Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Contrastive language-image pre-training (CLIP) has shown remarkable generalization ability in image classification. However, CLIP sometimes encounters performance drops on downstream datasets during zero-shot inference. Test-time adaptation methods attempt to mitigate this by adjusting normalization layers or tuning context prompts with large batch sizes and extensive augmentations; yet, these methods are computationally intensive. This raises an important question: Is there a training-free approach that can efficiently address CLIP&#39;s performance drop in such cases? To explore this, we benchmark token condensation techniques, originally designed to enhance the efficiency of vision transformers, on CLIP zero-shot inference tasks. We observe that although token condensation may compromise in-domain accuracy, it surprisingly enhances CLIP&#39;s performance on certain cross-dataset benchmarks. This motivates two key inquiries: (1) Can token condensation serve as a &#34;free-lunch&#34; solution for CLIP zero-shot inference? (2) What criteria should guide condensation -- how can essential tokens be identified and redundant ones eliminated? To address these questions, we propose Token Condensation as Adaptation (TCA), a training-free adaptation method for CLIP by pruning class-irrelevant visual tokens while merging class-ambiguous tokens. As the first approach for CLIP&#39;s token efficiency, TCA demonstrates superior performance across cross-dataset tasks, achieving up to a 21.4\% improvement over the strongest baseline while reducing GFLOPs by 12.2\% to 48.9\%, with minimized hyperparameter dependency. 
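A minimal, generic sketch of training-free visual-token pruning for CLIP (scoring tokens by their best affinity to the text class embeddings and keeping the strongest fraction); the actual pruning and merging criteria in TCA are more involved, and the names below are placeholders:
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def prune_visual_tokens(tokens, class_text_embeds, keep_ratio=0.5):
    # Generic training-free pruning sketch: score each visual token by its best
    # cosine similarity to the text class embeddings and keep the top fraction.
    # Illustration only; TCA additionally merges class-ambiguous tokens.
    tok = F.normalize(tokens, dim=-1)                 # (N, D) visual tokens
    txt = F.normalize(class_text_embeds, dim=-1)      # (C, D) class embeddings
    scores = (tok @ txt.t()).max(dim=-1).values       # best class affinity per token
    keep = max(1, int(keep_ratio * tokens.size(0)))
    idx = scores.topk(keep).indices
    return tokens[idx]
</code></pre>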
</p> </div> </dd> <dt> <a name='item155'>[155]</a> <a href ="/abs/2410.16162" title="Abstract" id="2410.16162"> arXiv:2410.16162 </a> (replaced) [<a href="/pdf/2410.16162" title="Download PDF" id="pdf-2410.16162" aria-labelledby="pdf-2410.16162">pdf</a>, <a href="/format/2410.16162" title="Other formats" id="oth-2410.16162" aria-labelledby="oth-2410.16162">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sparkle: Mastering Basic Spatial Capabilities in Vision Language Models Elicits Generalization to Composite Spatial Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tang,+Y">Yihong Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qu,+A">Ao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zhaokai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhuang,+D">Dingyi Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Z">Zhaofeng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+W">Wei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+S">Shenhao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Y">Yunhan Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Z">Zhan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+J">Jinhua Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Vision language models (VLMs) have demonstrated impressive performance across a wide range of downstream tasks. However, their proficiency in spatial reasoning remains limited, despite its crucial role in tasks involving navigation and interaction with physical environments. Specifically, most of these tasks rely on the core spatial reasoning capabilities in two-dimensional (2D) environments, and our evaluation reveals that state-of-the-art VLMs frequently generate implausible and incorrect responses to composite spatial reasoning problems, including simple pathfinding tasks that humans can solve effortlessly at a glance. To address this, we explore an effective approach to enhance 2D spatial reasoning within VLMs by training the model solely on basic spatial capabilities. We begin by disentangling the key components of 2D spatial reasoning: direction comprehension, distance estimation, and localization. Our central hypothesis is that mastering these basic spatial capabilities can significantly enhance a model&#39;s performance on composite spatial tasks requiring advanced spatial understanding and combinatorial problem-solving, with generalized improvements in visual-spatial tasks. To investigate this hypothesis, we introduce Sparkle, a framework that fine-tunes VLMs on these three basic spatial capabilities by synthetic data generation and targeted supervision to form an instruction dataset for each capability. Our experiments demonstrate that VLMs fine-tuned with Sparkle achieve significant performance gains, not only in the basic tasks themselves but also in generalizing to composite and out-of-distribution spatial reasoning tasks. 
These findings underscore the effectiveness of mastering basic spatial capabilities in enhancing composite spatial problem-solving, offering insights into systematic strategies for improving VLMs&#39; spatial reasoning capabilities. </p> </div> </dd> <dt> <a name='item156'>[156]</a> <a href ="/abs/2410.24160" title="Abstract" id="2410.24160"> arXiv:2410.24160 </a> (replaced) [<a href="/pdf/2410.24160" title="Download PDF" id="pdf-2410.24160" aria-labelledby="pdf-2410.24160">pdf</a>, <a href="https://arxiv.org/html/2410.24160v2" title="View HTML" id="html-2410.24160" aria-labelledby="html-2410.24160" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.24160" title="Other formats" id="oth-2410.24160" aria-labelledby="oth-2410.24160">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Redefining &lt;Creative&gt; in Dictionary: Towards an Enhanced Semantic Understanding of Creative Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+F">Fu Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+Y">Yucheng Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+X">Xu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+J">Jing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Geng,+X">Xin Geng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> ``Creative&#39;&#39; remains an inherently abstract concept for both humans and diffusion models. While text-to-image (T2I) diffusion models can easily generate out-of-domain concepts like ``a blue banana&#39;&#39;, they struggle with generating combinatorial objects such as ``a creative mixture that resembles a lettuce and a mantis&#39;&#39;, due to difficulties in understanding the semantic depth of ``creative&#39;&#39;. Current methods rely heavily on synthesizing reference prompts or images to achieve a creative effect, typically requiring retraining for each unique creative output -- a process that is computationally intensive and limits practical applications. To address this, we introduce CreTok, which brings meta-creativity to diffusion models by redefining ``creative&#39;&#39; as a new token, \texttt{&lt;CreTok&gt;}, thus enhancing models&#39; semantic understanding for combinatorial creativity. CreTok achieves such redefinition by iteratively sampling diverse text pairs from our proposed CangJie dataset to form adaptive prompts and restrictive prompts, and then optimizing the similarity between their respective text embeddings. Extensive experiments demonstrate that \texttt{&lt;CreTok&gt;} enables the universal and direct generation of combinatorial creativity across diverse concepts without additional training (4s vs. BASS&#39;s 2400s per image), achieving state-of-the-art performance with improved text-image alignment ($\uparrow$0.03 in VQAScore) and higher human preference ratings ($\uparrow$0.009 in PickScore and $\uparrow$0.169 in ImageReward). Further evaluations with GPT-4o and user studies underscore CreTok&#39;s strengths in advancing creative generation. 
</p> </div> </dd> <dt> <a name='item157'>[157]</a> <a href ="/abs/2411.03795" title="Abstract" id="2411.03795"> arXiv:2411.03795 </a> (replaced) [<a href="/pdf/2411.03795" title="Download PDF" id="pdf-2411.03795" aria-labelledby="pdf-2411.03795">pdf</a>, <a href="https://arxiv.org/html/2411.03795v2" title="View HTML" id="html-2411.03795" aria-labelledby="html-2411.03795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.03795" title="Other formats" id="oth-2411.03795" aria-labelledby="oth-2411.03795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VQA$^2$: Visual Question Answering for Video Quality Assessment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+Z">Ziheng Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zicheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qian,+J">Jiaying Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+H">Haoning Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+W">Wei Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+C">Chunyi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+X">Xiaohong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+W">Weisi Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhai,+G">Guangtao Zhai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Min,+X">Xiongkuo Min</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages, 12 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The advent and proliferation of large multi-modal models (LMMs) have introduced new paradigms to computer vision, transforming various tasks into a unified visual question answering framework. Video Quality Assessment (VQA), a classic field in low-level visual perception, focused initially on quantitative video quality scoring. However, driven by advances in LMMs, it is now progressing toward more holistic visual quality understanding tasks. Recent studies in the image domain have demonstrated that Visual Question Answering (VQA) can markedly enhance low-level visual quality evaluation. Nevertheless, such methods have not yet been explored in the video domain, leaving substantial room for improvement. To address this gap, we introduce the VQA2 Instruction Dataset - the first visual question answering instruction dataset that focuses on video quality assessment. This dataset consists of 3 subsets and covers various video types, containing 157,755 instruction question-answer pairs. Then, leveraging this foundation, we present the VQA2 series models. The VQA2 series models interleave visual and motion tokens to enhance the perception of spatial-temporal quality details in videos. We conduct extensive experiments on video quality scoring and understanding tasks, and results demonstrate that the VQA2 series models achieve excellent performance in both tasks. Notably, our final model, the VQA2-Assistant, exceeds the renowned GPT-4o in visual quality understanding tasks while maintaining strong competitiveness in quality scoring tasks. 
Our work provides a foundation and feasible approach for integrating low-level video quality assessment and understanding with LMMs. </p> </div> </dd> <dt> <a name='item158'>[158]</a> <a href ="/abs/2411.09955" title="Abstract" id="2411.09955"> arXiv:2411.09955 </a> (replaced) [<a href="/pdf/2411.09955" title="Download PDF" id="pdf-2411.09955" aria-labelledby="pdf-2411.09955">pdf</a>, <a href="https://arxiv.org/html/2411.09955v2" title="View HTML" id="html-2411.09955" aria-labelledby="html-2411.09955" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.09955" title="Other formats" id="oth-2411.09955" aria-labelledby="oth-2411.09955">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Instruction-Guided Editing Controls for Images and Multimedia: A Survey in LLM era </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+T+T">Thanh Tam Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ren,+Z">Zhao Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pham,+T">Trinh Pham</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huynh,+T+T">Thanh Trung Huynh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+P+L">Phi Le Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yin,+H">Hongzhi Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nguyen,+Q+V+H">Quoc Viet Hung Nguyen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Fixed a serious error in author information </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG); Multimedia (cs.MM) </div> <p class='mathjax'> The rapid advancement of large language models (LLMs) and multimodal learning has transformed digital content creation and manipulation. Traditional visual editing tools require significant expertise, limiting accessibility. Recent strides in instruction-based editing have enabled intuitive interaction with visual content, using natural language as a bridge between user intent and complex editing operations. This survey provides an overview of these techniques, focusing on how LLMs and multimodal models empower users to achieve precise visual modifications without deep technical knowledge. By synthesizing over 100 publications, we explore methods from generative adversarial networks to diffusion models, examining multimodal integration for fine-grained content control. We discuss practical applications across domains such as fashion, 3D scene manipulation, and video synthesis, highlighting increased accessibility and alignment with human intuition. Our survey compares existing literature, emphasizing LLM-empowered editing, and identifies key challenges to stimulate further research. We aim to democratize powerful visual editing across various industries, from entertainment to education. Interested readers are encouraged to access our repository at <a href="https://github.com/tamlhp/awesome-instruction-editing" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item159'>[159]</a> <a href ="/abs/2411.10028" title="Abstract" id="2411.10028"> arXiv:2411.10028 </a> (replaced) [<a href="/pdf/2411.10028" title="Download PDF" id="pdf-2411.10028" aria-labelledby="pdf-2411.10028">pdf</a>, <a href="https://arxiv.org/html/2411.10028v2" title="View HTML" id="html-2411.10028" aria-labelledby="html-2411.10028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.10028" title="Other formats" id="oth-2411.10028" aria-labelledby="oth-2411.10028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MOT FCG++: Enhanced Representation of Spatio-temporal Motion and Appearance Features </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fang,+Y">Yanzhao Fang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The goal of multi-object tracking (MOT) is to detect and track all objects in a scene across frames, while maintaining a unique identity for each object. Most existing methods rely on the spatial-temporal motion features and appearance embedding features of the detected objects in consecutive frames. Effectively and robustly representing the spatial and appearance features of long trajectories has become a critical factor affecting the performance of MOT. We propose a novel approach for appearance and spatial-temporal motion feature representation, improving upon the hierarchical clustering association method MOT FCG. For spatial-temporal motion features, we first propose Diagonal Modulated GIoU, which more accurately represents the relationship between the position and shape of the objects. Second, Mean Constant Velocity Modeling is proposed to reduce the effect of observation noise on target motion state estimation. For appearance features, we utilize a dynamic appearance representation that incorporates confidence information, enabling the trajectory appearance features to be more robust and global. Based on the baseline model MOT FCG, we achieve further improvements across all metrics: 63.1 HOTA, 76.9 MOTA and 78.2 IDF1 on the MOT17 test set, and competitive performance on the MOT20 and DanceTrack sets. 
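</p> <p class='mathjax'> For readers unfamiliar with the quantity being modulated, the sketch below computes plain generalized IoU (GIoU) for two axis-aligned boxes given as (x1, y1, x2, y2); the paper&#39;s Diagonal Modulated GIoU builds on this value, and the modulation itself is not reproduced here. </p> <pre><code>
# Minimal sketch: plain generalized IoU (GIoU) for two axis-aligned boxes.
def giou(a, b):
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = inter_w * inter_h
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    iou = inter / union if union > 0 else 0.0
    # smallest enclosing box of the two inputs
    cw = max(ax2, bx2) - min(ax1, bx1)
    ch = max(ay2, by2) - min(ay1, by1)
    c_area = cw * ch
    return iou - (c_area - union) / c_area if c_area > 0 else iou

print(giou((0, 0, 2, 2), (1, 1, 3, 3)))  # overlapping boxes give a value in (-1, 1]
</code></pre> <p class='mathjax'>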
</p> </div> </dd> <dt> <a name='item160'>[160]</a> <a href ="/abs/2411.10346" title="Abstract" id="2411.10346"> arXiv:2411.10346 </a> (replaced) [<a href="/pdf/2411.10346" title="Download PDF" id="pdf-2411.10346" aria-labelledby="pdf-2411.10346">pdf</a>, <a href="https://arxiv.org/html/2411.10346v2" title="View HTML" id="html-2411.10346" aria-labelledby="html-2411.10346" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.10346" title="Other formats" id="oth-2411.10346" aria-labelledby="oth-2411.10346">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BiDense: Binarization for Dense Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yin,+R">Rui Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qin,+H">Haotong Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yulun Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+W">Wenbo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+Y">Yong Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhu,+J">Jianjun Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Cheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+B">Biao Jia</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Dense prediction is a critical task in computer vision. However, previous methods often require extensive computational resources, which hinders their real-world application. In this paper, we propose BiDense, a generalized binary neural network (BNN) designed for efficient and accurate dense prediction tasks. BiDense incorporates two key techniques: the Distribution-adaptive Binarizer (DAB) and the Channel-adaptive Full-precision Bypass (CFB). The DAB adaptively calculates thresholds and scaling factors for binarization, effectively retaining more information within BNNs. Meanwhile, the CFB facilitates full-precision bypassing for binary convolutional layers undergoing various channel size transformations, which enhances the propagation of real-valued signals and minimizes information loss. By leveraging these techniques, BiDense preserves more real-valued information, enabling more accurate and detailed dense predictions in BNNs. Extensive experiments demonstrate that our framework achieves performance levels comparable to full-precision models while significantly reducing memory usage and computational costs. 
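</p> <p class='mathjax'> A rough picture of distribution-adaptive binarization, with the threshold and scaling factor derived from the weight statistics of each output channel, is given below; the particular statistics chosen here (channel mean and mean absolute deviation) are assumptions for illustration and are not necessarily those used by DAB. </p> <pre><code>
import torch

def adaptive_binarize(w):
    # Illustrative per-output-channel binarization: threshold at the channel
    # mean and rescale by the mean absolute deviation, so that sign(w - t) * s
    # roughly preserves the weight distribution. Not the paper's exact DAB.
    flat = w.view(w.size(0), -1)
    t = flat.mean(dim=1, keepdim=True)                 # adaptive threshold
    s = (flat - t).abs().mean(dim=1, keepdim=True)     # adaptive scale
    wb = torch.sign(flat - t) * s
    return wb.view_as(w)

w = torch.randn(8, 3, 3, 3)        # e.g. conv weights: out_ch x in_ch x k x k
print(adaptive_binarize(w).shape)  # torch.Size([8, 3, 3, 3])
</code></pre> <p class='mathjax'>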
</p> </div> </dd> <dt> <a name='item161'>[161]</a> <a href ="/abs/2411.12089" title="Abstract" id="2411.12089"> arXiv:2411.12089 </a> (replaced) [<a href="/pdf/2411.12089" title="Download PDF" id="pdf-2411.12089" aria-labelledby="pdf-2411.12089">pdf</a>, <a href="https://arxiv.org/html/2411.12089v2" title="View HTML" id="html-2411.12089" aria-labelledby="html-2411.12089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.12089" title="Other formats" id="oth-2411.12089" aria-labelledby="oth-2411.12089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FruitNinja: 3D Object Interior Texture Generation with Gaussian Splatting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+F">Fangyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yuhao Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Graphics (cs.GR); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> In the real world, objects reveal internal textures when sliced or cut, yet this behavior is not well-studied in 3D generation tasks today. For example, slicing a virtual 3D watermelon should reveal flesh and seeds. Given that no available dataset captures an object&#39;s full internal structure and collecting data from all slices is impractical, generative methods become the obvious approach. However, current 3D generation and inpainting methods often focus on visible appearance and overlook internal textures. To bridge this gap, we introduce FruitNinja, the first method to generate internal textures for 3D objects undergoing geometric and topological changes. Our approach produces objects via 3D Gaussian Splatting (3DGS) with both surface and interior textures synthesized, enabling real-time slicing and rendering without additional optimization. FruitNinja leverages a pre-trained diffusion model to progressively inpaint cross-sectional views and applies voxel-grid-based smoothing to achieve cohesive textures throughout the object. Our OpaqueAtom GS strategy overcomes 3DGS limitations by employing densely distributed opaque Gaussians, avoiding biases toward larger particles that destabilize training and sharp color transitions for fine-grained textures. Experimental results show that FruitNinja substantially outperforms existing approaches, showcasing unmatched visual quality in real-time rendered internal views across arbitrary geometry manipulations. 
</p> </div> </dd> <dt> <a name='item162'>[162]</a> <a href ="/abs/2411.12248" title="Abstract" id="2411.12248"> arXiv:2411.12248 </a> (replaced) [<a href="/pdf/2411.12248" title="Download PDF" id="pdf-2411.12248" aria-labelledby="pdf-2411.12248">pdf</a>, <a href="https://arxiv.org/html/2411.12248v2" title="View HTML" id="html-2411.12248" aria-labelledby="html-2411.12248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.12248" title="Other formats" id="oth-2411.12248" aria-labelledby="oth-2411.12248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neuro-3D: Towards 3D Visual Decoding from EEG Signals </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+Z">Zhanqiang Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+J">Jiamin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+Y">Yonghao Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bu,+J">Jiahui Bu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mai,+W">Weijian Mai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+Q">Qihao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ouyang,+W">Wanli Ouyang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+C">Chunfeng Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Human&#39;s perception of the visual world is shaped by the stereo processing of 3D information. Understanding how the brain perceives and processes 3D visual stimuli in the real world has been a longstanding endeavor in neuroscience. Towards this goal, we introduce a new neuroscience task: decoding 3D visual perception from EEG signals, a neuroimaging technique that enables real-time monitoring of neural dynamics enriched with complex visual cues. To provide the essential benchmark, we first present EEG-3D, a pioneering dataset featuring multimodal analysis data and extensive EEG recordings from 12 subjects viewing 72 categories of 3D objects rendered in both videos and images. Furthermore, we propose Neuro-3D, a 3D visual decoding framework based on EEG signals. This framework adaptively integrates EEG features derived from static and dynamic stimuli to learn complementary and robust neural representations, which are subsequently utilized to recover both the shape and color of 3D objects through the proposed diffusion-based colored point cloud decoder. To the best of our knowledge, we are the first to explore EEG-based 3D visual decoding. Experiments indicate that Neuro-3D not only reconstructs colored 3D objects with high fidelity, but also learns effective neural representations that enable insightful brain region analysis. The dataset and associated code will be made publicly available. 
</p> </div> </dd> <dt> <a name='item163'>[163]</a> <a href ="/abs/2411.13211" title="Abstract" id="2411.13211"> arXiv:2411.13211 </a> (replaced) [<a href="/pdf/2411.13211" title="Download PDF" id="pdf-2411.13211" aria-labelledby="pdf-2411.13211">pdf</a>, <a href="https://arxiv.org/html/2411.13211v2" title="View HTML" id="html-2411.13211" aria-labelledby="html-2411.13211" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13211" title="Other formats" id="oth-2411.13211" aria-labelledby="oth-2411.13211">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ViSTa Dataset: Do vision-language models understand sequential tasks? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wybitul,+E">Evžen Wybitul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gunter,+E+R">Evan Ryan Gunter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Seleznyov,+M">Mikhail Seleznyov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lindner,+D">David Lindner</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Using vision-language models (VLMs) as reward models in reinforcement learning holds promise for reducing costs and improving safety. So far, VLM reward models have only been used for goal-oriented tasks, where the agent must reach a particular final outcome. We explore VLMs&#39; potential to supervise tasks that cannot be scored by the final state alone. To this end, we introduce ViSTa, a dataset for evaluating Vision-based understanding of Sequential Tasks. ViSTa comprises over 4,000 videos with step-by-step descriptions in virtual home, Minecraft, and real-world environments. Its novel hierarchical structure -- basic single-step tasks composed into more and more complex sequential tasks -- allows a fine-grained understanding of how well VLMs can judge tasks with varying complexity. To illustrate this, we use ViSTa to evaluate state-of-the-art VLMs, including CLIP, ViCLIP, and GPT-4o. We find that, while they are all good at object recognition, they fail to understand sequential tasks, with only GPT-4o achieving non-trivial performance. 
</p> </div> </dd> <dt> <a name='item164'>[164]</a> <a href ="/abs/2411.13525" title="Abstract" id="2411.13525"> arXiv:2411.13525 </a> (replaced) [<a href="/pdf/2411.13525" title="Download PDF" id="pdf-2411.13525" aria-labelledby="pdf-2411.13525">pdf</a>, <a href="https://arxiv.org/html/2411.13525v2" title="View HTML" id="html-2411.13525" aria-labelledby="html-2411.13525" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13525" title="Other formats" id="oth-2411.13525" aria-labelledby="oth-2411.13525">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Geometric Algebra Planes: Convex Implicit Neural Volumes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sivgin,+I">Irmak Sivgin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fridovich-Keil,+S">Sara Fridovich-Keil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wetzstein,+G">Gordon Wetzstein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pilanci,+M">Mert Pilanci</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code is available at <a href="https://github.com/sivginirmak/Geometric-Algebra-Planes" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Volume parameterizations abound in recent literature, from the classic voxel grid to the implicit neural representation and everything in between. While implicit representations have shown impressive capacity and better memory efficiency compared to voxel grids, to date they require training via nonconvex optimization. This nonconvex training process can be slow to converge and sensitive to initialization and hyperparameter choices that affect the final converged result. We introduce a family of models, GA-Planes, that is the first class of implicit neural volume representations that can be trained by convex optimization. GA-Planes models include any combination of features stored in tensor basis elements, followed by a neural feature decoder. They generalize many existing representations and can be adapted for convex, semiconvex, or nonconvex training as needed for different inverse problems. In the 2D setting, we prove that GA-Planes is equivalent to a low-rank plus low-resolution matrix factorization; we show that this approximation outperforms the classic low-rank plus sparse decomposition for fitting a natural image. In 3D, we demonstrate GA-Planes&#39; competitive performance in terms of expressiveness, model size, and optimizability across three volume fitting tasks: radiance field reconstruction, 3D segmentation, and video segmentation. 
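</p> <p class='mathjax'> The 2D low-rank plus low-resolution model mentioned above can be illustrated with a small numpy sketch that fits a grayscale image as a truncated-SVD term plus a nearest-neighbor-upsampled coarse grid; the rank and grid size are arbitrary illustration choices, and this is not the paper&#39;s training procedure. </p> <pre><code>
import numpy as np

def low_rank_plus_low_res(img, rank=8, coarse=16):
    # Approximate a 2D array as (rank-r SVD term) + (upsampled coarse grid).
    # Illustration of the model class only, not the paper's fitting procedure.
    u, s, vt = np.linalg.svd(img, full_matrices=False)
    low_rank = (u[:, :rank] * s[:rank]) @ vt[:rank]
    residual = img - low_rank
    h, w = img.shape
    ph, pw = h // coarse, w // coarse
    # average-pool the residual onto a coarse grid, then upsample by repetition
    pooled = residual[:ph * coarse, :pw * coarse].reshape(coarse, ph, coarse, pw).mean(axis=(1, 3))
    low_res = np.repeat(np.repeat(pooled, ph, axis=0), pw, axis=1)
    low_res = np.pad(low_res, ((0, h - low_res.shape[0]), (0, w - low_res.shape[1])), mode='edge')
    return low_rank + low_res

img = np.random.rand(128, 128)
approx = low_rank_plus_low_res(img)
print(np.mean((img - approx) ** 2))  # reconstruction error of the combined model
</code></pre> <p class='mathjax'>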
</p> </div> </dd> <dt> <a name='item165'>[165]</a> <a href ="/abs/2411.13545" title="Abstract" id="2411.13545"> arXiv:2411.13545 </a> (replaced) [<a href="/pdf/2411.13545" title="Download PDF" id="pdf-2411.13545" aria-labelledby="pdf-2411.13545">pdf</a>, <a href="https://arxiv.org/html/2411.13545v2" title="View HTML" id="html-2411.13545" aria-labelledby="html-2411.13545" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13545" title="Other formats" id="oth-2411.13545" aria-labelledby="oth-2411.13545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pushing the Limits of Sparsity: A Bag of Tricks for Extreme Pruning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+A">Andy Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Durrant,+A">Aiden Durrant</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Markovic,+M">Milan Markovic</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yin,+L">Lu Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Leontidis,+G">Georgios Leontidis</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> V2: same as V1 but with appendix/preliminaries; 12 pages, 5 figures, 4 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span> </div> <p class='mathjax'> Pruning of deep neural networks has been an effective technique for reducing model size while preserving most of the performance of dense networks, crucial for deploying models on memory and power-constrained devices. While recent sparse learning methods have shown promising performance up to moderate sparsity levels such as 95% and 98%, accuracy quickly deteriorates when pushing sparsities to extreme levels. Obtaining sparse networks at such extreme sparsity levels presents unique challenges, such as fragile gradient flow and heightened risk of layer collapse. In this work, we explore network performance beyond the commonly studied sparsities, and propose a collection of techniques that enable the continuous learning of networks without accuracy collapse even at extreme sparsities, including 99.90%, 99.95% and 99.99% on ResNet architectures. Our approach combines 1) Dynamic ReLU phasing, where DyReLU initially allows for richer parameter exploration before being gradually replaced by standard ReLU, 2) weight sharing which reuses parameters within a residual layer while maintaining the same number of learnable parameters, and 3) cyclic sparsity, where both sparsity levels and sparsity patterns evolve dynamically throughout training to better encourage parameter exploration. We evaluate our method, which we term Extreme Adaptive Sparse Training (EAST) at extreme sparsities using ResNet-34 and ResNet-50 on CIFAR-10, CIFAR-100, and ImageNet, achieving significant performance improvements over state-of-the-art methods we compared with. 
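</p> <p class='mathjax'> As an illustration of what a cyclic sparsity schedule might look like, the toy function below ramps the sparsity level toward the target while superimposing periodic dips that temporarily densify the network; the cosine ramp, amplitude, and cycle count are assumptions for illustration, not the schedule used in EAST. </p> <pre><code>
import math

def cyclic_sparsity(step, total_steps, target=0.999, amplitude=0.01, cycles=8):
    # Toy sparsity schedule: a monotone ramp toward `target` with a periodic
    # dip that temporarily densifies the network to encourage exploration.
    ramp = target * 0.5 * (1.0 - math.cos(math.pi * step / total_steps))       # 0 up to target
    wave = amplitude * 0.5 * (1.0 - math.cos(2.0 * math.pi * cycles * step / total_steps))
    return max(0.0, min(target, ramp - wave))

for step in range(0, 10001, 2000):
    print(step, round(cyclic_sparsity(step, 10000), 4))
</code></pre> <p class='mathjax'>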
</p> </div> </dd> <dt> <a name='item166'>[166]</a> <a href ="/abs/2207.13021" title="Abstract" id="2207.13021"> arXiv:2207.13021 </a> (replaced) [<a href="/pdf/2207.13021" title="Download PDF" id="pdf-2207.13021" aria-labelledby="pdf-2207.13021">pdf</a>, <a href="/format/2207.13021" title="Other formats" id="oth-2207.13021" aria-labelledby="oth-2207.13021">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CTVR-EHO TDA-IPH Topological Optimized Convolutional Visual Recurrent Network for Brain Tumor Segmentation and Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Joshi,+D">Dhananjay Joshi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Singh,+B+K">Bhupesh Kumar Singh</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Nagwanshi,+K+K">Kapil Kumar Nagwanshi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Choubey,+N+S">Nitin S. Choubey</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> In today&#39;s health care, brain tumor detection has become common, yet manual brain tumor classification remains time-consuming. Consequently, Deep Convolutional Neural Networks (DCNNs) are used by many researchers in the medical field to make accurate diagnoses and aid in patient treatment. Traditional techniques suffer from problems such as overfitting and the inability to extract necessary features. To overcome these problems, we developed the Topological Data Analysis based Improved Persistent Homology (TDA-IPH) and Convolutional Transfer learning and Visual Recurrent learning with Elephant Herding Optimization hyper-parameter tuning (CTVR-EHO) models for brain tumor segmentation and classification. Initially, the Topological Data Analysis based Improved Persistent Homology is designed to segment the brain tumor image. Then, from the segmented image, features are extracted using transfer learning (TL) via the AlexNet model and a Bidirectional Visual Long Short-Term Memory (Bi-VLSTM) network. Next, Elephant Herding Optimization (EHO) is used to tune the hyperparameters of both networks to get an optimal result. Finally, the extracted features are concatenated and classified using the softmax activation layer. The simulation results of the proposed CTVR-EHO and TDA-IPH method are analyzed based on precision, accuracy, recall, loss, and F score metrics. When compared to other existing brain tumor segmentation and classification models, the proposed CTVR-EHO and TDA-IPH approaches show high accuracy (99.8%), high recall (99.23%), high precision (99.67%), and high F score (99.59%). 
</p> </div> </dd> <dt> <a name='item167'>[167]</a> <a href ="/abs/2302.09682" title="Abstract" id="2302.09682"> arXiv:2302.09682 </a> (replaced) [<a href="/pdf/2302.09682" title="Download PDF" id="pdf-2302.09682" aria-labelledby="pdf-2302.09682">pdf</a>, <a href="https://arxiv.org/html/2302.09682v2" title="View HTML" id="html-2302.09682" aria-labelledby="html-2302.09682" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2302.09682" title="Other formats" id="oth-2302.09682" aria-labelledby="oth-2302.09682">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dual Attention Model with Reinforcement Learning for Classification of Histology Whole-Slide Images </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Raza,+M">Manahil Raza</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Awan,+R">Ruqayya Awan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bashir,+R+M+S">Raja Muhammad Saad Bashir</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qaiser,+T">Talha Qaiser</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rajpoot,+N+M">Nasir M. Rajpoot</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Digital whole slide images (WSIs) are generally captured at microscopic resolution and encompass extensive spatial data. Directly feeding these images to deep learning models is computationally intractable due to memory constraints, while downsampling the WSIs risks incurring information loss. Alternatively, splitting the WSIs into smaller patches may result in a loss of important contextual information. In this paper, we propose a novel dual attention approach, consisting of two main components, both inspired by the visual examination process of a pathologist: The first soft attention model processes a low magnification view of the WSI to identify relevant regions of interest, followed by a custom sampling method to extract diverse and spatially distinct image tiles from the selected ROIs. The second component, the hard attention classification model further extracts a sequence of multi-resolution glimpses from each tile for classification. Since hard attention is non-differentiable, we train this component using reinforcement learning to predict the location of the glimpses. This approach allows the model to focus on essential regions instead of processing the entire tile, thereby aligning with a pathologist&#39;s way of diagnosis. The two components are trained in an end-to-end fashion using a joint loss function to demonstrate the efficacy of the model. The proposed model was evaluated on two WSI-level classification problems: Human epidermal growth factor receptor 2 scoring on breast cancer histology images and prediction of Intact/Loss status of two Mismatch Repair biomarkers from colorectal cancer histology images. We show that the proposed model achieves performance better than or comparable to the state-of-the-art methods while processing less than 10% of the WSI at the highest magnification and reducing the time required to infer the WSI-level label by more than 75%. 
</p> </div> </dd> <dt> <a name='item168'>[168]</a> <a href ="/abs/2310.00616" title="Abstract" id="2310.00616"> arXiv:2310.00616 </a> (replaced) [<a href="/pdf/2310.00616" title="Download PDF" id="pdf-2310.00616" aria-labelledby="pdf-2310.00616">pdf</a>, <a href="https://arxiv.org/html/2310.00616v2" title="View HTML" id="html-2310.00616" aria-labelledby="html-2310.00616" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.00616" title="Other formats" id="oth-2310.00616" aria-labelledby="oth-2310.00616">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Understanding Adversarial Transferability in Federated Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yijiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+Y">Ying Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Haohan Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published in Transactions on Machine Learning Research (TMLR) (11/2024) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> We investigate a specific security risk in FL: a group of malicious clients has impacted the model during training by disguising their identities and acting as benign clients but later switching to an adversarial role. They use their data, which was part of the training set, to train a substitute model and conduct transferable adversarial attacks against the federated model. This type of attack is subtle and hard to detect because these clients initially appear to be benign. <br>The key question we address is: How robust is the FL system to such covert attacks, especially compared to traditional centralized learning systems? We empirically show that the proposed attack imposes a high security risk to current FL systems. By using only 3\% of the client&#39;s data, we achieve the highest attack rate of over 80\%. To further offer a full understanding of the challenges the FL system faces in transferable attacks, we provide a comprehensive analysis over the transfer robustness of FL across a spectrum of configurations. Surprisingly, FL systems show a higher level of robustness than their centralized counterparts, especially when both systems are equally good at handling regular, non-malicious data. <br>We attribute this increased robustness to two main factors: 1) Decentralized Data Training: Each client trains the model on its own data, reducing the overall impact of any single malicious client. 2) Model Update Averaging: The updates from each client are averaged together, further diluting any malicious alterations. Both practical experiments and theoretical analysis support our conclusions. This research not only sheds light on the resilience of FL systems against hidden attacks but also raises important considerations for their future application and development. 
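</p> <p class='mathjax'> The second robustness factor above, model update averaging, is the standard FedAvg-style mean over client models, which scales any single client&#39;s alteration by roughly 1/n; a minimal sketch with dictionaries of tensors follows (unrelated to the paper&#39;s code). </p> <pre><code>
import torch

def fedavg(client_states, weights=None):
    # Average client model states (dicts of tensors). With n clients and
    # uniform weights, a single malicious client's alteration is scaled by 1/n.
    n = len(client_states)
    if weights is None:
        weights = [1.0 / n] * n
    keys = client_states[0].keys()
    return {k: sum(w * s[k] for w, s in zip(weights, client_states)) for k in keys}

clients = [{"w": torch.randn(3, 3)} for _ in range(10)]
clients[0]["w"] += 100.0                   # a large perturbation from one client
print(fedavg(clients)["w"].abs().mean())   # the perturbation is diluted by averaging
</code></pre> <p class='mathjax'>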
</p> </div> </dd> <dt> <a name='item169'>[169]</a> <a href ="/abs/2311.00167" title="Abstract" id="2311.00167"> arXiv:2311.00167 </a> (replaced) [<a href="/pdf/2311.00167" title="Download PDF" id="pdf-2311.00167" aria-labelledby="pdf-2311.00167">pdf</a>, <a href="https://arxiv.org/html/2311.00167v2" title="View HTML" id="html-2311.00167" aria-labelledby="html-2311.00167" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.00167" title="Other formats" id="oth-2311.00167" aria-labelledby="oth-2311.00167">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hierarchical Information-sharing Convolutional Neural Network for the Prediction of Arctic Sea Ice Concentration and Velocity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Koo,+Y">Younghyun Koo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahnemoonfar,+M">Maryam Rahnemoonfar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV); Atmospheric and Oceanic Physics (physics.ao-ph) </div> <p class='mathjax'> Forecasting sea ice concentration (SIC) and sea ice velocity (SIV) in the Arctic Ocean is of great significance as the Arctic environment has been changed by the recent warming climate. Given that physical sea ice models require high computational costs with complex parameterization, deep learning techniques can effectively replace the physical model and improve the performance of sea ice prediction. This study proposes a novel multi-task fully convolutional network architecture named hierarchical information-sharing U-net (HIS-Unet) to predict daily SIC and SIV. Instead of learning SIC and SIV separately at each branch, we allow the SIC and SIV layers to share their information and assist each other&#39;s prediction through the weighting attention modules (WAMs). Consequently, our HIS-Unet outperforms other statistical approaches, sea ice physical models, and neural networks without such information-sharing units. The improvement of HIS-Unet is more significant when and where SIC changes seasonally, which implies that the information sharing between SIC and SIV through WAMs helps learn the dynamic changes of SIC and SIV. The weight values of the WAMs imply that SIC information plays a more critical role in SIV prediction, compared to that of SIV information in SIC prediction, and information sharing is more active in marginal ice zones (e.g., East Greenland and Hudson/Baffin Bays) than in the central Arctic. 
</p> </div> </dd> <dt> <a name='item170'>[170]</a> <a href ="/abs/2311.15414" title="Abstract" id="2311.15414"> arXiv:2311.15414 </a> (replaced) [<a href="/pdf/2311.15414" title="Download PDF" id="pdf-2311.15414" aria-labelledby="pdf-2311.15414">pdf</a>, <a href="https://arxiv.org/html/2311.15414v3" title="View HTML" id="html-2311.15414" aria-labelledby="html-2311.15414" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.15414" title="Other formats" id="oth-2311.15414" aria-labelledby="oth-2311.15414">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> KOPPA: Improving Prompt-based Continual Learning with Key-Query Orthogonal Projection and Prototype-based One-Versus-All </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tran,+Q">Quyen Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Phan,+H">Hoang Phan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tran,+L">Lam Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Than,+K">Khoat Than</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tran,+T">Toan Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Phung,+D">Dinh Phung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Le,+T">Trung Le</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Drawing inspiration from prompt tuning techniques applied to Large Language Models, recent methods based on pre-trained ViT networks have achieved remarkable results in the field of Continual Learning. Specifically, these approaches propose to maintain a set of prompts and allocate a subset of them to learn each task using a key-query matching strategy. However, they may encounter limitations when lacking control over the correlations between old task queries and keys of future tasks, the shift of features in the latent space, and the relative separation of latent vectors learned in independent tasks. In this work, we introduce a novel key-query learning strategy based on orthogonal projection, inspired by model-agnostic meta-learning, to enhance prompt matching efficiency and address the challenge of shifting features. Furthermore, we introduce a One-Versus-All (OVA) prototype-based component that enhances the classification head distinction. Experimental results on benchmark datasets demonstrate that our method empowers the model to achieve results surpassing those of current state-of-the-art approaches by a large margin of up to 20%. 
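</p> <p class='mathjax'> The key-query orthogonal projection idea can be sketched generically: project a query onto the orthogonal complement of the subspace spanned by previously learned keys so that it cannot interfere with old key-query matches. The projector below is the standard construction q_perp = q - K^T (K K^T)^+ K q and is only an illustration of the principle, not the paper&#39;s training procedure. </p> <pre><code>
import torch

def project_out(query, old_keys):
    # Project `query` (d,) onto the orthogonal complement of the span of
    # `old_keys` (m x d), i.e. q_perp = q - K^T (K K^T)^+ K q.
    K = old_keys
    gram_inv = torch.linalg.pinv(K @ K.T)   # pseudo-inverse for numerical safety
    proj = K.T @ gram_inv @ K               # d x d projector onto span of the keys
    return query - proj @ query

old_keys = torch.randn(4, 16)
q = torch.randn(16)
q_perp = project_out(q, old_keys)
print(torch.allclose(old_keys @ q_perp, torch.zeros(4), atol=1e-4))  # True
</code></pre> <p class='mathjax'>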
</p> </div> </dd> <dt> <a name='item171'>[171]</a> <a href ="/abs/2311.16141" title="Abstract" id="2311.16141"> arXiv:2311.16141 </a> (replaced) [<a href="/pdf/2311.16141" title="Download PDF" id="pdf-2311.16141" aria-labelledby="pdf-2311.16141">pdf</a>, <a href="https://arxiv.org/html/2311.16141v3" title="View HTML" id="html-2311.16141" aria-labelledby="html-2311.16141" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.16141" title="Other formats" id="oth-2311.16141" aria-labelledby="oth-2311.16141">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Brain-Inspired Efficient Pruning: Exploiting Criticality in Spiking Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+S">Shuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+B">Boxiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zeshi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=You,+H">Haihang You</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Neural and Evolutionary Computing (cs.NE)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Spiking Neural Networks (SNNs) have gained significant attention due to their energy-efficient and multiplication-free characteristics. Despite these advantages, deploying large-scale SNNs on edge hardware is challenging due to limited resource availability. Network pruning offers a viable approach to compress the network scale and reduce hardware resource requirements for model deployment. However, existing SNN pruning methods incur high pruning costs and performance loss because they lack efficiency in processing the sparse spike representation of SNNs. In this paper, inspired by the critical brain hypothesis in neuroscience and the high biological plausibility of SNNs, we explore and leverage criticality to facilitate efficient pruning in deep SNNs. We first explain criticality in SNNs from the perspective of maximizing feature information entropy. Second, we propose a low-cost metric to assess neuron criticality in feature transmission and design a pruning-regeneration method that incorporates this criticality into the pruning process. Experimental results demonstrate that our method achieves higher performance than the current state-of-the-art (SOTA) method with up to 95.26\% reduction of pruning cost. The criticality-based regeneration process efficiently selects potential structures and facilitates consistent feature representation. 
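</p> <p class='mathjax'> One way to picture an information-entropy view of neuron criticality is the toy metric below, which scores each spiking neuron by the binary entropy of its firing rate and keeps the most informative ones; this is an assumption-laden illustration, not the paper&#39;s metric or its regeneration step. </p> <pre><code>
import numpy as np

def firing_entropy(spikes):
    # spikes: (timesteps x neurons) array of 0/1 events; returns the binary
    # entropy of each neuron's firing rate as a toy criticality proxy.
    p = spikes.mean(axis=0).clip(1e-6, 1 - 1e-6)
    return -(p * np.log2(p) + (1 - p) * np.log2(1 - p))

def keep_mask(spikes, keep_ratio=0.1):
    # keep the keep_ratio fraction of neurons with the highest entropy
    h = firing_entropy(spikes)
    k = max(1, int(keep_ratio * h.size))
    mask = np.zeros(h.size, dtype=bool)
    mask[np.argsort(h)[-k:]] = True
    return mask

spikes = np.random.binomial(1, 0.2, size=(100, 256)).astype(float)
print(keep_mask(spikes).sum())  # 25 neurons kept at keep_ratio=0.1
</code></pre> <p class='mathjax'>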
</p> </div> </dd> <dt> <a name='item172'>[172]</a> <a href ="/abs/2403.03163" title="Abstract" id="2403.03163"> arXiv:2403.03163 </a> (replaced) [<a href="/pdf/2403.03163" title="Download PDF" id="pdf-2403.03163" aria-labelledby="pdf-2403.03163">pdf</a>, <a href="/format/2403.03163" title="Other formats" id="oth-2403.03163" aria-labelledby="oth-2403.03163">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Design2Code: Benchmarking Multimodal Code Generation for Automated Front-End Engineering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Si,+C">Chenglei Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yanzhe Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+R">Ryan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+Z">Zhengyuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+R">Ruibo Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+D">Diyi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV); Computers and Society (cs.CY) </div> <p class='mathjax'> Generative AI has made rapid advancements in recent years, achieving unprecedented capabilities in multimodal understanding and code generation. This can enable a new paradigm of front-end development in which multimodal large language models (MLLMs) directly convert visual designs into code implementations. In this work, we construct Design2Code - the first real-world benchmark for this task. Specifically, we manually curate 484 diverse real-world webpages as test cases and develop a set of automatic evaluation metrics to assess how well current multimodal LLMs can generate the code implementations that directly render into the given reference webpages, given the screenshots as input. We also complement automatic metrics with comprehensive human evaluations to validate the performance ranking. To rigorously benchmark MLLMs, we test various multimodal prompting methods on frontier models such as GPT-4o, GPT-4V, Gemini, and Claude. Our fine-grained break-down metrics indicate that models mostly lag in recalling visual elements from the input webpages and generating correct layout designs. 
</p> </div> </dd> <dt> <a name='item173'>[173]</a> <a href ="/abs/2403.14320" title="Abstract" id="2403.14320"> arXiv:2403.14320 </a> (replaced) [<a href="/pdf/2403.14320" title="Download PDF" id="pdf-2403.14320" aria-labelledby="pdf-2403.14320">pdf</a>, <a href="https://arxiv.org/html/2403.14320v2" title="View HTML" id="html-2403.14320" aria-labelledby="html-2403.14320" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.14320" title="Other formats" id="oth-2403.14320" aria-labelledby="oth-2403.14320">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exosense: A Vision-Based Scene Understanding System For Exoskeletons </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+J">Jianeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mattamala,+M">Matias Mattamala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kassab,+C">Christina Kassab</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Burger,+G">Guillaume Burger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Elnecave,+F">Fabio Elnecave</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+L">Lintong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Petriaux,+M">Marine Petriaux</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fallon,+M">Maurice Fallon</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Self-balancing exoskeletons are a key enabling technology for individuals with mobility impairments. While the current challenges focus on human-compliant hardware and control, unlocking their use for daily activities requires a scene perception system. In this work, we present Exosense, a vision-centric scene understanding system for self-balancing exoskeletons. We introduce a multi-sensor visual-inertial mapping device as well as a navigation stack for state estimation, terrain mapping and long-term operation. We tested Exosense attached to both a human leg and Wandercraft&#39;s Personal Exoskeleton in real-world indoor scenarios. This enabled us to test the system during typical periodic walking gaits, as well as future uses in multi-story environments. We demonstrate that Exosense can achieve an odometry drift of about 4 cm per meter traveled, and construct terrain maps under 1 cm average reconstruction error. It can also work in a visual localization mode in a previously mapped environment, providing a step towards long-term operation of exoskeletons. 
</p> </div> </dd> <dt> <a name='item174'>[174]</a> <a href ="/abs/2404.04254" title="Abstract" id="2404.04254"> arXiv:2404.04254 </a> (replaced) [<a href="/pdf/2404.04254" title="Download PDF" id="pdf-2404.04254" aria-labelledby="pdf-2404.04254">pdf</a>, <a href="https://arxiv.org/html/2404.04254v3" title="View HTML" id="html-2404.04254" aria-labelledby="html-2404.04254" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.04254" title="Other formats" id="oth-2404.04254" aria-labelledby="oth-2404.04254">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Watermark-based Attribution of AI-Generated Content </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+Z">Zhengyuan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+M">Moyang Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+Y">Yuepeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+N+Z">Neil Zhenqiang Gong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Several companies have deployed watermark-based detection to identify AI-generated content. However, attribution--the ability to trace back to the user of a generative AI (GenAI) service who created a given piece of AI-generated content--remains largely unexplored despite its growing importance. In this work, we aim to bridge this gap by conducting the first systematic study on watermark-based, user-level attribution of AI-generated content. Our key idea is to assign a unique watermark to each user of the GenAI service and embed this watermark into the AI-generated content created by that user. Attribution is then performed by identifying the user whose watermark best matches the one extracted from the given content. This approach, however, faces a key challenge: How should watermarks be selected for users to maximize attribution performance? To address the challenge, we first theoretically derive lower bounds on detection and attribution performance through rigorous probabilistic analysis for any given set of user watermarks. Then, we select watermarks for users to maximize these lower bounds, thereby optimizing detection and attribution performance. Our theoretical and empirical results show that watermark-based attribution inherits both the accuracy and (non-)robustness properties of the underlying watermark. Specifically, attribution remains highly accurate when the watermarked AI-generated content is either not post-processed or subjected to common post-processing such as JPEG compression, as well as black-box adversarial post-processing with limited query budgets. 
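</p> <p class='mathjax'> The attribution rule described above, identifying the user whose watermark best matches the extracted one, reduces to a nearest-neighbor search over bitstrings; the sketch below uses random binary watermarks, and the watermark length and decision threshold are arbitrary illustration choices. </p> <pre><code>
import numpy as np

rng = np.random.default_rng(0)
n_users, n_bits = 1000, 64
user_watermarks = rng.integers(0, 2, size=(n_users, n_bits))   # one bitstring per user

def attribute(extracted, watermarks, threshold=0.9):
    # Return the index of the user whose watermark has the highest bitwise
    # match with the extracted bits, or None if no match is confident enough.
    match = (watermarks == extracted).mean(axis=1)
    best = int(match.argmax())
    return best if match[best] >= threshold else None

# a noisy copy of user 42's watermark, e.g. after mild post-processing
extracted = user_watermarks[42].copy()
flip = rng.choice(n_bits, size=3, replace=False)
extracted[flip] = 1 - extracted[flip]
print(attribute(extracted, user_watermarks))  # 42
</code></pre> <p class='mathjax'>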
</p> </div> </dd> <dt> <a name='item175'>[175]</a> <a href ="/abs/2405.17141" title="Abstract" id="2405.17141"> arXiv:2405.17141 </a> (replaced) [<a href="/pdf/2405.17141" title="Download PDF" id="pdf-2405.17141" aria-labelledby="pdf-2405.17141">pdf</a>, <a href="https://arxiv.org/html/2405.17141v2" title="View HTML" id="html-2405.17141" aria-labelledby="html-2405.17141" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.17141" title="Other formats" id="oth-2405.17141" aria-labelledby="oth-2405.17141">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MVMS-RCN: A Dual-Domain Unfolding CT Reconstruction with Multi-sparse-view and Multi-scale Refinement-correction </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Fan,+X">Xiaohong Fan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+K">Ke Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yi,+H">Huaming Yi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yang,+Y">Yin Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+J">Jianping Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, Accepted to IEEE Transactions on Computational Imaging, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> X-ray Computed Tomography (CT) is one of the most important diagnostic imaging techniques in clinical applications. Sparse-view CT imaging reduces the number of projection views to a lower radiation dose and alleviates the potential risk of radiation exposure. Most existing deep learning (DL) and deep unfolding sparse-view CT reconstruction methods: 1) do not fully use the projection data; 2) do not always link their architecture designs to a mathematical theory; 3) do not flexibly deal with multi-sparse-view reconstruction assignments. This paper aims to use mathematical ideas and design optimal DL imaging algorithms for sparse-view tomography reconstructions. We propose a novel dual-domain deep unfolding unified framework that offers a great deal of flexibility for multi-sparse-view CT reconstruction with different sampling views through a single model. This framework combines the theoretical advantages of model-based methods with the superior reconstruction performance of DL-based methods, resulting in the expected generalizability of DL. We propose a refinement module that utilizes unfolding projection domain to refine full-sparse-view projection errors, as well as an image domain correction module that distills multi-scale geometric error corrections to reconstruct sparse-view CT. This provides us with a new way to explore the potential of projection information and a new perspective on designing network architectures. All parameters of our proposed framework are learnable end to end, and our method possesses the potential to be applied to plug-and-play reconstruction. Extensive experiments demonstrate that our framework is superior to other existing state-of-the-art methods. Our source codes are available at <a href="https://github.com/fanxiaohong/MVMS-RCN" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item176'>[176]</a> <a href ="/abs/2406.07294" title="Abstract" id="2406.07294"> arXiv:2406.07294 </a> (replaced) [<a href="/pdf/2406.07294" title="Download PDF" id="pdf-2406.07294" aria-labelledby="pdf-2406.07294">pdf</a>, <a href="https://arxiv.org/html/2406.07294v2" title="View HTML" id="html-2406.07294" aria-labelledby="html-2406.07294" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.07294" title="Other formats" id="oth-2406.07294" aria-labelledby="oth-2406.07294">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OTO Planner: An Efficient Only Travelling Once Exploration Planner for Complex and Unknown Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+B">Bo Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+C">Chuanzhao Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pan,+Y">Yan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+F">Fu Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Autonomous exploration in complex and cluttered environments is essential for various applications. However, there are many challenges due to the lack of global heuristic information. Existing exploration methods suffer from the repeated paths and considerable computational resource requirement in large-scale environments. To address the above issues, this letter proposes an efficient exploration planner that reduces repeated paths in complex environments, hence it is called &#34;Only Travelling Once Planner&#34;. OTO Planner includes fast frontier updating, viewpoint evaluation and viewpoint refinement. A selective frontier updating mechanism is designed, saving a large amount of computational resources. In addition, a novel viewpoint evaluation system is devised to reduce the repeated paths utilizing the enclosed sub-region detection. Besides, a viewpoint refinement approach is raised to concentrate the redundant viewpoints, leading to smoother paths. We conduct extensive simulation and real-world experiments to validate the proposed method. Compared to the state-of-the-art approach, the proposed method reduces the exploration time and movement distance by 10%-20% and improves the speed of frontier detection by 6-9 times. 
[177] arXiv:2406.15656 (replaced) [pdf, html, other]
Title: Self-Supervised Adversarial Diffusion Models for Fast MRI Reconstruction
Authors: Mojtaba Safari, Zach Eidex, Shaoyan Pan, Richard L.J. Qiu, Xiaofeng Yang
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)

Purpose: To propose a self-supervised deep learning-based compressed sensing MRI (DL-based CS-MRI) method, named the "Adaptive Self-Supervised Consistency Guided Diffusion Model (ASSCGD)", to accelerate data acquisition without requiring fully sampled datasets. Materials and Methods: We used the fastMRI multi-coil brain axial T2-weighted (T2-w) dataset from 1,376 cases and single-coil brain quantitative Magnetization-Prepared 2 Rapid Acquisition Gradient Echoes (MP2RAGE) T1 maps from 318 cases to train and test our model. Robustness against domain shift was evaluated using two out-of-distribution (OOD) datasets: a multi-coil brain axial post-contrast T1-weighted (T1c) dataset from 50 cases and an axial T1-weighted (T1-w) dataset from 50 patients. Data were retrospectively subsampled at acceleration rates R in {2x, 4x, 8x}. ASSCGD partitions a random sampling pattern into two disjoint sets, ensuring data consistency during training. We compared our method with the ReconFormer Transformer and SS-MRI, assessing performance using normalized mean squared error (NMSE), peak signal-to-noise ratio (PSNR), and the structural similarity index (SSIM). Statistical tests included one-way analysis of variance (ANOVA) and multi-comparison Tukey's Honestly Significant Difference (HSD) tests. Results: ASSCGD preserved fine structures and brain abnormalities visually better than the comparative methods at R = 8x for both multi-coil and single-coil datasets. It achieved the lowest NMSE at R in {4x, 8x} and the highest PSNR and SSIM values at all acceleration rates for the multi-coil dataset. Similar trends were observed for the single-coil dataset, though SSIM values were comparable to ReconFormer at R in {2x, 8x}. These results were further confirmed by voxel-wise correlation scatter plots. OOD results showed significant (p << 10^-5) improvements in undersampled image quality after reconstruction.
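
The abstract's key self-supervised ingredient, splitting the acquired sampling pattern into two disjoint sets (one for data consistency, one held out for the loss), can be pictured with the following minimal sketch; the mask shape, hold-out fraction, and function name are assumptions for illustration rather than details taken from the paper.

```python
# A minimal sketch (not the ASSCGD code) of partitioning a binary k-space sampling
# mask into two disjoint masks: one for data consistency, one for the training loss.
import numpy as np

def split_mask(mask, holdout_fraction=0.3, seed=0):
    """Split a binary sampling mask into two disjoint binary masks."""
    rng = np.random.default_rng(seed)
    idx = np.flatnonzero(mask)                          # indices of acquired samples
    holdout = rng.choice(idx, size=int(len(idx) * holdout_fraction), replace=False)
    loss_mask = np.zeros_like(mask)
    loss_mask.flat[holdout] = 1
    dc_mask = mask - loss_mask                          # disjoint by construction
    return dc_mask, loss_mask

# Toy usage: a ~4x random undersampling pattern on a 256x256 grid.
mask = (np.random.default_rng(1).random((256, 256)) < 0.25).astype(np.int64)
dc, held_out = split_mask(mask)
assert (dc * held_out).sum() == 0 and ((dc + held_out) == mask).all()
```
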
[178] arXiv:2407.10921 (replaced) [pdf, html, other]
Title: Leveraging Bi-Focal Perspectives and Granular Feature Integration for Accurate Reliable Early Alzheimer's Detection
Authors: Pandiyaraju V, Shravan Venkatraman, Abeshek A, Pavan Kumar S, Aravintakshan S A, Kannan A
Comments: 14 pages, 12 figures, 6 tables
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)

Alzheimer's disease (AD) is the most common neurodegenerative disorder, diagnosed in millions of patients each year. Current clinical practice still faces challenges in the accurate diagnosis and classification of AD from neuroimaging data. Traditional CNNs can extract a good amount of low-level information from an image but fail to capture the minute high-level structures that matter when detecting AD from MRI scans. To overcome this, we propose a novel Granular Feature Integration method that combines information extraction at different scales with an efficient information flow, enabling the model to capture both broad and fine-grained features simultaneously. We also propose a Bi-Focal Perspective mechanism that highlights the subtle neurofibrillary tangles and amyloid plaques in MRI scans, ensuring that critical pathological markers are accurately identified. Our model achieved an F1-score of 99.31%, a precision of 99.24%, and a recall of 99.51%. These scores indicate that our model performs significantly better than existing state-of-the-art (SOTA) CNNs.
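
For readers unfamiliar with multi-scale feature fusion, the sketch below illustrates the general idea of integrating features across scales (pool, then upsample back, then stack); it is a generic illustration with numpy stand-ins, not the paper's Granular Feature Integration or Bi-Focal modules.

```python
# A hedged, generic illustration of multi-scale feature integration:
# a feature map is pooled to coarser scales, brought back to full resolution,
# and stacked so coarse and fine information travel together.
import numpy as np

def avg_pool2(x):
    """2x2 average pooling of an (H, W) map with even H, W."""
    h, w = x.shape
    return x.reshape(h // 2, 2, w // 2, 2).mean(axis=(1, 3))

def upsample2(x, times):
    """Nearest-neighbour upsampling by a factor of 2**times."""
    for _ in range(times):
        x = np.repeat(np.repeat(x, 2, axis=0), 2, axis=1)
    return x

def multi_scale_integration(feature_map, n_scales=3):
    """Stack multi-scale views of one feature map along a channel axis."""
    scales, current = [feature_map], feature_map
    for s in range(1, n_scales):
        current = avg_pool2(current)
        scales.append(upsample2(current, s))
    return np.stack(scales, axis=0)      # shape: (n_scales, H, W)

fused = multi_scale_integration(np.random.default_rng(0).random((64, 64)))
print(fused.shape)  # (3, 64, 64)
```
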
[179] arXiv:2411.09263 (replaced) [pdf, html, other]
Title: Rethinking Weight-Averaged Model-merging
Authors: Hu Wang, Congbo Ma, Ibrahim Almakky, Ian Reid, Gustavo Carneiro, Mohammad Yaqub
Subjects: Machine Learning (cs.LG); Computer Vision and Pattern Recognition (cs.CV)

Weight-averaged model-merging has emerged as a powerful approach in deep learning, capable of enhancing model performance without fine-tuning or retraining. However, the underlying mechanisms that explain its effectiveness remain largely unexplored. In this paper, we investigate the technique from three novel perspectives to provide deeper insights into how and why weight-averaged model-merging works: (1) we examine the intrinsic patterns captured by the learned model weights, visualizing them on several datasets and showing that these weights often encode structured and interpretable patterns; (2) we compare model-ensemble merging strategies based on averaging weights versus averaging features, providing detailed analyses across diverse architectures and datasets; and (3) we study how changing the parameter magnitude affects the stability of model-merging predictions, showing robustness across different parameter scales and revealing how weight averaging acts as a form of regularization. Our findings shed light on the "black box" of weight-averaged model-merging, offering valuable insights and practical recommendations that advance the model-merging process.
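
For reference, the operation being analysed reduces to an element-wise average of parameter tensors across checkpoints that share an architecture; the sketch below uses plain dictionaries of numpy arrays in place of framework state dicts and is not tied to the paper's code.

```python
# A minimal sketch of plain weight averaging across models with identical architectures.
import numpy as np

def merge_weights(state_dicts):
    """Element-wise average of parameter tensors that share the same keys and shapes."""
    keys = state_dicts[0].keys()
    return {k: np.mean([sd[k] for sd in state_dicts], axis=0) for k in keys}

# Toy usage: three "checkpoints" of a tiny linear layer.
rng = np.random.default_rng(0)
models = [{"fc.weight": rng.standard_normal((4, 8)), "fc.bias": rng.standard_normal(4)}
          for _ in range(3)]
merged = merge_weights(models)
print({k: v.shape for k, v in merged.items()})
```
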
[180] arXiv:2411.10798 (replaced) [pdf, other]
Title: Unveiling Hidden Details: A RAW Data-Enhanced Paradigm for Real-World Super-Resolution
Authors: Long Peng, Wenbo Li, Jiaming Guo, Xin Di, Haoze Sun, Yong Li, Renjing Pei, Yang Wang, Yang Cao, Zheng-Jun Zha
Comments: We sincerely apologize, but due to some commercial confidentiality agreements related to the report, we have decided to withdraw the submission for now and will resubmit after making the necessary revisions
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)

Real-world image super-resolution (Real SR) aims to generate high-fidelity, detail-rich high-resolution (HR) images from low-resolution (LR) counterparts. Existing Real SR methods primarily focus on generating details from the LR RGB domain, which often leads to a lack of richness or fidelity in fine details. In this paper, we pioneer the use of details hidden in RAW data to complement existing RGB-only methods, yielding superior outputs. We argue that key steps in the Image Signal Processing (ISP) pipeline, such as denoising and demosaicing, inherently cause a loss of fine details in LR images, making LR RAW a valuable source of information. To validate this, we present RealSR-RAW, a comprehensive dataset comprising over 10,000 pairs of LR and HR RGB images, along with the corresponding LR RAW data, captured across multiple smartphones under varying focal lengths and diverse scenes. Additionally, we propose a novel, general RAW adapter that efficiently integrates LR RAW data into existing CNN-, Transformer-, and diffusion-based Real SR models by suppressing the noise contained in the LR RAW data and aligning its distribution. Extensive experiments demonstrate that incorporating RAW data significantly enhances detail recovery and improves Real SR performance across ten evaluation metrics, including both fidelity- and perception-oriented metrics. Our findings open a new direction for the Real SR task; the dataset and code will be made available to support future research.
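
As background for how RAW data can enter such a pipeline, the following hedged sketch shows a common way to pack an RGGB Bayer mosaic into a 4-channel half-resolution array and concatenate it with RGB-derived features; the packing pattern, shapes, and fusion-by-concatenation are generic assumptions, not the proposed RAW adapter.

```python
# A hedged sketch of RAW-to-feature plumbing: pack a Bayer mosaic into 4 channels
# at half resolution and concatenate with same-resolution RGB features.
import numpy as np

def pack_bayer(raw):
    """Pack an (H, W) RGGB Bayer mosaic into a (4, H/2, W/2) array."""
    return np.stack([raw[0::2, 0::2],   # R
                     raw[0::2, 1::2],   # G1
                     raw[1::2, 0::2],   # G2
                     raw[1::2, 1::2]])  # B

def fuse_with_rgb(raw, rgb_features):
    """Concatenate packed RAW channels with same-resolution RGB features."""
    packed = pack_bayer(raw)
    assert packed.shape[1:] == rgb_features.shape[1:], "resolutions must match"
    return np.concatenate([rgb_features, packed], axis=0)

raw = np.random.default_rng(0).random((128, 128))           # toy Bayer frame
rgb_feats = np.random.default_rng(1).random((8, 64, 64))     # hypothetical feature map
print(fuse_with_rgb(raw, rgb_feats).shape)                   # (12, 64, 64)
```
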
Total of 180 entries
