CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 3,720 results for author: <span class="mathjax">Zhang, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhang, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20776">arXiv:2503.20776</a> <span> [<a href="https://arxiv.org/pdf/2503.20776">pdf</a>, <a href="https://arxiv.org/format/2503.20776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Feature4X: Bridging Any Monocular Video to 4D Agentic AI with Versatile Gaussian Feature Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shijie Zhou</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hui Ren</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+Y">Yijia Weng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dejia Xu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhiwen Fan</a>, <a href="/search/cs?searchtype=author&query=You%2C+S">Suya You</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a>, <a href="/search/cs?searchtype=author&query=Kadambi%2C+A">Achuta Kadambi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20776v1-abstract-short" style="display: inline;"> Recent advancements in 2D and multimodal models have achieved remarkable success by leveraging large-scale training on extensive datasets. However, extending these achievements to enable free-form interactions and high-level semantic operations with complex 3D/4D scenes remains challenging. This difficulty stems from the limited availability of large-scale, annotated 3D/4D or multi-view datasets,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20776v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20776v1-abstract-full" style="display: none;"> Recent advancements in 2D and multimodal models have achieved remarkable success by leveraging large-scale training on extensive datasets. However, extending these achievements to enable free-form interactions and high-level semantic operations with complex 3D/4D scenes remains challenging. This difficulty stems from the limited availability of large-scale, annotated 3D/4D or multi-view datasets, which are crucial for generalizable vision and language tasks such as open-vocabulary and prompt-based segmentation, language-guided editing, and visual question answering (VQA). In this paper, we introduce Feature4X, a universal framework designed to extend any functionality from 2D vision foundation model into the 4D realm, using only monocular video input, which is widely available from user-generated content. The "X" in Feature4X represents its versatility, enabling any task through adaptable, model-conditioned 4D feature field distillation. At the core of our framework is a dynamic optimization strategy that unifies multiple model capabilities into a single representation. Additionally, to the best of our knowledge, Feature4X is the first method to distill and lift the features of video foundation models (e.g. SAM2, InternVideo2) into an explicit 4D feature field using Gaussian Splatting. Our experiments showcase novel view segment anything, geometric and appearance scene editing, and free-form VQA across all time steps, empowered by LLMs in feedback loops. These advancements broaden the scope of agentic AI applications by providing a foundation for scalable, contextually and spatiotemporally aware systems capable of immersive dynamic 4D scene interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20776v1-abstract-full').style.display = 'none'; document.getElementById('2503.20776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20752">arXiv:2503.20752</a> <span> [<a href="https://arxiv.org/pdf/2503.20752">pdf</a>, <a href="https://arxiv.org/format/2503.20752">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reason-RFT: Reinforcement Fine-Tuning for Visual Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Huajie Tan</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuheng Ji</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoshuai Hao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+M">Minglan Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20752v1-abstract-short" style="display: inline;"> Visual reasoning abilities play a crucial role in understanding complex multimodal data, advancing both domain-specific applications and artificial general intelligence (AGI). Existing methods improve VLM reasoning via Chain-of-Thought (CoT) supervised fine-tuning, using meticulously annotated training data to enhance visual reasoning capabilities. However, this training paradigm may lead to overf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20752v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20752v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20752v1-abstract-full" style="display: none;"> Visual reasoning abilities play a crucial role in understanding complex multimodal data, advancing both domain-specific applications and artificial general intelligence (AGI). Existing methods improve VLM reasoning via Chain-of-Thought (CoT) supervised fine-tuning, using meticulously annotated training data to enhance visual reasoning capabilities. However, this training paradigm may lead to overfitting and cognitive rigidity, restricting the model's ability to transfer visual reasoning skills across domains and limiting its real-world applicability. To address these limitations, we propose Reason-RFT, a novel reinforcement fine-tuning framework that significantly enhances generalization capabilities in visual reasoning tasks. Reason-RFT introduces a two-phase training framework for visual reasoning: (1) Supervised Fine-Tuning (SFT) with curated Chain-of-Thought (CoT) data activates the reasoning potential of Vision-Language Models (VLMs), followed by (2) Group Relative Policy Optimization (GRPO)-based reinforcement learning that generates multiple reasoning-response pairs, significantly enhancing generalization in visual reasoning tasks. To evaluate Reason-RFT's visual reasoning capabilities, we reconstructed a comprehensive dataset spanning visual counting, structure perception, and spatial transformation.cExperimental results demonstrate Reasoning-RFT's three key advantages: (1) Performance Enhancement: achieving state-of-the-art results across multiple tasks, outperforming most mainstream open-source and proprietary models; (2) Generalization Superiority: consistently maintaining robust performance across diverse tasks and domains, outperforming alternative training paradigms; (3) Data Efficiency: excelling in few-shot learning scenarios while surpassing full-dataset SFT baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20752v1-abstract-full').style.display = 'none'; document.getElementById('2503.20752v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 22 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20745">arXiv:2503.20745</a> <span> [<a href="https://arxiv.org/pdf/2503.20745">pdf</a>, <a href="https://arxiv.org/format/2503.20745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MATHGLANCE: Multimodal Large Language Models Do Not Know Where to Look in Mathematical Diagrams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanpeng Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Aotian Chen</a>, <a href="/search/cs?searchtype=author&query=Koniusz%2C+P">Piotr Koniusz</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+K">Kai Zou</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Y">Yuan Xue</a>, <a href="/search/cs?searchtype=author&query=Hengel%2C+A+v+d">Anton van den Hengel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20745v1-abstract-short" style="display: inline;"> Diagrams serve as a fundamental form of visual language, representing complex concepts and their inter-relationships through structured symbols, shapes, and spatial arrangements. Unlike natural images, their inherently symbolic and abstract nature poses significant challenges for Multimodal Large Language Models (MLLMs). However, current benchmarks conflate perceptual and reasoning tasks, making i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20745v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20745v1-abstract-full" style="display: none;"> Diagrams serve as a fundamental form of visual language, representing complex concepts and their inter-relationships through structured symbols, shapes, and spatial arrangements. Unlike natural images, their inherently symbolic and abstract nature poses significant challenges for Multimodal Large Language Models (MLLMs). However, current benchmarks conflate perceptual and reasoning tasks, making it difficult to assess whether MLLMs genuinely understand mathematical diagrams beyond superficial pattern recognition. To address this gap, we introduce MATHGLANCE, a benchmark specifically designed to isolate and evaluate mathematical perception in MLLMs. MATHGLANCE comprises 1.2K images and 1.6K carefully curated questions spanning four perception tasks: shape classification, object counting, relationship identification, and object grounding, covering diverse domains including plane geometry, solid geometry, and graphical representations. Our evaluation of MLLMs reveals that their ability to understand diagrams is notably limited, particularly in fine-grained grounding tasks. In response, we construct GeoPeP, a perception-oriented dataset of 200K structured geometry image-text pairs explicitly annotated with geometric primitives and precise spatial relationships. Training MLLM on GeoPeP leads to significant gains in perceptual accuracy, which in turn substantially improves mathematical reasoning. Our benchmark and dataset establish critical standards for evaluating and advancing multimodal mathematical understanding, providing valuable resources and insights to foster future MLLM research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20745v1-abstract-full').style.display = 'none'; document.getElementById('2503.20745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20384">arXiv:2503.20384</a> <span> [<a href="https://arxiv.org/pdf/2503.20384">pdf</a>, <a href="https://arxiv.org/format/2503.20384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MoLe-VLA: Dynamic Layer-skipping Vision Language Action Model via Mixture-of-Layers for Efficient Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rongyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Menghang Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Heng%2C+L">Liang Heng</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+X">Xiaowei Chi</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+G">Gaole Dai</a>, <a href="/search/cs?searchtype=author&query=Du%2C+L">Li Du</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dan Wang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuan Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20384v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) excel in understanding complex language and visual data, enabling generalist robotic systems to interpret instructions and perform embodied tasks. Nevertheless, their real-world deployment is hindered by substantial computational and storage demands. Recent insights into the homogeneous patterns in the LLM layer have inspired sparsification techniques to ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20384v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20384v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) excel in understanding complex language and visual data, enabling generalist robotic systems to interpret instructions and perform embodied tasks. Nevertheless, their real-world deployment is hindered by substantial computational and storage demands. Recent insights into the homogeneous patterns in the LLM layer have inspired sparsification techniques to address these challenges, such as early exit and token pruning. However, these methods often neglect the critical role of the final layers that encode the semantic information most relevant to downstream robotic tasks. Aligning with the recent breakthrough of the Shallow Brain Hypothesis (SBH) in neuroscience and the mixture of experts in model sparsification, we conceptualize each LLM layer as an expert and propose a Mixture-of-Layers Vision-Language-Action model (MoLe-VLA, or simply MoLe) architecture for dynamic LLM layer activation. We introduce a Spatial-Temporal Aware Router (STAR) for MoLe to selectively activate only parts of the layers based on the robot's current state, mimicking the brain's distinct signal pathways specialized for cognition and causal reasoning. Additionally, to compensate for the cognitive ability of LLMs lost in MoLe, we devise a Cognition Self-Knowledge Distillation (CogKD) framework. CogKD enhances the understanding of task demands and improves the generation of task-relevant action sequences by leveraging cognitive features. Extensive experiments conducted in both RLBench simulation and real-world environments demonstrate the superiority of MoLe-VLA in both efficiency and performance. Specifically, MoLe-VLA achieves an 8% improvement in the mean success rate across ten tasks while reducing computational costs by up to x5.6 compared to standard LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20384v1-abstract-full').style.display = 'none'; document.getElementById('2503.20384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20314">arXiv:2503.20314</a> <span> [<a href="https://arxiv.org/pdf/2503.20314">pdf</a>, <a href="https://arxiv.org/format/2503.20314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Wan: Open and Advanced Large-Scale Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=WanTeam"> WanTeam</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Baole Ai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bin Wen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chen-Wei Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Di Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feiwu Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiming Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jianyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kang Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Keyu Yan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mengyang Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pandeng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pingyu Wu</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20314v1-abstract-short" style="display: inline;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20314v1-abstract-full" style="display: none;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at https://github.com/Wan-Video/Wan2.1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'none'; document.getElementById('2503.20314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, 33 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20226">arXiv:2503.20226</a> <span> [<a href="https://arxiv.org/pdf/2503.20226">pdf</a>, <a href="https://arxiv.org/format/2503.20226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714074">10.1145/3706598.3714074 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Raising Awareness of Location Information Vulnerabilities in Social Media Photos using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Ying Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiquan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dongju Yang</a>, <a href="/search/cs?searchtype=author&query=Sarsenbayeva%2C+Z">Zhanna Sarsenbayeva</a>, <a href="/search/cs?searchtype=author&query=Knibbe%2C+J">Jarrod Knibbe</a>, <a href="/search/cs?searchtype=author&query=Goncalves%2C+J">Jorge Goncalves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20226v1-abstract-short" style="display: inline;"> Location privacy leaks can lead to unauthorised tracking, identity theft, and targeted attacks, compromising personal security and privacy. This study explores LLM-powered location privacy leaks associated with photo sharing on social media, focusing on user awareness, attitudes, and opinions. We developed and introduced an LLM-powered location privacy intervention app to 19 participants, who used… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20226v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20226v1-abstract-full" style="display: none;"> Location privacy leaks can lead to unauthorised tracking, identity theft, and targeted attacks, compromising personal security and privacy. This study explores LLM-powered location privacy leaks associated with photo sharing on social media, focusing on user awareness, attitudes, and opinions. We developed and introduced an LLM-powered location privacy intervention app to 19 participants, who used it over a two-week period. The app prompted users to reflect on potential privacy leaks that a widely available LLM could easily detect, such as visual landmarks & cues that could reveal their location, and provided ways to conceal this information. Through in-depth interviews, we found that our intervention effectively increased users' awareness of location privacy and the risks posed by LLMs. It also encouraged users to consider the importance of maintaining control over their privacy data and sparked discussions about the future of location privacy-preserving technologies. Based on these insights, we offer design implications to support the development of future user-centred, location privacy-preserving technologies for social media photos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20226v1-abstract-full').style.display = 'none'; document.getElementById('2503.20226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ACM CHI 2025 Conference on Human Factors in Computing Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20179">arXiv:2503.20179</a> <span> [<a href="https://arxiv.org/pdf/2503.20179">pdf</a>, <a href="https://arxiv.org/format/2503.20179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> ProtoBERT-LoRA: Parameter-Efficient Prototypical Finetuning for Immunotherapy Study Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xiyu Ding</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Kai Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jacob Zhang</a>, <a href="/search/cs?searchtype=author&query=Galinsky%2C+K">Kevin Galinsky</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengrui Wang</a>, <a href="/search/cs?searchtype=author&query=Mayers%2C+R+P">Ryan P. Mayers</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheyu Wang</a>, <a href="/search/cs?searchtype=author&query=Kharrazi%2C+H">Hadi Kharrazi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20179v1-abstract-short" style="display: inline;"> Identifying immune checkpoint inhibitor (ICI) studies in genomic repositories like Gene Expression Omnibus (GEO) is vital for cancer research yet remains challenging due to semantic ambiguity, extreme class imbalance, and limited labeled data in low-resource settings. We present ProtoBERT-LoRA, a hybrid framework that combines PubMedBERT with prototypical networks and Low-Rank Adaptation (LoRA) fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20179v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20179v1-abstract-full" style="display: none;"> Identifying immune checkpoint inhibitor (ICI) studies in genomic repositories like Gene Expression Omnibus (GEO) is vital for cancer research yet remains challenging due to semantic ambiguity, extreme class imbalance, and limited labeled data in low-resource settings. We present ProtoBERT-LoRA, a hybrid framework that combines PubMedBERT with prototypical networks and Low-Rank Adaptation (LoRA) for efficient fine-tuning. The model enforces class-separable embeddings via episodic prototype training while preserving biomedical domain knowledge. Our dataset was divided as: Training (20 positive, 20 negative), Prototype Set (10 positive, 10 negative), Validation (20 positive, 200 negative), and Test (71 positive, 765 negative). Evaluated on test dataset, ProtoBERT-LoRA achieved F1-score of 0.624 (precision: 0.481, recall: 0.887), outperforming the rule-based system, machine learning baselines and finetuned PubMedBERT. Application to 44,287 unlabeled studies reduced manual review efforts by 82%. Ablation studies confirmed that combining prototypes with LoRA improved performance by 29% over stand-alone LoRA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20179v1-abstract-full').style.display = 'none'; document.getElementById('2503.20179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to AMIA 2025 Annual Symposium</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20018">arXiv:2503.20018</a> <span> [<a href="https://arxiv.org/pdf/2503.20018">pdf</a>, <a href="https://arxiv.org/format/2503.20018">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Experience Replay Addresses Loss of Plasticity in Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiuqi Wang</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+R">Rohan Chandra</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shangtong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20018v1-abstract-short" style="display: inline;"> Loss of plasticity is one of the main challenges in continual learning with deep neural networks, where neural networks trained via backpropagation gradually lose their ability to adapt to new tasks and perform significantly worse than their freshly initialized counterparts. The main contribution of this paper is to propose a new hypothesis that experience replay addresses the loss of plasticity i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20018v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20018v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20018v1-abstract-full" style="display: none;"> Loss of plasticity is one of the main challenges in continual learning with deep neural networks, where neural networks trained via backpropagation gradually lose their ability to adapt to new tasks and perform significantly worse than their freshly initialized counterparts. The main contribution of this paper is to propose a new hypothesis that experience replay addresses the loss of plasticity in continual learning. Here, experience replay is a form of memory. We provide supporting evidence for this hypothesis. In particular, we demonstrate in multiple different tasks, including regression, classification, and policy evaluation, that by simply adding an experience replay and processing the data in the experience replay with Transformers, the loss of plasticity disappears. Notably, we do not alter any standard components of deep learning. For example, we do not change backpropagation. We do not modify the activation functions. And we do not use any regularization. We conjecture that experience replay and Transformers can address the loss of plasticity because of the in-context learning phenomenon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20018v1-abstract-full').style.display = 'none'; document.getElementById('2503.20018v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19860">arXiv:2503.19860</a> <span> [<a href="https://arxiv.org/pdf/2503.19860">pdf</a>, <a href="https://arxiv.org/format/2503.19860">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unpaired Translation of Chest X-ray Images for Lung Opacity Diagnosis via Adaptive Activation Masks and Cross-Domain Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ning%2C+J">Junzhi Ning</a>, <a href="/search/cs?searchtype=author&query=Marshall%2C+D">Dominic Marshall</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yijian Gao</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+X+X+Y">Xiaodan Xing Yang Nan</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yingying Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Komorowski%2C+M">Matthieu Komorowski</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19860v1-abstract-short" style="display: inline;"> Chest X-ray radiographs (CXRs) play a pivotal role in diagnosing and monitoring cardiopulmonary diseases. However, lung opac- ities in CXRs frequently obscure anatomical structures, impeding clear identification of lung borders and complicating the localization of pathology. This challenge significantly hampers segmentation accuracy and precise lesion identification, which are crucial for diagnosi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19860v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19860v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19860v1-abstract-full" style="display: none;"> Chest X-ray radiographs (CXRs) play a pivotal role in diagnosing and monitoring cardiopulmonary diseases. However, lung opac- ities in CXRs frequently obscure anatomical structures, impeding clear identification of lung borders and complicating the localization of pathology. This challenge significantly hampers segmentation accuracy and precise lesion identification, which are crucial for diagnosis. To tackle these issues, our study proposes an unpaired CXR translation framework that converts CXRs with lung opacities into counterparts without lung opacities while preserving semantic features. Central to our approach is the use of adaptive activation masks to selectively modify opacity regions in lung CXRs. Cross-domain alignment ensures translated CXRs without opacity issues align with feature maps and prediction labels from a pre-trained CXR lesion classifier, facilitating the interpretability of the translation process. We validate our method using RSNA, MIMIC-CXR-JPG and JSRT datasets, demonstrating superior translation quality through lower Frechet Inception Distance (FID) and Kernel Inception Distance (KID) scores compared to existing meth- ods (FID: 67.18 vs. 210.4, KID: 0.01604 vs. 0.225). Evaluation on RSNA opacity, MIMIC acute respiratory distress syndrome (ARDS) patient CXRs and JSRT CXRs show our method enhances segmentation accuracy of lung borders and improves lesion classification, further underscoring its potential in clinical settings (RSNA: mIoU: 76.58% vs. 62.58%, Sensitivity: 85.58% vs. 77.03%; MIMIC ARDS: mIoU: 86.20% vs. 72.07%, Sensitivity: 92.68% vs. 86.85%; JSRT: mIoU: 91.08% vs. 85.6%, Sensitivity: 97.62% vs. 95.04%). Our approach advances CXR imaging analysis, especially in investigating segmentation impacts through image translation techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19860v1-abstract-full').style.display = 'none'; document.getElementById('2503.19860v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19786">arXiv:2503.19786</a> <span> [<a href="https://arxiv.org/pdf/2503.19786">pdf</a>, <a href="https://arxiv.org/format/2503.19786">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemma 3 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemma+Team"> Gemma Team</a>, <a href="/search/cs?searchtype=author&query=Kamath%2C+A">Aishwarya Kamath</a>, <a href="/search/cs?searchtype=author&query=Ferret%2C+J">Johan Ferret</a>, <a href="/search/cs?searchtype=author&query=Pathak%2C+S">Shreya Pathak</a>, <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Merhej%2C+R">Ramona Merhej</a>, <a href="/search/cs?searchtype=author&query=Perrin%2C+S">Sarah Perrin</a>, <a href="/search/cs?searchtype=author&query=Matejovicova%2C+T">Tatiana Matejovicova</a>, <a href="/search/cs?searchtype=author&query=Ram%C3%A9%2C+A">Alexandre Ram茅</a>, <a href="/search/cs?searchtype=author&query=Rivi%C3%A8re%2C+M">Morgane Rivi猫re</a>, <a href="/search/cs?searchtype=author&query=Rouillard%2C+L">Louis Rouillard</a>, <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Cideron%2C+G">Geoffrey Cideron</a>, <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Ramos%2C+S">Sabela Ramos</a>, <a href="/search/cs?searchtype=author&query=Yvinec%2C+E">Edouard Yvinec</a>, <a href="/search/cs?searchtype=author&query=Casbon%2C+M">Michelle Casbon</a>, <a href="/search/cs?searchtype=author&query=Pot%2C+E">Etienne Pot</a>, <a href="/search/cs?searchtype=author&query=Penchev%2C+I">Ivo Penchev</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Ga毛l Liu</a>, <a href="/search/cs?searchtype=author&query=Visin%2C+F">Francesco Visin</a>, <a href="/search/cs?searchtype=author&query=Kenealy%2C+K">Kathleen Kenealy</a>, <a href="/search/cs?searchtype=author&query=Beyer%2C+L">Lucas Beyer</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiaohai Zhai</a>, <a href="/search/cs?searchtype=author&query=Tsitsulin%2C+A">Anton Tsitsulin</a> , et al. (191 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19786v1-abstract-short" style="display: inline;"> We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19786v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19786v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19786v1-abstract-full" style="display: none;"> We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gemma 2 for both pre-trained and instruction finetuned versions. In particular, our novel post-training recipe significantly improves the math, chat, instruction-following and multilingual abilities, making Gemma3-4B-IT competitive with Gemma2-27B-IT and Gemma3-27B-IT comparable to Gemini-1.5-Pro across benchmarks. We release all our models to the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19786v1-abstract-full').style.display = 'none'; document.getElementById('2503.19786v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19589">arXiv:2503.19589</a> <span> [<a href="https://arxiv.org/pdf/2503.19589">pdf</a>, <a href="https://arxiv.org/format/2503.19589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Guided Dual-Path UNet with Mamba for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaolei Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinyan Liu</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+T">Tianyi Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuesong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19589v1-abstract-short" style="display: inline;"> Convolutional neural networks (CNNs) and transformers are widely employed in constructing UNet architectures for medical image segmentation tasks. However, CNNs struggle to model long-range dependencies, while transformers suffer from quadratic computational complexity. Recently, Mamba, a type of State Space Models, has gained attention for its exceptional ability to model long-range interactions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19589v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19589v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19589v1-abstract-full" style="display: none;"> Convolutional neural networks (CNNs) and transformers are widely employed in constructing UNet architectures for medical image segmentation tasks. However, CNNs struggle to model long-range dependencies, while transformers suffer from quadratic computational complexity. Recently, Mamba, a type of State Space Models, has gained attention for its exceptional ability to model long-range interactions while maintaining linear computational complexity. Despite the emergence of several Mamba-based methods, they still present the following limitations: first, their network designs generally lack perceptual capabilities for the original input data; second, they primarily focus on capturing global information, while often neglecting local details. To address these challenges, we propose a prompt-guided CNN-Mamba dual-path UNet, termed PGM-UNet, for medical image segmentation. Specifically, we introduce a prompt-guided residual Mamba module that adaptively extracts dynamic visual prompts from the original input data, effectively guiding Mamba in capturing global information. Additionally, we design a local-global information fusion network, comprising a local information extraction module, a prompt-guided residual Mamba module, and a multi-focus attention fusion module, which effectively integrates local and global information. Furthermore, inspired by Kolmogorov-Arnold Networks (KANs), we develop a multi-scale information extraction module to capture richer contextual information without altering the resolution. We conduct extensive experiments on the ISIC-2017, ISIC-2018, DIAS, and DRIVE. The results demonstrate that the proposed method significantly outperforms state-of-the-art approaches in multiple medical image segmentation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19589v1-abstract-full').style.display = 'none'; document.getElementById('2503.19589v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19011">arXiv:2503.19011</a> <span> [<a href="https://arxiv.org/pdf/2503.19011">pdf</a>, <a href="https://arxiv.org/format/2503.19011">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RomanTex: Decoupling 3D-aware Rotary Positional Embedded Multi-Attention Network for Texture Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yifei Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mingxin Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuhui Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiaao Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhong Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chunchao Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19011v1-abstract-short" style="display: inline;"> Painting textures for existing geometries is a critical yet labor-intensive process in 3D asset generation. Recent advancements in text-to-image (T2I) models have led to significant progress in texture generation. Most existing research approaches this task by first generating images in 2D spaces using image diffusion models, followed by a texture baking process to achieve UV texture. However, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19011v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19011v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19011v1-abstract-full" style="display: none;"> Painting textures for existing geometries is a critical yet labor-intensive process in 3D asset generation. Recent advancements in text-to-image (T2I) models have led to significant progress in texture generation. Most existing research approaches this task by first generating images in 2D spaces using image diffusion models, followed by a texture baking process to achieve UV texture. However, these methods often struggle to produce high-quality textures due to inconsistencies among the generated multi-view images, resulting in seams and ghosting artifacts. In contrast, 3D-based texture synthesis methods aim to address these inconsistencies, but they often neglect 2D diffusion model priors, making them challenging to apply to real-world objects To overcome these limitations, we propose RomanTex, a multiview-based texture generation framework that integrates a multi-attention network with an underlying 3D representation, facilitated by our novel 3D-aware Rotary Positional Embedding. Additionally, we incorporate a decoupling characteristic in the multi-attention block to enhance the model's robustness in image-to-texture task, enabling semantically-correct back-view synthesis. Furthermore, we introduce a geometry-related Classifier-Free Guidance (CFG) mechanism to further improve the alignment with both geometries and images. Quantitative and qualitative evaluations, along with comprehensive user studies, demonstrate that our method achieves state-of-the-art results in texture quality and consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19011v1-abstract-full').style.display = 'none'; document.getElementById('2503.19011v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18854">arXiv:2503.18854</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MC-LLaVA: Multi-Concept Personalized Vision-Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+R">Ruichuan An</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Sihan Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+K">Kai Zeng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yulin Luo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiajun Cao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">Hao Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=She%2C+Q">Qi She</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18854v2-abstract-short" style="display: inline;"> Current vision-language models (VLMs) show exceptional abilities across diverse tasks, such as visual question answering. To enhance user experience, recent studies investigate VLM personalization to understand user-provided concepts. However, they mainly focus on single-concept personalization, neglecting the existence and interplay of multiple concepts, which limits real-world applicability. Thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18854v2-abstract-full').style.display = 'inline'; document.getElementById('2503.18854v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18854v2-abstract-full" style="display: none;"> Current vision-language models (VLMs) show exceptional abilities across diverse tasks, such as visual question answering. To enhance user experience, recent studies investigate VLM personalization to understand user-provided concepts. However, they mainly focus on single-concept personalization, neglecting the existence and interplay of multiple concepts, which limits real-world applicability. This paper proposes the first multi-concept personalization paradigm, MC-LLaVA. Specifically, MC-LLaVA employs a multi-concept instruction tuning strategy, effectively integrating multiple concepts in a single training step. To reduce the costs related to joint training, we propose a personalized textual prompt that uses visual token information to initialize concept tokens. Additionally, we introduce a personalized visual prompt during inference, aggregating location confidence maps for enhanced recognition and grounding capabilities. To advance multi-concept personalization research, we further contribute a high-quality instruction tuning dataset. We carefully collect images with multiple characters and objects from movies and manually generate question-answer samples for multi-concept scenarios, featuring superior diversity. Comprehensive qualitative and quantitative experiments demonstrate that MC-LLaVA can achieve impressive multi-concept personalized responses, paving the way for VLMs to become better user-specific assistants. The code and dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18854v2-abstract-full').style.display = 'none'; document.getElementById('2503.18854v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">I sincerely apologize for any inconvenience caused. We actually uploaded this paper to arXiv in November 2024, as arXiv:2411.11706. During this update, we did not consider the replacement operation of arXiv, which led to duplicate submissions. We have made modifications at the original address arXiv:2411.11706</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18671">arXiv:2503.18671</a> <span> [<a href="https://arxiv.org/pdf/2503.18671">pdf</a>, <a href="https://arxiv.org/format/2503.18671">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Structure-Aware Correspondence Learning for Relative Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yihan Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenfei Yang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Huan Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shifeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianzhu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18671v1-abstract-short" style="display: inline;"> Relative pose estimation provides a promising way for achieving object-agnostic pose estimation. Despite the success of existing 3D correspondence-based methods, the reliance on explicit feature matching suffers from small overlaps in visible regions and unreliable feature estimation for invisible regions. Inspired by humans' ability to assemble two object parts that have small or no overlapping r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18671v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18671v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18671v1-abstract-full" style="display: none;"> Relative pose estimation provides a promising way for achieving object-agnostic pose estimation. Despite the success of existing 3D correspondence-based methods, the reliance on explicit feature matching suffers from small overlaps in visible regions and unreliable feature estimation for invisible regions. Inspired by humans' ability to assemble two object parts that have small or no overlapping regions by considering object structure, we propose a novel Structure-Aware Correspondence Learning method for Relative Pose Estimation, which consists of two key modules. First, a structure-aware keypoint extraction module is designed to locate a set of kepoints that can represent the structure of objects with different shapes and appearance, under the guidance of a keypoint based image reconstruction loss. Second, a structure-aware correspondence estimation module is designed to model the intra-image and inter-image relationships between keypoints to extract structure-aware features for correspondence estimation. By jointly leveraging these two modules, the proposed method can naturally estimate 3D-3D correspondences for unseen objects without explicit feature matching for precise relative pose estimation. Experimental results on the CO3D, Objaverse and LineMOD datasets demonstrate that the proposed method significantly outperforms prior methods, i.e., with 5.7掳reduction in mean angular error on the CO3D dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18671v1-abstract-full').style.display = 'none'; document.getElementById('2503.18671v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18484">arXiv:2503.18484</a> <span> [<a href="https://arxiv.org/pdf/2503.18484">pdf</a>, <a href="https://arxiv.org/format/2503.18484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PM4Bench: A Parallel Multilingual Multi-Modal Multi-task Benchmark for Large Vision Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiahe Song</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiang Wu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Runchuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+G">Guanlin Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shasha Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xingjian Wei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haote Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Songyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijia Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lijun Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18484v1-abstract-short" style="display: inline;"> Existing multilingual benchmarks for Large Vision Language Models (LVLMs) suffer from limitations including language-specific content biases, disjointed multimodal input formats, and a lack of safety evaluation. To address these gaps, we propose PM4Bench, the first Parallel Multilingual Multi-Modal Multi-task Benchmark for LVLMs. PM4Bench features a parallel corpus design across 10 languages, enab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18484v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18484v1-abstract-full" style="display: none;"> Existing multilingual benchmarks for Large Vision Language Models (LVLMs) suffer from limitations including language-specific content biases, disjointed multimodal input formats, and a lack of safety evaluation. To address these gaps, we propose PM4Bench, the first Parallel Multilingual Multi-Modal Multi-task Benchmark for LVLMs. PM4Bench features a parallel corpus design across 10 languages, enabling fair and accurate cross-lingual comparisons. It includes the vision setting where text and queries are embedded in images, requiring LVLMs to simultaneously "see", "read", and "think", aligning with real-world applications. Additionally, PM\textsuperscript{4}Bench incorporates safety evaluations, addressing critical oversight in existing multilingual benchmarks. Using PM4Bench, we evaluate 11 mainstream LVLMs, revealing significant cross-linguistic performance disparities, particularly in vision settings, and identifying OCR capability as a key determinant of these imbalances. We will release PM4Bench at https://github.com/opendatalab/PM4Bench . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18484v1-abstract-full').style.display = 'none'; document.getElementById('2503.18484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Equal contribution: Junyuan Gao, Jiahe Song, Jiang Wu; Corresponding author: Conghui He</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17994">arXiv:2503.17994</a> <span> [<a href="https://arxiv.org/pdf/2503.17994">pdf</a>, <a href="https://arxiv.org/format/2503.17994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Instructing the Architecture Search for Spatial-temporal Sequence Forecasting with LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xin Xue</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Haoyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yizhou Long</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17994v1-abstract-short" style="display: inline;"> Spatial-temporal sequence forecasting (STSF) is a long-standing research problem with widespread real-world applications. Neural architecture search (NAS), which automates the neural network design, has been shown effective in tackling the STSF problem. However, the existing NAS methods for STSF focus on generating architectures in a time-consuming data-driven fashion, which heavily limits their a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17994v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17994v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17994v1-abstract-full" style="display: none;"> Spatial-temporal sequence forecasting (STSF) is a long-standing research problem with widespread real-world applications. Neural architecture search (NAS), which automates the neural network design, has been shown effective in tackling the STSF problem. However, the existing NAS methods for STSF focus on generating architectures in a time-consuming data-driven fashion, which heavily limits their ability to use background knowledge and explore the complicated search trajectory. Large language models (LLMs) have shown remarkable ability in decision-making with comprehensive internal world knowledge, but how it could benefit NAS for STSF remains unexplored. In this paper, we propose a novel NAS method for STSF based on LLM. Instead of directly generate architectures with LLM, We inspire the LLM's capability with a multi-level enhancement mechanism. Specifically, on the step-level, we decompose the generation task into decision steps with powerful prompt engineering and inspire LLM to serve as instructor for architecture search based on its internal knowledge. On the instance-level, we utilize a one-step tuning framework to quickly evaluate the architecture instance and a memory bank to cumulate knowledge to improve LLM's search ability. On the task-level, we propose a two-stage architecture search, balancing the exploration stage and optimization stage, to reduce the possibility of being trapped in local optima. Extensive experimental results demonstrate that our method can achieve competitive effectiveness with superior efficiency against existing NAS methods for STSF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17994v1-abstract-full').style.display = 'none'; document.getElementById('2503.17994v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17938">arXiv:2503.17938</a> <span> [<a href="https://arxiv.org/pdf/2503.17938">pdf</a>, <a href="https://arxiv.org/format/2503.17938">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Selecting and Pruning: A Differentiable Causal Sequentialized State-Space Model for Two-View Correspondence Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+X">Xiang Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shihua Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+T">Tao Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Huabing Zhou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiayi Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17938v1-abstract-short" style="display: inline;"> Two-view correspondence learning aims to discern true and false correspondences between image pairs by recognizing their underlying different information. Previous methods either treat the information equally or require the explicit storage of the entire context, tending to be laborious in real-world scenarios. Inspired by Mamba's inherent selectivity, we propose \textbf{CorrMamba}, a \textbf{Corr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17938v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17938v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17938v1-abstract-full" style="display: none;"> Two-view correspondence learning aims to discern true and false correspondences between image pairs by recognizing their underlying different information. Previous methods either treat the information equally or require the explicit storage of the entire context, tending to be laborious in real-world scenarios. Inspired by Mamba's inherent selectivity, we propose \textbf{CorrMamba}, a \textbf{Corr}espondence filter leveraging \textbf{Mamba}'s ability to selectively mine information from true correspondences while mitigating interference from false ones, thus achieving adaptive focus at a lower cost. To prevent Mamba from being potentially impacted by unordered keypoints that obscured its ability to mine spatial information, we customize a causal sequential learning approach based on the Gumbel-Softmax technique to establish causal dependencies between features in a fully autonomous and differentiable manner. Additionally, a local-context enhancement module is designed to capture critical contextual cues essential for correspondence pruning, complementing the core framework. Extensive experiments on relative pose estimation, visual localization, and analysis demonstrate that CorrMamba achieves state-of-the-art performance. Notably, in outdoor relative pose estimation, our method surpasses the previous SOTA by $2.58$ absolute percentage points in AUC@20\textdegree, highlighting its practical superiority. Our code will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17938v1-abstract-full').style.display = 'none'; document.getElementById('2503.17938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17693">arXiv:2503.17693</a> <span> [<a href="https://arxiv.org/pdf/2503.17693">pdf</a>, <a href="https://arxiv.org/format/2503.17693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Conditional Diffusion Model with OOD Mitigation as High-Dimensional Offline Resource Allocation Planner in Clustered Ad Hoc Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+K">Kechen Meng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sinuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rongpeng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chan Wang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+M">Ming Lei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhifeng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17693v1-abstract-short" style="display: inline;"> Due to network delays and scalability limitations, clustered ad hoc networks widely adopt Reinforcement Learning (RL) for on-demand resource allocation. Albeit its demonstrated agility, traditional Model-Free RL (MFRL) solutions struggle to tackle the huge action space, which generally explodes exponentially along with the number of resource allocation units, enduring low sampling efficiency and h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17693v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17693v1-abstract-full" style="display: none;"> Due to network delays and scalability limitations, clustered ad hoc networks widely adopt Reinforcement Learning (RL) for on-demand resource allocation. Albeit its demonstrated agility, traditional Model-Free RL (MFRL) solutions struggle to tackle the huge action space, which generally explodes exponentially along with the number of resource allocation units, enduring low sampling efficiency and high interaction cost. In contrast to MFRL, Model-Based RL (MBRL) offers an alternative solution to boost sample efficiency and stabilize the training by explicitly leveraging a learned environment model. However, establishing an accurate dynamic model for complex and noisy environments necessitates a careful balance between model accuracy and computational complexity $\&$ stability. To address these issues, we propose a Conditional Diffusion Model Planner (CDMP) for high-dimensional offline resource allocation in clustered ad hoc networks. By leveraging the astonishing generative capability of Diffusion Models (DMs), our approach enables the accurate modeling of high-quality environmental dynamics while leveraging an inverse dynamics model to plan a superior policy. Beyond simply adopting DMs in offline RL, we further incorporate the CDMP algorithm with a theoretically guaranteed, uncertainty-aware penalty metric, which theoretically and empirically manifests itself in mitigating the Out-of-Distribution (OOD)-induced distribution shift issue underlying scarce training data. Extensive experiments also show that our model outperforms MFRL in average reward and Quality of Service (QoS) while demonstrating comparable performance to other MBRL algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17693v1-abstract-full').style.display = 'none'; document.getElementById('2503.17693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17453">arXiv:2503.17453</a> <span> [<a href="https://arxiv.org/pdf/2503.17453">pdf</a>, <a href="https://arxiv.org/format/2503.17453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Feature-Based Dual Visual Feature Extraction Model for Compound Multimodal Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ran Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Cong Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Longjiang Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Z">Zhuofan Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Hailiang Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shun Chen</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+Z">Zheng Lian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17453v1-abstract-short" style="display: inline;"> This article presents our results for the eighth Affective Behavior Analysis in-the-wild (ABAW) competition.Multimodal emotion recognition (ER) has important applications in affective computing and human-computer interaction. However, in the real world, compound emotion recognition faces greater issues of uncertainty and modal conflicts. For the Compound Expression (CE) Recognition Challenge,this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17453v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17453v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17453v1-abstract-full" style="display: none;"> This article presents our results for the eighth Affective Behavior Analysis in-the-wild (ABAW) competition.Multimodal emotion recognition (ER) has important applications in affective computing and human-computer interaction. However, in the real world, compound emotion recognition faces greater issues of uncertainty and modal conflicts. For the Compound Expression (CE) Recognition Challenge,this paper proposes a multimodal emotion recognition method that fuses the features of Vision Transformer (ViT) and Residual Network (ResNet). We conducted experiments on the C-EXPR-DB and MELD datasets. The results show that in scenarios with complex visual and audio cues (such as C-EXPR-DB), the model that fuses the features of ViT and ResNet exhibits superior performance.Our code are avalible on https://github.com/MyGitHub-ax/8th_ABAW <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17453v1-abstract-full').style.display = 'none'; document.getElementById('2503.17453v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16545">arXiv:2503.16545</a> <span> [<a href="https://arxiv.org/pdf/2503.16545">pdf</a>, <a href="https://arxiv.org/format/2503.16545">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EmpathyAgent: Can Embodied Agents Conduct Empathetic Actions? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyan Chen</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiaxin Ge</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+H">Hongming Dai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Q">Qiuxuan Feng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingtong Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16545v1-abstract-short" style="display: inline;"> Empathy is fundamental to human interactions, yet it remains unclear whether embodied agents can provide human-like empathetic support. Existing works have studied agents' tasks solving and social interactions abilities, but whether agents can understand empathetic needs and conduct empathetic behaviors remains overlooked. To address this, we introduce EmpathyAgent, the first benchmark to evaluate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16545v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16545v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16545v1-abstract-full" style="display: none;"> Empathy is fundamental to human interactions, yet it remains unclear whether embodied agents can provide human-like empathetic support. Existing works have studied agents' tasks solving and social interactions abilities, but whether agents can understand empathetic needs and conduct empathetic behaviors remains overlooked. To address this, we introduce EmpathyAgent, the first benchmark to evaluate and enhance agents' empathetic actions across diverse scenarios. EmpathyAgent contains 10,000 multimodal samples with corresponding empathetic task plans and three different challenges. To systematically evaluate the agents' empathetic actions, we propose an empathy-specific evaluation suite that evaluates the agents' empathy process. We benchmark current models and found that exhibiting empathetic actions remains a significant challenge. Meanwhile, we train Llama3-8B using EmpathyAgent and find it can potentially enhance empathetic behavior. By establishing a standard benchmark for evaluating empathetic actions, we hope to advance research in empathetic embodied agents. Our code and data are publicly available at https://github.com/xinyan-cxy/EmpathyAgent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16545v1-abstract-full').style.display = 'none'; document.getElementById('2503.16545v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16338">arXiv:2503.16338</a> <span> [<a href="https://arxiv.org/pdf/2503.16338">pdf</a>, <a href="https://arxiv.org/format/2503.16338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gaussian Graph Network: Learning Efficient and Generalizable Gaussian Representations from Multi-view Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengjun Zhang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+X">Xin Fei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fangfu Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Haixu Song</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yueqi Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16338v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view synthesis performance. While conventional methods require per-scene optimization, more recently several feed-forward methods have been proposed to generate pixel-aligned Gaussian representations with a learnable network, which are generalizable to different scenes. However, these methods simply combine pixel-aligned Gaussians from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16338v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16338v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16338v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view synthesis performance. While conventional methods require per-scene optimization, more recently several feed-forward methods have been proposed to generate pixel-aligned Gaussian representations with a learnable network, which are generalizable to different scenes. However, these methods simply combine pixel-aligned Gaussians from multiple views as scene representations, thereby leading to artifacts and extra memory cost without fully capturing the relations of Gaussians from different images. In this paper, we propose Gaussian Graph Network (GGN) to generate efficient and generalizable Gaussian representations. Specifically, we construct Gaussian Graphs to model the relations of Gaussian groups from different views. To support message passing at Gaussian level, we reformulate the basic graph operations over Gaussian representations, enabling each Gaussian to benefit from its connected Gaussian groups with Gaussian feature fusion. Furthermore, we design a Gaussian pooling layer to aggregate various Gaussian groups for efficient representations. We conduct experiments on the large-scale RealEstate10K and ACID datasets to demonstrate the efficiency and generalization of our method. Compared to the state-of-the-art methods, our model uses fewer Gaussians and achieves better image quality with higher rendering speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16338v1-abstract-full').style.display = 'none'; document.getElementById('2503.16338v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15847">arXiv:2503.15847</a> <span> [<a href="https://arxiv.org/pdf/2503.15847">pdf</a>, <a href="https://arxiv.org/format/2503.15847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond Local Selection: Global Cut Selection for Enhanced Mixed-Integer Programming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuli Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shaoang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang-Yang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15847v1-abstract-short" style="display: inline;"> In mixed-integer programming (MIP) solvers, cutting planes are essential for Branch-and-Cut (B&C) algorithms as they reduce the search space and accelerate the solving process. Traditional methods rely on hard-coded heuristics for cut plane selection but fail to leverage problem-specific structural features. Recent machine learning approaches use neural networks for cut selection but focus narrowl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15847v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15847v1-abstract-full" style="display: none;"> In mixed-integer programming (MIP) solvers, cutting planes are essential for Branch-and-Cut (B&C) algorithms as they reduce the search space and accelerate the solving process. Traditional methods rely on hard-coded heuristics for cut plane selection but fail to leverage problem-specific structural features. Recent machine learning approaches use neural networks for cut selection but focus narrowly on the efficiency of single-node within the B&C algorithm, without considering the broader contextual information. To address this, we propose Global Cut Selection (GCS), which uses a bipartite graph to represent the search tree and combines graph neural networks with reinforcement learning to develop cut selection strategies. Unlike prior methods, GCS applies cutting planes across all nodes, incorporating richer contextual information. Experiments show GCS significantly improves solving efficiency for synthetic and large-scale real-world MIPs compared to traditional and learning-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15847v1-abstract-full').style.display = 'none'; document.getElementById('2503.15847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15550">arXiv:2503.15550</a> <span> [<a href="https://arxiv.org/pdf/2503.15550">pdf</a>, <a href="https://arxiv.org/format/2503.15550">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Zero-Knowledge Federated Learning: A New Trustworthy and Privacy-Preserving Distributed Learning Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yuxin Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Taotao Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qing Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Long Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengli Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15550v2-abstract-short" style="display: inline;"> Federated Learning (FL) has emerged as a promising paradigm in distributed machine learning, enabling collaborative model training while preserving data privacy. However, despite its many advantages, FL still contends with significant challenges -- most notably regarding security and trust. Zero-Knowledge Proofs (ZKPs) offer a potential solution by establishing trust and enhancing system integrity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15550v2-abstract-full').style.display = 'inline'; document.getElementById('2503.15550v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15550v2-abstract-full" style="display: none;"> Federated Learning (FL) has emerged as a promising paradigm in distributed machine learning, enabling collaborative model training while preserving data privacy. However, despite its many advantages, FL still contends with significant challenges -- most notably regarding security and trust. Zero-Knowledge Proofs (ZKPs) offer a potential solution by establishing trust and enhancing system integrity throughout the FL process. Although several studies have explored ZKP-based FL (ZK-FL), a systematic framework and comprehensive analysis are still lacking. This article makes two key contributions. First, we propose a structured ZK-FL framework that categorizes and analyzes the technical roles of ZKPs across various FL stages and tasks. Second, we introduce a novel algorithm, Verifiable Client Selection FL (Veri-CS-FL), which employs ZKPs to refine the client selection process. In Veri-CS-FL, participating clients generate verifiable proofs for the performance metrics of their local models and submit these concise proofs to the server for efficient verification. The server then selects clients with high-quality local models for uploading, subsequently aggregating the contributions from these selected clients. By integrating ZKPs, Veri-CS-FL not only ensures the accuracy of performance metrics but also fortifies trust among participants while enhancing the overall efficiency and security of FL systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15550v2-abstract-full').style.display = 'none'; document.getElementById('2503.15550v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14537">arXiv:2503.14537</a> <span> [<a href="https://arxiv.org/pdf/2503.14537">pdf</a>, <a href="https://arxiv.org/format/2503.14537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning-based 3D Reconstruction in Autonomous Driving: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+L">Liewen Liao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+W">Weihao Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Songan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14537v2-abstract-short" style="display: inline;"> Learning-based 3D reconstruction has emerged as a transformative technique in autonomous driving, enabling precise modeling of both dynamic and static environments through advanced neural representations. Despite data augmentation, 3D reconstruction inspires pioneering solution for vital tasks in the field of autonomous driving, such as scene understanding and closed-loop simulation. We investigat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14537v2-abstract-full').style.display = 'inline'; document.getElementById('2503.14537v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14537v2-abstract-full" style="display: none;"> Learning-based 3D reconstruction has emerged as a transformative technique in autonomous driving, enabling precise modeling of both dynamic and static environments through advanced neural representations. Despite data augmentation, 3D reconstruction inspires pioneering solution for vital tasks in the field of autonomous driving, such as scene understanding and closed-loop simulation. We investigates the details of 3D reconstruction and conducts a multi-perspective, in-depth analysis of recent advancements. Specifically, we first provide a systematic introduction of preliminaries, including data modalities, benchmarks and technical preliminaries of learning-based 3D reconstruction, facilitating instant identification of suitable methods according to sensor suites. Then, we systematically review learning-based 3D reconstruction methods in autonomous driving, categorizing approaches by subtasks and conducting multi-dimensional analysis and summary to establish a comprehensive technical reference. The development trends and existing challenges are summarized in the context of learning-based 3D reconstruction in autonomous driving. We hope that our review will inspire future researches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14537v2-abstract-full').style.display = 'none'; document.getElementById('2503.14537v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14517">arXiv:2503.14517</a> <span> [<a href="https://arxiv.org/pdf/2503.14517">pdf</a>, <a href="https://arxiv.org/format/2503.14517">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Cafe-Talk: Generating 3D Talking Face Animation with Multimodal Coarse- and Fine-grained Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hejia Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoxian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shoulong Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+S">Sisi Zhuang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuai Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14517v1-abstract-short" style="display: inline;"> Speech-driven 3D talking face method should offer both accurate lip synchronization and controllable expressions. Previous methods solely adopt discrete emotion labels to globally control expressions throughout sequences while limiting flexible fine-grained facial control within the spatiotemporal domain. We propose a diffusion-transformer-based 3D talking face generation model, Cafe-Talk, which s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14517v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14517v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14517v1-abstract-full" style="display: none;"> Speech-driven 3D talking face method should offer both accurate lip synchronization and controllable expressions. Previous methods solely adopt discrete emotion labels to globally control expressions throughout sequences while limiting flexible fine-grained facial control within the spatiotemporal domain. We propose a diffusion-transformer-based 3D talking face generation model, Cafe-Talk, which simultaneously incorporates coarse- and fine-grained multimodal control conditions. Nevertheless, the entanglement of multiple conditions challenges achieving satisfying performance. To disentangle speech audio and fine-grained conditions, we employ a two-stage training pipeline. Specifically, Cafe-Talk is initially trained using only speech audio and coarse-grained conditions. Then, a proposed fine-grained control adapter gradually adds fine-grained instructions represented by action units (AUs), preventing unfavorable speech-lip synchronization. To disentangle coarse- and fine-grained conditions, we design a swap-label training mechanism, which enables the dominance of the fine-grained conditions. We also devise a mask-based CFG technique to regulate the occurrence and intensity of fine-grained control. In addition, a text-based detector is introduced with text-AU alignment to enable natural language user input and further support multimodal control. Extensive experimental results prove that Cafe-Talk achieves state-of-the-art lip synchronization and expressiveness performance and receives wide acceptance in fine-grained control in user studies. Project page: https://harryxd2018.github.io/cafe-talk/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14517v1-abstract-full').style.display = 'none'; document.getElementById('2503.14517v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR'25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14411">arXiv:2503.14411</a> <span> [<a href="https://arxiv.org/pdf/2503.14411">pdf</a>, <a href="https://arxiv.org/format/2503.14411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unifying Text Semantics and Graph Structures for Temporal Text-attributed Graphs with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yateng Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Zian Jia</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Z">Zehao Gu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiarong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14411v1-abstract-short" style="display: inline;"> Temporal graph neural networks (TGNNs) have shown remarkable performance in temporal graph modeling. However, real-world temporal graphs often possess rich textual information, giving rise to temporal text-attributed graphs (TTAGs). Such combination of dynamic text semantics and evolving graph structures introduces heightened complexity. Existing TGNNs embed texts statically and rely heavily on en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14411v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14411v1-abstract-full" style="display: none;"> Temporal graph neural networks (TGNNs) have shown remarkable performance in temporal graph modeling. However, real-world temporal graphs often possess rich textual information, giving rise to temporal text-attributed graphs (TTAGs). Such combination of dynamic text semantics and evolving graph structures introduces heightened complexity. Existing TGNNs embed texts statically and rely heavily on encoding mechanisms that biasedly prioritize structural information, overlooking the temporal evolution of text semantics and the essential interplay between semantics and structures for synergistic reinforcement. To tackle these issues, we present \textbf{Cross}, a novel framework that seamlessly extends existing TGNNs for TTAG modeling. The key idea is to employ the advanced large language models (LLMs) to extract the dynamic semantics in text space and then generate expressive representations unifying both semantics and structures. Specifically, we propose a Temporal Semantics Extractor in the {Cross} framework, which empowers the LLM to offer the temporal semantic understanding of node's evolving contexts of textual neighborhoods, facilitating semantic dynamics. Subsequently, we introduce the Semantic-structural Co-encoder, which collaborates with the above Extractor for synthesizing illuminating representations by jointly considering both semantic and structural information while encouraging their mutual reinforcement. Extensive experimental results on four public datasets and one practical industrial dataset demonstrate {Cross}'s significant effectiveness and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14411v1-abstract-full').style.display = 'none'; document.getElementById('2503.14411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submit to ICML2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14074">arXiv:2503.14074</a> <span> [<a href="https://arxiv.org/pdf/2503.14074">pdf</a>, <a href="https://arxiv.org/format/2503.14074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMM.2023.3286278">10.1109/TMM.2023.3286278 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Limb-Aware Virtual Try-On Network with Progressive Clothing Warping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengping Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyu Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weigang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Hongxun Yao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14074v1-abstract-short" style="display: inline;"> Image-based virtual try-on aims to transfer an in-shop clothing image to a person image. Most existing methods adopt a single global deformation to perform clothing warping directly, which lacks fine-grained modeling of in-shop clothing and leads to distorted clothing appearance. In addition, existing methods usually fail to generate limb details well because they are limited by the used clothing-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14074v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14074v1-abstract-full" style="display: none;"> Image-based virtual try-on aims to transfer an in-shop clothing image to a person image. Most existing methods adopt a single global deformation to perform clothing warping directly, which lacks fine-grained modeling of in-shop clothing and leads to distorted clothing appearance. In addition, existing methods usually fail to generate limb details well because they are limited by the used clothing-agnostic person representation without referring to the limb textures of the person image. To address these problems, we propose Limb-aware Virtual Try-on Network named PL-VTON, which performs fine-grained clothing warping progressively and generates high-quality try-on results with realistic limb details. Specifically, we present Progressive Clothing Warping (PCW) that explicitly models the location and size of in-shop clothing and utilizes a two-stage alignment strategy to progressively align the in-shop clothing with the human body. Moreover, a novel gravity-aware loss that considers the fit of the person wearing clothing is adopted to better handle the clothing edges. Then, we design Person Parsing Estimator (PPE) with a non-limb target parsing map to semantically divide the person into various regions, which provides structural constraints on the human body and therefore alleviates texture bleeding between clothing and body regions. Finally, we introduce Limb-aware Texture Fusion (LTF) that focuses on generating realistic details in limb regions, where a coarse try-on result is first generated by fusing the warped clothing image with the person image, then limb textures are further fused with the coarse result under limb-aware guidance to refine limb details. Extensive experiments demonstrate that our PL-VTON outperforms the state-of-the-art methods both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14074v1-abstract-full').style.display = 'none'; document.getElementById('2503.14074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Multimedia (TMM). The code is available at https://github.com/aipixel/PL-VTONv2</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in IEEE Transactions on Multimedia, vol. 26, pp. 1731-1746, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14040">arXiv:2503.14040</a> <span> [<a href="https://arxiv.org/pdf/2503.14040">pdf</a>, <a href="https://arxiv.org/format/2503.14040">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> MAG: Multi-Modal Aligned Autoregressive Co-Speech Gesture Generation without Vector Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Binjie Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lina Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sanyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+S">Songen Gu</a>, <a href="/search/cs?searchtype=author&query=Zhi%2C+Y">Yihao Zhi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+T">Tianyi Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lei Yang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+L">Long Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14040v1-abstract-short" style="display: inline;"> This work focuses on full-body co-speech gesture generation. Existing methods typically employ an autoregressive model accompanied by vector-quantized tokens for gesture generation, which results in information loss and compromises the realism of the generated gestures. To address this, inspired by the natural continuity of real-world human motion, we propose MAG, a novel multi-modal aligned frame… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14040v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14040v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14040v1-abstract-full" style="display: none;"> This work focuses on full-body co-speech gesture generation. Existing methods typically employ an autoregressive model accompanied by vector-quantized tokens for gesture generation, which results in information loss and compromises the realism of the generated gestures. To address this, inspired by the natural continuity of real-world human motion, we propose MAG, a novel multi-modal aligned framework for high-quality and diverse co-speech gesture synthesis without relying on discrete tokenization. Specifically, (1) we introduce a motion-text-audio-aligned variational autoencoder (MTA-VAE), which leverages pre-trained WavCaps' text and audio embeddings to enhance both semantic and rhythmic alignment with motion, ultimately producing more realistic gestures. (2) Building on this, we propose a multimodal masked autoregressive model (MMAG) that enables autoregressive modeling in continuous motion embeddings through diffusion without vector quantization. To further ensure multi-modal consistency, MMAG incorporates a hybrid granularity audio-text fusion block, which serves as conditioning for diffusion process. Extensive experiments on two benchmark datasets demonstrate that MAG achieves stateof-the-art performance both quantitatively and qualitatively, producing highly realistic and diverse co-speech gestures.The code will be released to facilitate future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14040v1-abstract-full').style.display = 'none'; document.getElementById('2503.14040v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13966">arXiv:2503.13966</a> <span> [<a href="https://arxiv.org/pdf/2503.13966">pdf</a>, <a href="https://arxiv.org/format/2503.13966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> FlexVLN: Flexible Adaptation for Diverse Vision-and-Language Navigation Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yanyuan Qiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qunbo Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Longteng Guo</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhihua Wei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13966v1-abstract-short" style="display: inline;"> The aspiration of the Vision-and-Language Navigation (VLN) task has long been to develop an embodied agent with robust adaptability, capable of seamlessly transferring its navigation capabilities across various tasks. Despite remarkable advancements in recent years, most methods necessitate dataset-specific training, thereby lacking the capability to generalize across diverse datasets encompassing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13966v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13966v1-abstract-full" style="display: none;"> The aspiration of the Vision-and-Language Navigation (VLN) task has long been to develop an embodied agent with robust adaptability, capable of seamlessly transferring its navigation capabilities across various tasks. Despite remarkable advancements in recent years, most methods necessitate dataset-specific training, thereby lacking the capability to generalize across diverse datasets encompassing distinct types of instructions. Large language models (LLMs) have demonstrated exceptional reasoning and generalization abilities, exhibiting immense potential in robot action planning. In this paper, we propose FlexVLN, an innovative hierarchical approach to VLN that integrates the fundamental navigation ability of a supervised-learning-based Instruction Follower with the robust generalization ability of the LLM Planner, enabling effective generalization across diverse VLN datasets. Moreover, a verification mechanism and a multi-model integration mechanism are proposed to mitigate potential hallucinations by the LLM Planner and enhance execution accuracy of the Instruction Follower. We take REVERIE, SOON, and CVDN-target as out-of-domain datasets for assessing generalization ability. The generalization performance of FlexVLN surpasses that of all the previous methods to a large extent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13966v1-abstract-full').style.display = 'none'; document.getElementById('2503.13966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13926">arXiv:2503.13926</a> <span> [<a href="https://arxiv.org/pdf/2503.13926">pdf</a>, <a href="https://arxiv.org/format/2503.13926">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Shape-Independent Transformation via Spherical Representations for Category-Level Object Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+H">Huan Ren</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenfei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shifeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianzhu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13926v2-abstract-short" style="display: inline;"> Category-level object pose estimation aims to determine the pose and size of novel objects in specific categories. Existing correspondence-based approaches typically adopt point-based representations to establish the correspondences between primitive observed points and normalized object coordinates. However, due to the inherent shape-dependence of canonical coordinates, these methods suffer from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13926v2-abstract-full').style.display = 'inline'; document.getElementById('2503.13926v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13926v2-abstract-full" style="display: none;"> Category-level object pose estimation aims to determine the pose and size of novel objects in specific categories. Existing correspondence-based approaches typically adopt point-based representations to establish the correspondences between primitive observed points and normalized object coordinates. However, due to the inherent shape-dependence of canonical coordinates, these methods suffer from semantic incoherence across diverse object shapes. To resolve this issue, we innovatively leverage the sphere as a shared proxy shape of objects to learn shape-independent transformation via spherical representations. Based on this insight, we introduce a novel architecture called SpherePose, which yields precise correspondence prediction through three core designs. Firstly, We endow the point-wise feature extraction with SO(3)-invariance, which facilitates robust mapping between camera coordinate space and object coordinate space regardless of rotation transformation. Secondly, the spherical attention mechanism is designed to propagate and integrate features among spherical anchors from a comprehensive perspective, thus mitigating the interference of noise and incomplete point cloud. Lastly, a hyperbolic correspondence loss function is designed to distinguish subtle distinctions, which can promote the precision of correspondence prediction. Experimental results on CAMERA25, REAL275 and HouseCat6D benchmarks demonstrate the superior performance of our method, verifying the effectiveness of spherical representations and architectural innovations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13926v2-abstract-full').style.display = 'none'; document.getElementById('2503.13926v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025. Project page is available at https://renhuan1999.github.io/SpherePose</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13522">arXiv:2503.13522</a> <span> [<a href="https://arxiv.org/pdf/2503.13522">pdf</a>, <a href="https://arxiv.org/ps/2503.13522">ps</a>, <a href="https://arxiv.org/format/2503.13522">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Advanced Deep Learning Methods for Protein Structure Prediction and Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichao Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+N">Ningyuan Deng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xinyuan Song</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Z">Ziqian Bi</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zheyu Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Keyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Q">Qian Niu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junyu Liu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Benji Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sen Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xuanhe Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinlang Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pohsun Feng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yizhu Wen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L+K">Lawrence KQ Yan</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+H">Hongming Tseng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yan Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunze Wang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Ziyuan Qin</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+B">Bowen Jing</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junjie Yang</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13522v2-abstract-short" style="display: inline;"> After AlphaFold won the Nobel Prize, protein prediction with deep learning once again became a hot topic. We comprehensively explore advanced deep learning methods applied to protein structure prediction and design. It begins by examining recent innovations in prediction architectures, with detailed discussions on improvements such as diffusion based frameworks and novel pairwise attention modules… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13522v2-abstract-full').style.display = 'inline'; document.getElementById('2503.13522v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13522v2-abstract-full" style="display: none;"> After AlphaFold won the Nobel Prize, protein prediction with deep learning once again became a hot topic. We comprehensively explore advanced deep learning methods applied to protein structure prediction and design. It begins by examining recent innovations in prediction architectures, with detailed discussions on improvements such as diffusion based frameworks and novel pairwise attention modules. The text analyses key components including structure generation, evaluation metrics, multiple sequence alignment processing, and network architecture, thereby illustrating the current state of the art in computational protein modelling. Subsequent chapters focus on practical applications, presenting case studies that range from individual protein predictions to complex biomolecular interactions. Strategies for enhancing prediction accuracy and integrating deep learning techniques with experimental validation are thoroughly explored. The later sections review the industry landscape of protein design, highlighting the transformative role of artificial intelligence in biotechnology and discussing emerging market trends and future challenges. Supplementary appendices provide essential resources such as databases and open source tools, making this volume a valuable reference for researchers and students. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13522v2-abstract-full').style.display = 'none'; document.getElementById('2503.13522v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13447">arXiv:2503.13447</a> <span> [<a href="https://arxiv.org/pdf/2503.13447">pdf</a>, <a href="https://arxiv.org/format/2503.13447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MetaScale: Test-Time Scaling with Evolving Meta-Thoughts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J+Y">James Y. Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Poon%2C+H">Hoifung Poon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Muhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13447v1-abstract-short" style="display: inline;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13447v1-abstract-full" style="display: none;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce METASCALE, a test-time scaling framework based on meta-thoughts -- adaptive thinking strategies tailored to each task. METASCALE initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, METASCALE improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11% performance gain in win rate on Arena-Hard for GPT-4o, surpassing o1-mini by 0.9% under style control. Notably, METASCALE scales more effectively with increasing sampling budgets and produces more structured, expert-level responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'none'; document.getElementById('2503.13447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13322">arXiv:2503.13322</a> <span> [<a href="https://arxiv.org/pdf/2503.13322">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SMPR: A structure-enhanced multimodal drug-disease prediction model for drug repositioning and cold start </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xin Dong</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+R">Rui Miao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Suyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shuaibing Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Leifeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yong Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianhua Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y+Z">Yi Zhun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13322v1-abstract-short" style="display: inline;"> Repositioning drug-disease relationships has always been a hot field of research. However, actual cases of biologically validated drug relocation remain very limited, and existing models have not yet fully utilized the structural information of the drug. Furthermore, most repositioning models are only used to complete the relationship matrix, and their practicality is poor when dealing with drug c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13322v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13322v1-abstract-full" style="display: none;"> Repositioning drug-disease relationships has always been a hot field of research. However, actual cases of biologically validated drug relocation remain very limited, and existing models have not yet fully utilized the structural information of the drug. Furthermore, most repositioning models are only used to complete the relationship matrix, and their practicality is poor when dealing with drug cold start problems. This paper proposes a structure-enhanced multimodal relationship prediction model (SMRP). SMPR is based on the SMILE structure of the drug, using the Mol2VEC method to generate drug embedded representations, and learn disease embedded representations through heterogeneous network graph neural networks. Ultimately, a drug-disease relationship matrix is constructed. In addition, to reduce the difficulty of users' use, SMPR also provides a cold start interface based on structural similarity based on reposition results to simply and quickly predict drug-related diseases. The repositioning ability and cold start capability of the model are verified from multiple perspectives. While the AUC and ACUPR scores of repositioning reach 99% and 61% respectively, the AUC of cold start achieve 80%. In particular, the cold start Recall indicator can reach more than 70%, which means that SMPR is more sensitive to positive samples. Finally, case analysis is used to verify the practical value of the model and visual analysis directly demonstrates the improvement of the structure to the model. For quick use, we also provide local deployment of the model and package it into an executable program. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13322v1-abstract-full').style.display = 'none'; document.getElementById('2503.13322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12746">arXiv:2503.12746</a> <span> [<a href="https://arxiv.org/pdf/2503.12746">pdf</a>, <a href="https://arxiv.org/ps/2503.12746">ps</a>, <a href="https://arxiv.org/format/2503.12746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Geometry">cs.CG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Constant Approximation of Fr茅chet Distance in Strongly Subquadratic Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Siu-Wing Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoqiang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12746v1-abstract-short" style="display: inline;"> Let $蟿$ and $蟽$ be two polygonal curves in $\mathbb{R}^d$ for any fixed $d$. Suppose that $蟿$ and $蟽$ have $n$ and $m$ vertices, respectively, and $m\le n$. While conditional lower bounds prevent approximating the Fr茅chet distance between $蟿$ and $蟽$ within a factor of 3 in strongly subquadratic time, the current best approximation algorithm attains a ratio of $n^c$ in strongly subquadratic time,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12746v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12746v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12746v1-abstract-full" style="display: none;"> Let $蟿$ and $蟽$ be two polygonal curves in $\mathbb{R}^d$ for any fixed $d$. Suppose that $蟿$ and $蟽$ have $n$ and $m$ vertices, respectively, and $m\le n$. While conditional lower bounds prevent approximating the Fr茅chet distance between $蟿$ and $蟽$ within a factor of 3 in strongly subquadratic time, the current best approximation algorithm attains a ratio of $n^c$ in strongly subquadratic time, for some constant $c\in(0,1)$. We present a randomized algorithm with running time $O(nm^{0.99}\log(n/\varepsilon))$ that approximates the Fr茅chet distance within a factor of $7+\varepsilon$, with a success probability at least $1-1/n^6$. We also adapt our techniques to develop a randomized algorithm that approximates the \emph{discrete} Fr茅chet distance within a factor of $7+\varepsilon$ in strongly subquadratic time. They are the first algorithms to approximate the Fr茅chet distance and the discrete Fr茅chet distance within constant factors in strongly subquadratic time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12746v1-abstract-full').style.display = 'none'; document.getElementById('2503.12746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at STOC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12588">arXiv:2503.12588</a> <span> [<a href="https://arxiv.org/pdf/2503.12588">pdf</a>, <a href="https://arxiv.org/format/2503.12588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3503161.3547999">10.1145/3503161.3547999 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Progressive Limb-Aware Virtual Try-On </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyu Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengping Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qinglin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zonglin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenyang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12588v1-abstract-short" style="display: inline;"> Existing image-based virtual try-on methods directly transfer specific clothing to a human image without utilizing clothing attributes to refine the transferred clothing geometry and textures, which causes incomplete and blurred clothing appearances. In addition, these methods usually mask the limb textures of the input for the clothing-agnostic person representation, which results in inaccurate p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12588v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12588v1-abstract-full" style="display: none;"> Existing image-based virtual try-on methods directly transfer specific clothing to a human image without utilizing clothing attributes to refine the transferred clothing geometry and textures, which causes incomplete and blurred clothing appearances. In addition, these methods usually mask the limb textures of the input for the clothing-agnostic person representation, which results in inaccurate predictions for human limb regions (i.e., the exposed arm skin), especially when transforming between long-sleeved and short-sleeved garments. To address these problems, we present a progressive virtual try-on framework, named PL-VTON, which performs pixel-level clothing warping based on multiple attributes of clothing and embeds explicit limb-aware features to generate photo-realistic try-on results. Specifically, we design a Multi-attribute Clothing Warping (MCW) module that adopts a two-stage alignment strategy based on multiple attributes to progressively estimate pixel-level clothing displacements. A Human Parsing Estimator (HPE) is then introduced to semantically divide the person into various regions, which provides structural constraints on the human body and therefore alleviates texture bleeding between clothing and limb regions. Finally, we propose a Limb-aware Texture Fusion (LTF) module to estimate high-quality details in limb regions by fusing textures of the clothing and the human body with the guidance of explicit limb-aware features. Extensive experiments demonstrate that our proposed method outperforms the state-of-the-art virtual try-on methods both qualitatively and quantitatively. The code is available at https://github.com/xyhanHIT/PL-VTON. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12588v1-abstract-full').style.display = 'none'; document.getElementById('2503.12588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM 2022. The code is available at https://github.com/xyhanHIT/PL-VTON</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12383">arXiv:2503.12383</a> <span> [<a href="https://arxiv.org/pdf/2503.12383">pdf</a>, <a href="https://arxiv.org/format/2503.12383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VRsketch2Gaussian: 3D VR Sketch Guided 3D Object Generation with Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+S">Songen Gu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Haoxuan Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Binjie Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sanyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiyong Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12383v1-abstract-short" style="display: inline;"> We propose VRSketch2Gaussian, a first VR sketch-guided, multi-modal, native 3D object generation framework that incorporates a 3D Gaussian Splatting representation. As part of our work, we introduce VRSS, the first large-scale paired dataset containing VR sketches, text, images, and 3DGS, bridging the gap in multi-modal VR sketch-based generation. Our approach features the following key innovation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12383v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12383v1-abstract-full" style="display: none;"> We propose VRSketch2Gaussian, a first VR sketch-guided, multi-modal, native 3D object generation framework that incorporates a 3D Gaussian Splatting representation. As part of our work, we introduce VRSS, the first large-scale paired dataset containing VR sketches, text, images, and 3DGS, bridging the gap in multi-modal VR sketch-based generation. Our approach features the following key innovations: 1) Sketch-CLIP feature alignment. We propose a two-stage alignment strategy that bridges the domain gap between sparse VR sketch embeddings and rich CLIP embeddings, facilitating both VR sketch-based retrieval and generation tasks. 2) Fine-Grained multi-modal conditioning. We disentangle the 3D generation process by using explicit VR sketches for geometric conditioning and text descriptions for appearance control. To facilitate this, we propose a generalizable VR sketch encoder that effectively aligns different modalities. 3) Efficient and high-fidelity 3D native generation. Our method leverages a 3D-native generation approach that enables fast and texture-rich 3D object synthesis. Experiments conducted on our VRSS dataset demonstrate that our method achieves high-quality, multi-modal VR sketch-based 3D generation. We believe our VRSS dataset and VRsketch2Gaussian method will be beneficial for the 3D generation community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12383v1-abstract-full').style.display = 'none'; document.getElementById('2503.12383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12333">arXiv:2503.12333</a> <span> [<a href="https://arxiv.org/pdf/2503.12333">pdf</a>, <a href="https://arxiv.org/format/2503.12333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> GameChat: Multi-LLM Dialogue for Safe, Agile, and Socially Optimal Multi-Agent Navigation in Constrained Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mahadevan%2C+V">Vagul Mahadevan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shangtong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+R">Rohan Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12333v1-abstract-short" style="display: inline;"> Safe, agile, and socially compliant multi-robot navigation in cluttered and constrained environments remains a critical challenge. This is especially difficult with self-interested agents in decentralized settings, where there is no central authority to resolve conflicts induced by spatial symmetry. We address this challenge by proposing a novel approach, GameChat, which facilitates safe, agile, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12333v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12333v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12333v1-abstract-full" style="display: none;"> Safe, agile, and socially compliant multi-robot navigation in cluttered and constrained environments remains a critical challenge. This is especially difficult with self-interested agents in decentralized settings, where there is no central authority to resolve conflicts induced by spatial symmetry. We address this challenge by proposing a novel approach, GameChat, which facilitates safe, agile, and deadlock-free navigation for both cooperative and self-interested agents. Key to our approach is the use of natural language communication to resolve conflicts, enabling agents to prioritize more urgent tasks and break spatial symmetry in a socially optimal manner. Our algorithm ensures subgame perfect equilibrium, preventing agents from deviating from agreed-upon behaviors and supporting cooperation. Furthermore, we guarantee safety through control barrier functions and preserve agility by minimizing disruptions to agents' planned trajectories. We evaluate GameChat in simulated environments with doorways and intersections. The results show that even in the worst case, GameChat reduces the time for all agents to reach their goals by over 35% from a naive baseline and by over 20% from SMG-CBF in the intersection scenario, while doubling the rate of ensuring the agent with a higher priority task reaches the goal first, from 50% (equivalent to random chance) to a 100% perfect performance at maximizing social welfare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12333v1-abstract-full').style.display = 'none'; document.getElementById('2503.12333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12053">arXiv:2503.12053</a> <span> [<a href="https://arxiv.org/pdf/2503.12053">pdf</a>, <a href="https://arxiv.org/format/2503.12053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ferret: An Efficient Online Continual Learning Framework under Varying Memory Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuhao Zhou</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuxin Tian</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+J">Jindi Lv</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+M">Mingjia Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanxi Li</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qing Ye</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+J">Jiancheng Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12053v1-abstract-short" style="display: inline;"> In the realm of high-frequency data streams, achieving real-time learning within varying memory constraints is paramount. This paper presents Ferret, a comprehensive framework designed to enhance online accuracy of Online Continual Learning (OCL) algorithms while dynamically adapting to varying memory budgets. Ferret employs a fine-grained pipeline parallelism strategy combined with an iterative g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12053v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12053v1-abstract-full" style="display: none;"> In the realm of high-frequency data streams, achieving real-time learning within varying memory constraints is paramount. This paper presents Ferret, a comprehensive framework designed to enhance online accuracy of Online Continual Learning (OCL) algorithms while dynamically adapting to varying memory budgets. Ferret employs a fine-grained pipeline parallelism strategy combined with an iterative gradient compensation algorithm, ensuring seamless handling of high-frequency data with minimal latency, and effectively counteracting the challenge of stale gradients in parallel training. To adapt to varying memory budgets, its automated model partitioning and pipeline planning optimizes performance regardless of memory limitations. Extensive experiments across 20 benchmarks and 5 integrated OCL algorithms show Ferret's remarkable efficiency, achieving up to 3.7$\times$ lower memory overhead to reach the same online accuracy compared to competing methods. Furthermore, Ferret consistently outperforms these methods across diverse memory budgets, underscoring its superior adaptability. These findings position Ferret as a premier solution for efficient and adaptive OCL framework in real-time environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12053v1-abstract-full').style.display = 'none'; document.getElementById('2503.12053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11794">arXiv:2503.11794</a> <span> [<a href="https://arxiv.org/pdf/2503.11794">pdf</a>, <a href="https://arxiv.org/format/2503.11794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Semantic-Clipping: Efficient Vision-Language Modeling with Semantic-Guidedd Visual Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bangzheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Ben Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Poon%2C+H">Hoifung Poon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Muhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11794v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) leverage aligned visual encoders to transform images into visual tokens, allowing them to be processed similarly to text by the backbone large language model (LLM). This unified input paradigm enables VLMs to excel in vision-language tasks such as visual question answering (VQA). To improve fine-grained visual reasoning, recent advancements in vision-language modeling… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11794v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11794v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) leverage aligned visual encoders to transform images into visual tokens, allowing them to be processed similarly to text by the backbone large language model (LLM). This unified input paradigm enables VLMs to excel in vision-language tasks such as visual question answering (VQA). To improve fine-grained visual reasoning, recent advancements in vision-language modeling introduce image cropping techniques that feed all encoded sub-images into the model. However, this approach significantly increases the number of visual tokens, leading to inefficiency and potential distractions for the LLM. To address the generalization challenges of image representation in VLMs, we propose a lightweight, universal framework that seamlessly integrates with existing VLMs to enhance their ability to process finegrained details. Our method leverages textual semantics to identify key visual areas, improving VQA performance without requiring any retraining of the VLM. Additionally, it incorporates textual signals into the visual encoding process, enhancing both efficiency and effectiveness. The proposed method, SEMCLIP, strengthens the visual understanding of a 7B VLM, LLaVA-1.5 by 3.3% on average across 7 benchmarks, and particularly by 5.3% on the challenging detailed understanding benchmark V*. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11794v1-abstract-full').style.display = 'none'; document.getElementById('2503.11794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10875">arXiv:2503.10875</a> <span> [<a href="https://arxiv.org/pdf/2503.10875">pdf</a>, <a href="https://arxiv.org/format/2503.10875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Convolutional Rectangular Attention Module </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+H">Hai-Vy Nguyen</a>, <a href="/search/cs?searchtype=author&query=Gamboa%2C+F">Fabrice Gamboa</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Chhaibi%2C+R">Reda Chhaibi</a>, <a href="/search/cs?searchtype=author&query=Gratton%2C+S">Serge Gratton</a>, <a href="/search/cs?searchtype=author&query=Giaccone%2C+T">Thierry Giaccone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10875v1-abstract-short" style="display: inline;"> In this paper, we introduce a novel spatial attention module, that can be integrated to any convolutional network. This module guides the model to pay attention to the most discriminative part of an image. This enables the model to attain a better performance by an end-to-end training. In standard approaches, a spatial attention map is generated in a position-wise fashion. We observe that this res… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10875v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10875v1-abstract-full" style="display: none;"> In this paper, we introduce a novel spatial attention module, that can be integrated to any convolutional network. This module guides the model to pay attention to the most discriminative part of an image. This enables the model to attain a better performance by an end-to-end training. In standard approaches, a spatial attention map is generated in a position-wise fashion. We observe that this results in very irregular boundaries. This could make it difficult to generalize to new samples. In our method, the attention region is constrained to be rectangular. This rectangle is parametrized by only 5 parameters, allowing for a better stability and generalization to new samples. In our experiments, our method systematically outperforms the position-wise counterpart. Thus, this provides us a novel useful spatial attention mechanism for convolutional models. Besides, our module also provides the interpretability concerning the ``where to look" question, as it helps to know the part of the input on which the model focuses to produce the prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10875v1-abstract-full').style.display = 'none'; document.getElementById('2503.10875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10713">arXiv:2503.10713</a> <span> [<a href="https://arxiv.org/pdf/2503.10713">pdf</a>, <a href="https://arxiv.org/format/2503.10713">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HiCMamba: Enhancing Hi-C Resolution and Identifying 3D Genome Structures with State Space Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+M">Minghao Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhi-An Huang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhihang Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuqiao Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shaojun Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10713v1-abstract-short" style="display: inline;"> Hi-C technology measures genome-wide interaction frequencies, providing a powerful tool for studying the 3D genomic structure within the nucleus. However, high sequencing costs and technical challenges often result in Hi-C data with limited coverage, leading to imprecise estimates of chromatin interaction frequencies. To address this issue, we present a novel deep learning-based method HiCMamba to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10713v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10713v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10713v1-abstract-full" style="display: none;"> Hi-C technology measures genome-wide interaction frequencies, providing a powerful tool for studying the 3D genomic structure within the nucleus. However, high sequencing costs and technical challenges often result in Hi-C data with limited coverage, leading to imprecise estimates of chromatin interaction frequencies. To address this issue, we present a novel deep learning-based method HiCMamba to enhance the resolution of Hi-C contact maps using a state space model. We adopt the UNet-based auto-encoder architecture to stack the proposed holistic scan block, enabling the perception of both global and local receptive fields at multiple scales. Experimental results demonstrate that HiCMamba outperforms state-of-the-art methods while significantly reducing computational resources. Furthermore, the 3D genome structures, including topologically associating domains (TADs) and loops, identified in the contact maps recovered by HiCMamba are validated through associated epigenomic features. Our work demonstrates the potential of a state space model as foundational frameworks in the field of Hi-C resolution enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10713v1-abstract-full').style.display = 'none'; document.getElementById('2503.10713v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10631">arXiv:2503.10631</a> <span> [<a href="https://arxiv.org/pdf/2503.10631">pdf</a>, <a href="https://arxiv.org/format/2503.10631">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> HybridVLA: Collaborative Diffusion and Autoregression in a Unified Vision-Language-Action Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=An%2C+P">Pengju An</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenyang Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoqi Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Ziyu Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sixiang Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengzhen Liu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+C">Chengkai Hou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mengdi Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K+a">KC alex Zhou</a>, <a href="/search/cs?searchtype=author&query=Heng%2C+P">Pheng-Ann Heng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10631v2-abstract-short" style="display: inline;"> Recent advancements in vision-language models (VLMs) for common-sense reasoning have led to the development of vision-language-action (VLA) models, enabling robots to perform generalized manipulation. Although existing autoregressive VLA methods leverage large-scale pretrained knowledge, they disrupt the continuity of actions. Meanwhile, some VLA methods incorporate an additional diffusion head to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10631v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10631v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10631v2-abstract-full" style="display: none;"> Recent advancements in vision-language models (VLMs) for common-sense reasoning have led to the development of vision-language-action (VLA) models, enabling robots to perform generalized manipulation. Although existing autoregressive VLA methods leverage large-scale pretrained knowledge, they disrupt the continuity of actions. Meanwhile, some VLA methods incorporate an additional diffusion head to predict continuous actions, relying solely on VLM-extracted features, which limits their reasoning capabilities. In this paper, we introduce HybridVLA, a unified framework that seamlessly integrates the strengths of both autoregressive and diffusion policies within a single large language model, rather than simply connecting them. To bridge the generation gap, a collaborative training recipe is proposed that injects the diffusion modeling directly into the next-token prediction. With this recipe, we find that these two forms of action prediction not only reinforce each other but also exhibit varying performance across different tasks. Therefore, we design a collaborative action ensemble mechanism that adaptively fuses these two predictions, leading to more robust control. In experiments, HybridVLA outperforms previous state-of-the-art VLA methods across various simulation and real-world tasks, including both single-arm and dual-arm robots, while demonstrating stable manipulation in previously unseen configurations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10631v2-abstract-full').style.display = 'none'; document.getElementById('2503.10631v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10480">arXiv:2503.10480</a> <span> [<a href="https://arxiv.org/pdf/2503.10480">pdf</a>, <a href="https://arxiv.org/format/2503.10480">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> World Modeling Makes a Better Planner: Dual Preference Optimization for Embodied Task Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyin Wang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zhaoye Fei</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Q">Qinyuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiduo Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Panpan Cai</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jinlan Fu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10480v1-abstract-short" style="display: inline;"> Recent advances in large vision-language models (LVLMs) have shown promise for embodied task planning, yet they struggle with fundamental challenges like dependency constraints and efficiency. Existing approaches either solely optimize action selection or leverage world models during inference, overlooking the benefits of learning to model the world as a way to enhance planning capabilities. We pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10480v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10480v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10480v1-abstract-full" style="display: none;"> Recent advances in large vision-language models (LVLMs) have shown promise for embodied task planning, yet they struggle with fundamental challenges like dependency constraints and efficiency. Existing approaches either solely optimize action selection or leverage world models during inference, overlooking the benefits of learning to model the world as a way to enhance planning capabilities. We propose Dual Preference Optimization (D$^2$PO), a new learning framework that jointly optimizes state prediction and action selection through preference learning, enabling LVLMs to understand environment dynamics for better planning. To automatically collect trajectories and stepwise preference data without human annotation, we introduce a tree search mechanism for extensive exploration via trial-and-error. Extensive experiments on VoTa-Bench demonstrate that our D$^2$PO-based method significantly outperforms existing methods and GPT-4o when applied to Qwen2-VL (7B), LLaVA-1.6 (7B), and LLaMA-3.2 (11B), achieving superior task success rates with more efficient execution paths. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10480v1-abstract-full').style.display = 'none'; document.getElementById('2503.10480v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09926">arXiv:2503.09926</a> <span> [<a href="https://arxiv.org/pdf/2503.09926">pdf</a>, <a href="https://arxiv.org/format/2503.09926">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VideoMerge: Towards Training-free Long Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Harry Yang</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Ser-Nam Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09926v1-abstract-short" style="display: inline;"> Long video generation remains a challenging and compelling topic in computer vision. Diffusion based models, among the various approaches to video generation, have achieved state of the art quality with their iterative denoising procedures. However, the intrinsic complexity of the video domain renders the training of such diffusion models exceedingly expensive in terms of both data curation and co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09926v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09926v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09926v1-abstract-full" style="display: none;"> Long video generation remains a challenging and compelling topic in computer vision. Diffusion based models, among the various approaches to video generation, have achieved state of the art quality with their iterative denoising procedures. However, the intrinsic complexity of the video domain renders the training of such diffusion models exceedingly expensive in terms of both data curation and computational resources. Moreover, these models typically operate on a fixed noise tensor that represents the video, resulting in predetermined spatial and temporal dimensions. Although several high quality open-source pretrained video diffusion models, jointly trained on images and videos of varying lengths and resolutions, are available, it is generally not recommended to specify a video length at inference that was not included in the training set. Consequently, these models are not readily adaptable to the direct generation of longer videos by merely increasing the specified video length. In addition to feasibility challenges, long-video generation also encounters quality issues. The domain of long videos is inherently more complex than that of short videos: extended durations introduce greater variability and necessitate long-range temporal consistency, thereby increasing the overall difficulty of the task. We propose VideoMerge, a training-free method that can be seamlessly adapted to merge short videos generated by pretrained text-to-video diffusion model. Our approach preserves the model's original expressiveness and consistency while allowing for extended duration and dynamic variation as specified by the user. By leveraging the strengths of pretrained models, our method addresses challenges related to smoothness, consistency, and dynamic content through orthogonal strategies that operate collaboratively to achieve superior quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09926v1-abstract-full').style.display = 'none'; document.getElementById('2503.09926v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09445">arXiv:2503.09445</a> <span> [<a href="https://arxiv.org/pdf/2503.09445">pdf</a>, <a href="https://arxiv.org/format/2503.09445">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Astrea: A MOE-based Visual Understanding Model with Progressive Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoda Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">JunYu Lu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+H">Hongshun Qiu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sijing Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xudong Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiayang Xu</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+J">Jiaqi Duan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Cong Lin</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+S">Sihang Cai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zejian Xie</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhuoyang Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Songxin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09445v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) based on Mixture-of-Experts (MoE) architectures have emerged as a pivotal paradigm in multimodal understanding, offering a powerful framework for integrating visual and linguistic information. However, the increasing complexity and diversity of tasks present significant challenges in coordinating load balancing across heterogeneous visual experts, where optimizing one… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09445v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09445v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09445v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) based on Mixture-of-Experts (MoE) architectures have emerged as a pivotal paradigm in multimodal understanding, offering a powerful framework for integrating visual and linguistic information. However, the increasing complexity and diversity of tasks present significant challenges in coordinating load balancing across heterogeneous visual experts, where optimizing one specialist's performance often compromises others' capabilities. To address task heterogeneity and expert load imbalance, we propose Astrea, a novel multi-expert collaborative VLM architecture based on progressive pre-alignment. Astrea introduces three key innovations: 1) A heterogeneous expert coordination mechanism that integrates four specialized models (detection, segmentation, classification, captioning) into a comprehensive expert matrix covering essential visual comprehension elements; 2) A dynamic knowledge fusion strategy featuring progressive pre-alignment to harmonize experts within the VLM latent space through contrastive learning, complemented by probabilistically activated stochastic residual connections to preserve knowledge continuity; 3) An enhanced optimization framework utilizing momentum contrastive learning for long-range dependency modeling and adaptive weight allocators for real-time expert contribution calibration. Extensive evaluations across 12 benchmark tasks spanning VQA, image captioning, and cross-modal retrieval demonstrate Astrea's superiority over state-of-the-art models, achieving an average performance gain of +4.7\%. This study provides the first empirical demonstration that progressive pre-alignment strategies enable VLMs to overcome task heterogeneity limitations, establishing new methodological foundations for developing general-purpose multimodal agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09445v1-abstract-full').style.display = 'none'; document.getElementById('2503.09445v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09260">arXiv:2503.09260</a> <span> [<a href="https://arxiv.org/pdf/2503.09260">pdf</a>, <a href="https://arxiv.org/format/2503.09260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.patcog.2025.111545">10.1016/j.patcog.2025.111545 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Neural Normalized Cut: A Differential and Generalizable Approach for Spectral Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shangzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chun-Guang Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xianbiao Qi</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+R">Rong Xiao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jun Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09260v1-abstract-short" style="display: inline;"> Spectral clustering, as a popular tool for data clustering, requires an eigen-decomposition step on a given affinity to obtain the spectral embedding. Nevertheless, such a step suffers from the lack of generalizability and scalability. Moreover, the obtained spectral embeddings can hardly provide a good approximation to the ground-truth partition and thus a k-means step is adopted to quantize the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09260v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09260v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09260v1-abstract-full" style="display: none;"> Spectral clustering, as a popular tool for data clustering, requires an eigen-decomposition step on a given affinity to obtain the spectral embedding. Nevertheless, such a step suffers from the lack of generalizability and scalability. Moreover, the obtained spectral embeddings can hardly provide a good approximation to the ground-truth partition and thus a k-means step is adopted to quantize the embedding. In this paper, we propose a simple yet effective scalable and generalizable approach, called Neural Normalized Cut (NeuNcut), to learn the clustering membership for spectral clustering directly. In NeuNcut, we properly reparameterize the unknown cluster membership via a neural network, and train the neural network via stochastic gradient descent with a properly relaxed normalized cut loss. As a result, our NeuNcut enjoys a desired generalization ability to directly infer clustering membership for out-of-sample unseen data and hence brings us an efficient way to handle clustering task with ultra large-scale data. We conduct extensive experiments on both synthetic data and benchmark datasets and experimental results validate the effectiveness and the superiority of our approach. Our code is available at: https://github.com/hewei98/NeuNcut. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09260v1-abstract-full').style.display = 'none'; document.getElementById('2503.09260v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures, 8 tables, accepted by Pattern Recognition (2025-03-11)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09100">arXiv:2503.09100</a> <span> [<a href="https://arxiv.org/pdf/2503.09100">pdf</a>, <a href="https://arxiv.org/format/2503.09100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tacchi 2.0: A Low Computational Cost and Comprehensive Dynamic Contact Simulator for Vision-based Tactile Sensors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuhao Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenzhuang Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jie Zhao</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+J">Jianhua Shan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zirong Shen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zixi Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Fuchun Sun</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Di Guo</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+B">Bin Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09100v1-abstract-short" style="display: inline;"> With the development of robotics technology, some tactile sensors, such as vision-based sensors, have been applied to contact-rich robotics tasks. However, the durability of vision-based tactile sensors significantly increases the cost of tactile information acquisition. Utilizing simulation to generate tactile data has emerged as a reliable approach to address this issue. While data-driven method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09100v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09100v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09100v1-abstract-full" style="display: none;"> With the development of robotics technology, some tactile sensors, such as vision-based sensors, have been applied to contact-rich robotics tasks. However, the durability of vision-based tactile sensors significantly increases the cost of tactile information acquisition. Utilizing simulation to generate tactile data has emerged as a reliable approach to address this issue. While data-driven methods for tactile data generation lack robustness, finite element methods (FEM) based approaches require significant computational costs. To address these issues, we integrated a pinhole camera model into the low computational cost vision-based tactile simulator Tacchi that used the Material Point Method (MPM) as the simulated method, completing the simulation of marker motion images. We upgraded Tacchi and introduced Tacchi 2.0. This simulator can simulate tactile images, marked motion images, and joint images under different motion states like pressing, slipping, and rotating. Experimental results demonstrate the reliability of our method and its robustness across various vision-based tactile sensors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09100v1-abstract-full').style.display = 'none'; document.getElementById('2503.09100v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08931">arXiv:2503.08931</a> <span> [<a href="https://arxiv.org/pdf/2503.08931">pdf</a>, <a href="https://arxiv.org/format/2503.08931">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> ARCHED: A Human-Centered Framework for Transparent, Responsible, and Collaborative AI-Assisted Instructional Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongming Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yizirui Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shan Zhang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S+M">Seiyon M. Lee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiming Wang</a>, <a href="/search/cs?searchtype=author&query=Trexler%2C+M">Mark Trexler</a>, <a href="/search/cs?searchtype=author&query=Botelho%2C+A+F">Anthony F. Botelho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08931v1-abstract-short" style="display: inline;"> Integrating Large Language Models (LLMs) in educational technology presents unprecedented opportunities to improve instructional design (ID), yet existing approaches often prioritize automation over pedagogical rigor and human agency. This paper introduces ARCHED (AI for Responsible, Collaborative, Human-centered Education Instructional Design), a structured multi-stage framework that ensures huma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08931v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08931v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08931v1-abstract-full" style="display: none;"> Integrating Large Language Models (LLMs) in educational technology presents unprecedented opportunities to improve instructional design (ID), yet existing approaches often prioritize automation over pedagogical rigor and human agency. This paper introduces ARCHED (AI for Responsible, Collaborative, Human-centered Education Instructional Design), a structured multi-stage framework that ensures human educators remain central in the design process while leveraging AI capabilities. Unlike traditional AI-generated instructional materials that lack transparency, ARCHED employs a cascaded workflow aligned with Bloom's taxonomy. The framework integrates specialized AI agents - one generating diverse pedagogical options and another evaluating alignment with learning objectives - while maintaining educators as primary decision-makers. This approach addresses key limitations in current AI-assisted instructional design, ensuring transparency, pedagogical foundation, and meaningful human agency. Empirical evaluations demonstrate that ARCHED enhances instructional design quality while preserving educator oversight, marking a step forward in responsible AI integration in education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08931v1-abstract-full').style.display = 'none'; document.getElementById('2503.08931v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the iRAISE Workshop at AAAI 2025. To be published in PMLR Volume 273</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> K.3.1; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08914">arXiv:2503.08914</a> <span> [<a href="https://arxiv.org/pdf/2503.08914">pdf</a>, <a href="https://arxiv.org/format/2503.08914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Cabinet: Dynamically Weighted Consensus Made Fast </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gengrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiquan Zhang</a>, <a href="/search/cs?searchtype=author&query=Bachras%2C+M">Michail Bachras</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqiu Zhang</a>, <a href="/search/cs?searchtype=author&query=Jacobsen%2C+H">Hans-Arno Jacobsen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08914v1-abstract-short" style="display: inline;"> Conventional consensus algorithms, such as Paxos and Raft, encounter inefficiencies when applied to large-scale distributed systems due to the requirement of waiting for replies from a majority of nodes. To address these challenges, we propose Cabinet, a novel consensus algorithm that introduces dynamically weighted consensus, allocating distinct weights to nodes based on any given failure thresho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08914v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08914v1-abstract-full" style="display: none;"> Conventional consensus algorithms, such as Paxos and Raft, encounter inefficiencies when applied to large-scale distributed systems due to the requirement of waiting for replies from a majority of nodes. To address these challenges, we propose Cabinet, a novel consensus algorithm that introduces dynamically weighted consensus, allocating distinct weights to nodes based on any given failure thresholds. Cabinet dynamically adjusts nodes' weights according to their responsiveness, assigning higher weights to faster nodes. The dynamic weight assignment maintains an optimal system performance, especially in large-scale and heterogeneous systems where node responsiveness varies. We evaluate Cabinet against Raft with distributed MongoDB and PostgreSQL databases using YCSB and TPC-C workloads. The evaluation results show that Cabinet outperforms Raft in throughput and latency under increasing system scales, complex networks, and failures in both homogeneous and heterogeneous clusters, offering a promising high-performance consensus solution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08914v1-abstract-full').style.display = 'none'; document.getElementById('2503.08914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07891">arXiv:2503.07891</a> <span> [<a href="https://arxiv.org/pdf/2503.07891">pdf</a>, <a href="https://arxiv.org/format/2503.07891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemini Embedding: Generalizable Embeddings from Gemini </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jinhyuk Lee</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+F">Feiyang Chen</a>, <a href="/search/cs?searchtype=author&query=Dua%2C+S">Sahil Dua</a>, <a href="/search/cs?searchtype=author&query=Cer%2C+D">Daniel Cer</a>, <a href="/search/cs?searchtype=author&query=Shanbhogue%2C+M">Madhuri Shanbhogue</a>, <a href="/search/cs?searchtype=author&query=Naim%2C+I">Iftekhar Naim</a>, <a href="/search/cs?searchtype=author&query=%C3%81brego%2C+G+H">Gustavo Hern谩ndez 脕brego</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kaifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Vera%2C+H+S">Henrique Schechter Vera</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqi Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Salz%2C+D">Daniel Salz</a>, <a href="/search/cs?searchtype=author&query=Boratko%2C+M">Michael Boratko</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jay Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Blair Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shuo Huang</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+V">Vikram Rao</a>, <a href="/search/cs?searchtype=author&query=Suganthan%2C+P">Paul Suganthan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+F">Feng Han</a>, <a href="/search/cs?searchtype=author&query=Doumanoglou%2C+A">Andreas Doumanoglou</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+N">Nithi Gupta</a>, <a href="/search/cs?searchtype=author&query=Moiseev%2C+F">Fedor Moiseev</a>, <a href="/search/cs?searchtype=author&query=Yip%2C+C">Cathy Yip</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A">Aashi Jain</a> , et al. (22 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07891v1-abstract-short" style="display: inline;"> In this report, we introduce Gemini Embedding, a state-of-the-art embedding model leveraging the power of Gemini, Google's most capable large language model. Capitalizing on Gemini's inherent multilingual and code understanding capabilities, Gemini Embedding produces highly generalizable embeddings for text spanning numerous languages and textual modalities. The representations generated by Gemini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07891v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07891v1-abstract-full" style="display: none;"> In this report, we introduce Gemini Embedding, a state-of-the-art embedding model leveraging the power of Gemini, Google's most capable large language model. Capitalizing on Gemini's inherent multilingual and code understanding capabilities, Gemini Embedding produces highly generalizable embeddings for text spanning numerous languages and textual modalities. The representations generated by Gemini Embedding can be precomputed and applied to a variety of downstream tasks including classification, similarity, clustering, ranking, and retrieval. Evaluated on the Massive Multilingual Text Embedding Benchmark (MMTEB), which includes over one hundred tasks across 250+ languages, Gemini Embedding substantially outperforms prior state-of-the-art models, demonstrating considerable improvements in embedding quality. Achieving state-of-the-art performance across MMTEB's multilingual, English, and code benchmarks, our unified model demonstrates strong capabilities across a broad selection of tasks and surpasses specialized domain-specific models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07891v1-abstract-full').style.display = 'none'; document.getElementById('2503.07891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>