Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,446 results for author: <span class="mathjax">Wang, F</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wang%2C+F">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wang, F"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+F&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, F"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+F&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+F&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+F&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+F&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+F&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+F&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24057">arXiv:2503.24057</a> <span> [<a href="https://arxiv.org/pdf/2503.24057">pdf</a>, <a href="https://arxiv.org/format/2503.24057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AMMSM: Adaptive Motion Magnification and Sparse Mamba for Micro-Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuxiong Liu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+T">Tengteng Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Weijie Feng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xiao Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24057v1-abstract-short" style="display: inline;"> Micro-expressions are typically regarded as unconscious manifestations of a person's genuine emotions. However, their short duration and subtle signals pose significant challenges for downstream recognition. We propose a multi-task learning framework named the Adaptive Motion Magnification and Sparse Mamba (AMMSM) to address this. 
   Abstract: Micro-expressions are typically regarded as unconscious manifestations of a person's genuine emotions. However, their short duration and subtle signals pose significant challenges for downstream recognition. We propose a multi-task learning framework named the Adaptive Motion Magnification and Sparse Mamba (AMMSM) to address this. This framework aims to enhance the accurate capture of micro-expressions through self-supervised subtle motion magnification, while the sparse spatial selection Mamba architecture combines sparse activation with the advanced Visual Mamba model to model key motion regions and their valuable representations more effectively. Additionally, we employ evolutionary search to optimize the magnification factor and the sparsity ratios of spatial selection, followed by fine-tuning to improve performance further. Extensive experiments on two standard datasets demonstrate that the proposed AMMSM achieves state-of-the-art (SOTA) accuracy and robustness.
   Submitted 31 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICME 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24021">arXiv:2503.24021</a> <span> [<a href="https://arxiv.org/pdf/2503.24021">pdf</a>, <a href="https://arxiv.org/format/2503.24021">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> IntelliCircos: A Data-driven and AI-powered Authoring Tool for Circos Plots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+M">Mingyang Gu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiamin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+X">Xiaolin Wen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Min Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24021v1-abstract-short" style="display: inline;"> Genomics data is essential in biological and medical domains, and bioinformatics analysts often manually create circos plots to analyze the data and extract valuable insights. However, creating circos plots is complex, as it requires careful design for multiple track attributes and positional relationships between them. Typically, analysts often seek inspiration from existing circos plots, and the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24021v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24021v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24021v1-abstract-full" style="display: none;"> Genomics data is essential in biological and medical domains, and bioinformatics analysts often manually create circos plots to analyze the data and extract valuable insights. However, creating circos plots is complex, as it requires careful design for multiple track attributes and positional relationships between them. Typically, analysts often seek inspiration from existing circos plots, and they have to iteratively adjust and refine the plot to achieve a satisfactory final design, making the process both tedious and time-intensive. To address these challenges, we propose IntelliCircos, an AI-powered interactive authoring tool that streamlines the process from initial visual design to the final implementation of circos plots. Specifically, we build a new dataset containing 4396 circos plots with corresponding annotations and configurations, which are extracted and labeled from published papers. With the dataset, we further identify track combination patterns, and utilize Large Language Model (LLM) to provide domain-specific design recommendations and configuration references to navigate the design of circos plots. 
   We conduct a user study with 8 bioinformatics analysts to evaluate IntelliCircos, and the results demonstrate its usability and effectiveness in authoring circos plots.
   Submitted 31 March, 2025; originally announced March 2025.

3. arXiv:2503.23881 [pdf, other] cs.CV
   ExScene: Free-View 3D Scene Reconstruction with Gaussian Splatting from a Single Image
   Authors: Tianyi Gong, Boyan Li, Yifei Zhong, Fangxin Wang
   Abstract: The increasing demand for augmented and virtual reality applications has highlighted the importance of crafting immersive 3D scenes from a simple single-view image. However, due to the partial priors provided by single-view input, existing methods are often limited to reconstructing low-consistency 3D scenes with narrow fields of view. These limitations make them less capable of generalizing to reconstruct immersive scenes. To address this problem, we propose ExScene, a two-stage pipeline to reconstruct an immersive 3D scene from any given single-view image. ExScene designs a novel multimodal diffusion model to generate a high-fidelity and globally consistent panoramic image. We then develop a panoramic depth estimation approach to calculate geometric information from the panorama, and we combine this geometric information with the high-fidelity panoramic image to train an initial 3D Gaussian Splatting (3DGS) model.
   Following this, we introduce a GS refinement technique with 2D stable video diffusion priors. We add camera trajectory consistency and color-geometric priors into the denoising process of diffusion to improve color and spatial consistency across image sequences. These refined sequences are then used to fine-tune the initial 3DGS model, leading to better reconstruction quality. Experimental results demonstrate that our ExScene achieves consistent and immersive scene reconstruction using only single-view input, significantly surpassing state-of-the-art baselines.
   Submitted 31 March, 2025; originally announced March 2025.
   Comments: ICME 2025

4. arXiv:2503.23771 [pdf, other] cs.CV
   XLRS-Bench: Could Your Multimodal LLMs Understand Extremely Large Ultra-High-Resolution Remote Sensing Imagery?
   Authors: Fengxiang Wang, Hongzhen Wang, Mingshuo Chen, Di Wang, Yulin Wang, Zonghao Guo, Qiang Ma, Long Lan, Wenjing Yang, Jing Zhang, Zhiyuan Liu, Maosong Sun
   Abstract: The astonishing breakthrough of multimodal large language models (MLLMs) has necessitated new benchmarks to quantitatively assess their capabilities, reveal their limitations, and indicate future research directions. However, this is challenging in the context of remote sensing (RS), since the imagery features ultra-high resolution that incorporates extremely complex semantic relationships. Existing benchmarks usually adopt notably smaller image sizes than real-world RS scenarios, suffer from limited annotation quality, and consider insufficient dimensions of evaluation. To address these issues, we present XLRS-Bench: a comprehensive benchmark for evaluating the perception and reasoning capabilities of MLLMs in ultra-high-resolution RS scenarios. XLRS-Bench boasts the largest average image size (8500×8500) observed thus far, with all evaluation samples meticulously annotated manually, assisted by a novel semi-automatic captioner on ultra-high-resolution RS images. On top of XLRS-Bench, 16 sub-tasks are defined to evaluate MLLMs' 10 kinds of perceptual capabilities and 6 kinds of reasoning capabilities, with a primary emphasis on advanced cognitive processes that facilitate real-world decision-making and the capture of spatiotemporal changes. The results of both general and RS-focused MLLMs on XLRS-Bench indicate that further efforts are needed for real-world RS applications. We have open-sourced XLRS-Bench to support further research in developing more powerful MLLMs for remote sensing.
   Submitted 31 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">It has been accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23002">arXiv:2503.23002</a> <span> [<a href="https://arxiv.org/pdf/2503.23002">pdf</a>, <a href="https://arxiv.org/format/2503.23002">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning Structure-enhanced Temporal Point Processes with Gromov-Wasserstein Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qingmei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fanmeng Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+B">Bing Su</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongteng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23002v1-abstract-short" style="display: inline;"> Real-world event sequences are often generated by different temporal point processes (TPPs) and thus have clustering structures. Nonetheless, in the modeling and prediction of event sequences, most existing TPPs ignore the inherent clustering structures of the event sequences, leading to the models with unsatisfactory interpretability. In this study, we learn structure-enhanced TPPs with the help… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23002v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23002v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23002v1-abstract-full" style="display: none;"> Real-world event sequences are often generated by different temporal point processes (TPPs) and thus have clustering structures. Nonetheless, in the modeling and prediction of event sequences, most existing TPPs ignore the inherent clustering structures of the event sequences, leading to the models with unsatisfactory interpretability. In this study, we learn structure-enhanced TPPs with the help of Gromov-Wasserstein (GW) regularization, which imposes clustering structures on the sequence-level embeddings of the TPPs in the maximum likelihood estimation framework.In the training phase, the proposed method leverages a nonparametric TPP kernel to regularize the similarity matrix derived based on the sequence embeddings. In large-scale applications, we sample the kernel matrix and implement the regularization as a Gromov-Wasserstein (GW) discrepancy term, which achieves a trade-off between regularity and computational efficiency.The TPPs learned through this method result in clustered sequence embeddings and demonstrate competitive predictive and clustering performance, significantly improving the model interpretability without compromising prediction accuracy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23002v1-abstract-full').style.display = 'none'; document.getElementById('2503.23002v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the Web Conference workshop 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 60G55; 62M10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21816">arXiv:2503.21816</a> <span> [<a href="https://arxiv.org/pdf/2503.21816">pdf</a>, <a href="https://arxiv.org/format/2503.21816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> EVPGS: Enhanced View Prior Guidance for Splatting-based Extrapolated View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahe Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+X">Xiaochao Qu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chengjing Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Luoqi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21816v1-abstract-short" style="display: inline;"> Gaussian Splatting (GS)-based methods rely on sufficient training view coverage and perform synthesis on interpolated views. In this work, we tackle the more challenging and underexplored Extrapolated View Synthesis (EVS) task. Here we enable GS-based models trained with limited view coverage to generalize well to extrapolated views. To achieve our goal, we propose a view augmentation framework to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21816v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21816v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21816v1-abstract-full" style="display: none;"> Gaussian Splatting (GS)-based methods rely on sufficient training view coverage and perform synthesis on interpolated views. In this work, we tackle the more challenging and underexplored Extrapolated View Synthesis (EVS) task. Here we enable GS-based models trained with limited view coverage to generalize well to extrapolated views. To achieve our goal, we propose a view augmentation framework to guide training through a coarse-to-fine process. At the coarse stage, we reduce rendering artifacts due to insufficient view coverage by introducing a regularization strategy at both appearance and geometry levels. At the fine stage, we generate reliable view priors to provide further training guidance. 
   To this end, we incorporate occlusion awareness into the view prior generation process and refine the view priors with the aid of the coarse-stage output. We call our framework Enhanced View Prior Guidance for Splatting (EVPGS). To comprehensively evaluate EVPGS on the EVS task, we collect a real-world dataset called Merchandise3D dedicated to the EVS scenario. Experiments on three datasets, including both real and synthetic, demonstrate that EVPGS achieves state-of-the-art performance while improving synthesis quality at extrapolated views for GS-based methods both qualitatively and quantitatively. We will make our code, dataset, and models public.
   Submitted 25 March, 2025; originally announced March 2025.
   Comments: Accepted by CVPR 2025

7. arXiv:2503.21781 [pdf, other] cs.CV
   VideoMage: Multi-Subject and Motion Customization of Text-to-Video Diffusion Models
   Authors: Chi-Pin Huang, Yen-Siang Wu, Hung-Kai Chung, Kai-Po Chang, Fu-En Yang, Yu-Chiang Frank Wang
   Abstract: Customized text-to-video generation aims to produce high-quality videos that incorporate user-specified subject identities or motion patterns.
   However, existing methods mainly focus on personalizing a single concept, either subject identity or motion pattern, limiting their effectiveness for multiple subjects with the desired motion patterns. To tackle this challenge, we propose VideoMage, a unified framework for video customization over both multiple subjects and their interactive motions. VideoMage employs subject and motion LoRAs to capture personalized content from user-provided images and videos, along with an appearance-agnostic motion learning approach to disentangle motion patterns from visual appearance. Furthermore, we develop a spatial-temporal composition scheme to guide interactions among subjects within the desired motion patterns. Extensive experiments demonstrate that VideoMage outperforms existing methods, generating coherent, user-controlled videos with consistent subject identities and interactions.
   Submitted 27 March, 2025; originally announced March 2025.
   Comments: CVPR 2025. Project Page: https://jasper0314-huang.github.io/videomage-customization

8. arXiv:2503.21562 [pdf, other] cs.CV
   uLayout: Unified Room Layout Estimation for Perspective and Panoramic Images
   Authors: Jonathan Lee, Bolivar Solarte, Chin-Hsuan Wu, Jin-Cheng Jhang, Fu-En Wang, Yi-Hsuan Tsai, Min Sun
   Abstract: We present uLayout, a unified model for estimating room layout geometries from both perspective and panoramic images, whereas traditional solutions require different model designs for each image type. The key idea of our solution is to unify both domains into the equirectangular projection, in particular allocating perspective images to the most suitable latitude coordinates to exploit both domains seamlessly. To address the Field-of-View (FoV) difference between the input domains, we design uLayout with a shared feature extractor and an extra 1D-convolution layer to condition each domain's input differently. This conditioning allows us to efficiently formulate a column-wise feature regression problem regardless of the input FoV. This simple yet effective approach achieves competitive performance with current state-of-the-art solutions and shows, for the first time, a single end-to-end model for both domains. Extensive experiments on the real-world datasets LSUN, Matterport3D, PanoContext, and Stanford 2D-3D evidence the contribution of our approach. Code is available at https://github.com/JonathanLee112/uLayout.
   Submitted 27 March, 2025; originally announced March 2025.
   Comments: Accepted to WACV-2025
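uLayout's unification step places perspective inputs into the equirectangular (longitude/latitude) domain of panoramas. The snippet below is an illustrative sketch of that kind of pinhole-to-equirectangular mapping under assumed camera conventions; the function name, parameters, and field-of-view default are hypothetical, and this is not the uLayout code.

    # Illustrative sketch (assumed pinhole conventions, not uLayout's implementation):
    # map a pixel of a perspective image to equirectangular longitude/latitude.
    import numpy as np

    def perspective_to_equirectangular(u, v, width, height, fov_deg=90.0):
        """Return (longitude, latitude) in radians for pixel (u, v)."""
        # Focal length in pixels from the horizontal field of view.
        f = (width / 2.0) / np.tan(np.radians(fov_deg) / 2.0)
        # Camera-frame ray through the pixel: x right, y down, z forward.
        x = u - width / 2.0
        y = v - height / 2.0
        z = f
        lon = np.arctan2(x, z)                # horizontal angle in the panorama
        lat = np.arctan2(-y, np.hypot(x, z))  # vertical angle (0 at the horizon)
        return lon, lat

    # Example: the image center maps to (0.0, 0.0), the panorama's horizon line.
    # print(perspective_to_equirectangular(320, 240, 640, 480))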
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV-2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21122">arXiv:2503.21122</a> <span> [<a href="https://arxiv.org/pdf/2503.21122">pdf</a>, <a href="https://arxiv.org/format/2503.21122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One Snapshot is All You Need: A Generalized Method for mmWave Signal Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Teng Huang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wenxin Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Cui Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kun Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+W">Wei Xi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21122v1-abstract-short" style="display: inline;"> Wireless sensing systems, particularly those using mmWave technology, offer distinct advantages over traditional vision-based approaches, such as enhanced privacy and effectiveness in poor lighting conditions. These systems, leveraging FMCW signals, have shown success in human-centric applications like localization, gesture recognition, and so on. However, comprehensive mmWave datasets for diverse… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21122v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21122v1-abstract-full" style="display: none;"> Wireless sensing systems, particularly those using mmWave technology, offer distinct advantages over traditional vision-based approaches, such as enhanced privacy and effectiveness in poor lighting conditions. These systems, leveraging FMCW signals, have shown success in human-centric applications like localization, gesture recognition, and so on. However, comprehensive mmWave datasets for diverse applications are scarce, often constrained by pre-processed signatures (e.g., point clouds or RA heatmaps) and inconsistent annotation formats. To overcome these limitations, we propose mmGen, a novel and generalized framework tailored for full-scene mmWave signal generation. By constructing physical signal transmission models, mmGen synthesizes human-reflected and environment-reflected mmWave signals from the constructed 3D meshes. Additionally, we incorporate methods to account for material properties, antenna gains, and multipath reflections, enhancing the realism of the synthesized signals. We conduct extensive experiments using a prototype system with commercial mmWave devices and Kinect sensors. 
   The results show that the average similarity of Range-Angle and micro-Doppler signatures between the synthesized and real-captured signals across three different environments exceeds 0.91 and 0.89, respectively, demonstrating the effectiveness and practical applicability of mmGen.
   Submitted 26 March, 2025; originally announced March 2025.
   Comments: IEEE INFOCOM 2025

10. arXiv:2503.19281 [pdf, other] cs.RO cs.AI
   CubeRobot: Grounding Language in Rubik's Cube Manipulation via Vision-Language Model
   Authors: Feiyang Wang, Xiaomin Yu, Wangyu Wu
   Abstract: Proving Rubik's Cube theorems at a high level represents a notable milestone in human-level spatial imagination and logical reasoning. Traditional Rubik's Cube robots, relying on complex vision systems and fixed algorithms, often struggle to adapt to complex and dynamic scenarios. To overcome this limitation, we introduce CubeRobot, a novel vision-language model (VLM) tailored for solving 3x3 Rubik's Cubes, empowering embodied agents with multimodal understanding and execution capabilities. We used the CubeCoT image dataset, which contains multiple-level tasks (43 subtasks in total) that humans are unable to handle, encompassing various cube states.
   We incorporate a dual-loop VisionCoT architecture and Memory Stream, a paradigm for extracting task-related features from VLM-generated planning queries, thus enabling CubeRobot to perform independent planning, decision-making, and reflection, and to manage high- and low-level Rubik's Cube tasks separately. Furthermore, CubeRobot achieved an accuracy of 100% in low-level Rubik's Cube restoration tasks, 100% in medium-level tasks, and 80% in high-level tasks.
   Submitted 24 March, 2025; originally announced March 2025.

11. arXiv:2503.19199 [pdf, other] cs.CV
   Open-Vocabulary Functional 3D Scene Graphs for Real-World Indoor Spaces
   Authors: Chenyangguang Zhang, Alexandros Delitzas, Fangjinhua Wang, Ruida Zhang, Xiangyang Ji, Marc Pollefeys, Francis Engelmann
   Abstract: We introduce the task of predicting functional 3D scene graphs for real-world indoor environments from posed RGB-D images. Unlike traditional 3D scene graphs that focus on spatial relationships of objects, functional 3D scene graphs capture objects, interactive elements, and their functional relationships.
   Due to the lack of training data, we leverage foundation models, including visual language models (VLMs) and large language models (LLMs), to encode functional knowledge. We evaluate our approach on an extended SceneFun3D dataset and a newly collected dataset, FunGraph3D, both annotated with functional 3D scene graphs. Our method significantly outperforms adapted baselines, including Open3DSG and ConceptGraph, demonstrating its effectiveness in modeling complex scene functionalities. We also demonstrate downstream applications such as 3D question answering and robotic manipulation using functional 3D scene graphs. See our project page at https://openfungraph.github.io
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: Accepted at CVPR 2025

12. arXiv:2503.18988 [pdf, other] cs.CV cs.AI cs.LG cs.RO
   SG-Tailor: Inter-Object Commonsense Relationship Reasoning for Scene Graph Manipulation
   Authors: Haoliang Shang, Hanyu Wu, Guangyao Zhai, Boyang Sun, Fangjinhua Wang, Federico Tombari, Marc Pollefeys
   Abstract: Scene graphs capture complex relationships among objects, serving as strong priors for content generation and manipulation. Yet, reasonably manipulating scene graphs -- whether by adding nodes or modifying edges -- remains a challenging and untouched task.
   Tasks such as adding a node to the graph or reasoning about a node's relationships with all others are computationally intractable, as even a single edge modification can trigger conflicts due to the intricate interdependencies within the graph. To address these challenges, we introduce SG-Tailor, an autoregressive model that predicts the conflict-free relationship between any two nodes. SG-Tailor not only infers inter-object relationships, including generating commonsense edges for newly added nodes, but also resolves conflicts arising from edge modifications to produce coherent, manipulated graphs for downstream tasks. For node addition, the model queries the target node and other nodes from the graph to predict the appropriate relationships. For edge modification, SG-Tailor employs a Cut-And-Stitch strategy to solve the conflicts and globally adjust the graph. Extensive experiments demonstrate that SG-Tailor outperforms competing methods by a large margin and can be seamlessly integrated as a plug-in module for scene generation and robotic manipulation tasks.
   Submitted 23 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code will be available at https://github.com/josef5838/SG-Tailor</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18246">arXiv:2503.18246</a> <span> [<a href="https://arxiv.org/pdf/2503.18246">pdf</a>, <a href="https://arxiv.org/format/2503.18246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZECO: ZeroFusion Guided 3D MRI Conditional Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feiran Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+B">Bin Duan</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jiachen Tao</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+N">Nikhil Sharma</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Dawen Cai</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18246v1-abstract-short" style="display: inline;"> Medical image segmentation is crucial for enhancing diagnostic accuracy and treatment planning in Magnetic Resonance Imaging (MRI). However, acquiring precise lesion masks for segmentation model training demands specialized expertise and significant time investment, leading to a small dataset scale in clinical practice. In this paper, we present ZECO, a ZeroFusion guided 3D MRI conditional generat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18246v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18246v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18246v1-abstract-full" style="display: none;"> Medical image segmentation is crucial for enhancing diagnostic accuracy and treatment planning in Magnetic Resonance Imaging (MRI). However, acquiring precise lesion masks for segmentation model training demands specialized expertise and significant time investment, leading to a small dataset scale in clinical practice. In this paper, we present ZECO, a ZeroFusion guided 3D MRI conditional generation framework that extracts, compresses, and generates high-fidelity MRI images with corresponding 3D segmentation masks to mitigate data scarcity. To effectively capture inter-slice relationships within volumes, we introduce a Spatial Transformation Module that encodes MRI images into a compact latent space for the diffusion process. Moving beyond unconditional generation, our novel ZeroFusion method progressively maps 3D masks to MRI images in latent space, enabling robust training on limited datasets while avoiding overfitting. ZECO outperforms state-of-the-art models in both quantitative and qualitative evaluations on Brain MRI datasets across various modalities, showcasing its exceptional capability in synthesizing high-quality MRI images conditioned on segmentation masks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18246v1-abstract-full').style.display = 'none'; document.getElementById('2503.18246v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: \url{https://brack-wang.github.io/ZECO_web/}; Github Code: \url{https://github.com/Brack-Wang/ZECO}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17752">arXiv:2503.17752</a> <span> [<a href="https://arxiv.org/pdf/2503.17752">pdf</a>, <a href="https://arxiv.org/format/2503.17752">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HiLoTs: High-Low Temporal Sensitive Representation Learning for Semi-Supervised LiDAR Segmentation in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+R+D">R. D. Lin</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+P">Pengcheng Weng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yinqiao Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinsong Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17752v1-abstract-short" style="display: inline;"> LiDAR point cloud semantic segmentation plays a crucial role in autonomous driving. In recent years, semi-supervised methods have gained popularity due to their significant reduction in annotation labor and time costs. Current semi-supervised methods typically focus on point cloud spatial distribution or consider short-term temporal representations, e.g., only two adjacent frames, often overlookin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17752v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17752v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17752v1-abstract-full" style="display: none;"> LiDAR point cloud semantic segmentation plays a crucial role in autonomous driving. In recent years, semi-supervised methods have gained popularity due to their significant reduction in annotation labor and time costs. Current semi-supervised methods typically focus on point cloud spatial distribution or consider short-term temporal representations, e.g., only two adjacent frames, often overlooking the rich long-term temporal properties inherent in autonomous driving scenarios. In driving experience, we observe that nearby objects, such as roads and vehicles, remain stable while driving, whereas distant objects exhibit greater variability in category and shape. 
This natural phenomenon is also captured by LiDAR, which reflects lower temporal sensitivity for nearby objects and higher sensitivity for distant ones. To leverage these characteristics, we propose HiLoTs, which learns high-temporal sensitivity and low-temporal sensitivity representations from continuous LiDAR frames. These representations are further enhanced and fused using a cross-attention mechanism. Additionally, we employ a teacher-student framework to align the representations learned by the labeled and unlabeled branches, effectively utilizing the large amounts of unlabeled data. Experimental results on the SemanticKITTI and nuScenes datasets demonstrate that our proposed HiLoTs outperforms state-of-the-art semi-supervised methods, and achieves performance close to LiDAR+Camera multimodal approaches. Code is available on https://github.com/rdlin118/HiLoTs <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17752v1-abstract-full').style.display = 'none'; document.getElementById('2503.17752v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17343">arXiv:2503.17343</a> <span> [<a href="https://arxiv.org/pdf/2503.17343">pdf</a>, <a href="https://arxiv.org/format/2503.17343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Commercial Dishes Can Be My Ladder: Sustainable and Collaborative Data Offloading in LEO Satellite Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chou%2C+Y+C">Yi Ching Chou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hengzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feng Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haoyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+X">Xiaoyi Fan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiangchuan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17343v1-abstract-short" style="display: inline;"> Low Earth Orbit (LEO) satellite networks, characterized by their high data throughput and low latency, have gained significant interest from both industry and academia. Routing data efficiently within these networks is essential for maintaining a high quality of service. However, current routing strategies, such as bent-pipe and inter-satellite link (ISL) routing, have their unique challenges. 
The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17343v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17343v1-abstract-full" style="display: none;"> Low Earth Orbit (LEO) satellite networks, characterized by their high data throughput and low latency, have gained significant interest from both industry and academia. Routing data efficiently within these networks is essential for maintaining a high quality of service. However, current routing strategies, such as bent-pipe and inter-satellite link (ISL) routing, have their unique challenges. The bent-pipe strategy requires a dense deployment of dedicated ground stations, while the ISL-based strategy can negatively impact satellite battery lifespan due to increased traffic load, leading to sustainability issues. In this paper, we propose sustainable collaborative offloading, a framework that orchestrates groups of existing commercial resources like ground stations and 5G base stations for data offloading. This orchestration enhances total capacity, overcoming the limitations of a single resource. We propose the collaborator group set construction algorithm to construct candidate groups and the collaborator selection and total payment algorithm to select offloading targets and determine payments no less than the costs. Extensive real-world-based simulations show that our solution significantly improves energy consumption, satellite service life, and end-to-end latency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17343v1-abstract-full').style.display = 'none'; document.getElementById('2503.17343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is a preliminary extended version of the paper accepted to INFOCOM 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17057">arXiv:2503.17057</a> <span> [<a href="https://arxiv.org/pdf/2503.17057">pdf</a>, <a href="https://arxiv.org/format/2503.17057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-supervised Cervical Segmentation on Ultrasound by A Dual Framework for Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangyijie Wang</a>, <a href="/search/cs?searchtype=author&query=Curran%2C+K+M">Kathleen M. 
Curran</a>, <a href="/search/cs?searchtype=author&query=Silvestre%2C+G">Guénolé Silvestre</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17057v1-abstract-short" style="display: inline;"> Accurate segmentation of ultrasound (US) images of the cervical muscles is crucial for precision healthcare. The demand for automatic computer-assisted methods is high. However, the scarcity of labeled data hinders the development of these methods. Advanced semi-supervised learning approaches have displayed promise in overcoming this challenge by utilizing labeled and unlabeled data. This study in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17057v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17057v1-abstract-full" style="display: none;"> Accurate segmentation of ultrasound (US) images of the cervical muscles is crucial for precision healthcare. The demand for automatic computer-assisted methods is high. However, the scarcity of labeled data hinders the development of these methods. Advanced semi-supervised learning approaches have displayed promise in overcoming this challenge by utilizing labeled and unlabeled data. This study introduces a novel semi-supervised learning (SSL) framework that integrates dual neural networks. This SSL framework utilizes both networks to generate pseudo-labels and cross-supervise each other at the pixel level. Additionally, a self-supervised contrastive learning strategy is introduced, which employs a pair of deep representations to enhance feature learning capabilities, particularly on unlabeled data. Our framework demonstrates competitive performance in cervical segmentation tasks. Our codes are publicly available on https://github.com/13204942/SSL_Cervical_Segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17057v1-abstract-full').style.display = 'none'; document.getElementById('2503.17057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for an oral presentation at ISBI 2025 Fetal Ultrasound Grand Challenge: Semi-Supervised Cervical Segmentation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16302">arXiv:2503.16302</a> <span> [<a href="https://arxiv.org/pdf/2503.16302">pdf</a>, <a href="https://arxiv.org/format/2503.16302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing Vecset Diffusion Model for Fast Shape Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zeqiang Lai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunfei Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haolin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuyun Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huiwen Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianghui Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qingxiang Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingwei Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhong Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chunchao Guo</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16302v2-abstract-short" style="display: inline;"> 3D shape generation has greatly flourished through the development of so-called "native" 3D diffusion, particularly through the Vecset Diffusion Model (VDM). While recent advancements have shown promising results in generating high-resolution 3D shapes, VDM still struggles with high-speed generation. Challenges exist because of difficulties not only in accelerating diffusion sampling but also VAE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16302v2-abstract-full').style.display = 'inline'; document.getElementById('2503.16302v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16302v2-abstract-full" style="display: none;"> 3D shape generation has greatly flourished through the development of so-called "native" 3D diffusion, particularly through the Vecset Diffusion Model (VDM). While recent advancements have shown promising results in generating high-resolution 3D shapes, VDM still struggles with high-speed generation. Challenges exist because of difficulties not only in accelerating diffusion sampling but also VAE decoding in VDM, areas under-explored in previous works. 
To address these challenges, we present FlashVDM, a systematic framework for accelerating both VAE and DiT in VDM. For DiT, FlashVDM enables flexible diffusion sampling with as few as 5 inference steps and comparable quality, which is made possible by stabilizing consistency distillation with our newly introduced Progressive Flow Distillation. For VAE, we introduce a lightning vecset decoder equipped with Adaptive KV Selection, Hierarchical Volume Decoding, and Efficient Network Design. By exploiting the locality of the vecset and the sparsity of shape surface in the volume, our decoder drastically lowers FLOPs, minimizing the overall decoding overhead. We apply FlashVDM to Hunyuan3D-2 to obtain Hunyuan3D-2 Turbo. Through systematic evaluation, we show that our model significantly outperforms existing fast 3D generation methods, achieving comparable performance to the state-of-the-art while reducing inference time by over 45x for reconstruction and 32x for generation. Code and models are available at https://github.com/Tencent/FlashVDM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16302v2-abstract-full').style.display = 'none'; document.getElementById('2503.16302v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13447">arXiv:2503.13447</a> <span> [<a href="https://arxiv.org/pdf/2503.13447">pdf</a>, <a href="https://arxiv.org/format/2503.13447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MetaScale: Test-Time Scaling with Evolving Meta-Thoughts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J+Y">James Y. 
Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Poon%2C+H">Hoifung Poon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Muhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13447v1-abstract-short" style="display: inline;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13447v1-abstract-full" style="display: none;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce METASCALE, a test-time scaling framework based on meta-thoughts -- adaptive thinking strategies tailored to each task. METASCALE initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, METASCALE improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11% performance gain in win rate on Arena-Hard for GPT-4o, surpassing o1-mini by 0.9% under style control. Notably, METASCALE scales more effectively with increasing sampling budgets and produces more structured, expert-level responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'none'; document.getElementById('2503.13447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11835">arXiv:2503.11835</a> <span> [<a href="https://arxiv.org/pdf/2503.11835">pdf</a>, <a href="https://arxiv.org/format/2503.11835">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> How Can Time Series Analysis Benefit From Multiple Modalities? A Survey and Outlook </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haoxin Liu</a>, <a href="/search/cs?searchtype=author&query=Kamarthi%2C+H">Harshavardhan Kamarthi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhiyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shangqing Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&query=Hartvigsen%2C+T">Tom Hartvigsen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Prakash%2C+B+A">B. Aditya Prakash</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11835v3-abstract-short" style="display: inline;"> Time series analysis (TSA) is a longstanding research topic in the data mining community and has wide real-world significance. Compared to "richer" modalities such as language and vision, which have recently experienced explosive development and are densely connected, the time-series modality remains relatively underexplored and isolated. We notice that many recent TSA works have formed a new rese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11835v3-abstract-full').style.display = 'inline'; document.getElementById('2503.11835v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11835v3-abstract-full" style="display: none;"> Time series analysis (TSA) is a longstanding research topic in the data mining community and has wide real-world significance. Compared to "richer" modalities such as language and vision, which have recently experienced explosive development and are densely connected, the time-series modality remains relatively underexplored and isolated. We notice that many recent TSA works have formed a new research field, i.e., Multiple Modalities for TSA (MM4TSA). In general, these MM4TSA works follow a common motivation: how TSA can benefit from multiple modalities. This survey is the first to offer a comprehensive review and a detailed outlook for this emerging field. Specifically, we systematically discuss three benefits: (1) reusing foundation models of other modalities for efficient TSA, (2) multimodal extension for enhanced TSA, and (3) cross-modality interaction for advanced TSA. 
We further group the works by the introduced modality type, including text, images, audio, tables, and others, within each perspective. Finally, we identify the gaps with future opportunities, including the reused modalities selections, heterogeneous modality combinations, and unseen tasks generalizations, corresponding to the three benefits. We release an up-to-date GitHub repository that includes key papers and resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11835v3-abstract-full').style.display = 'none'; document.getElementById('2503.11835v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Github Repo: https://github.com/AdityaLab/MM4TSA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11794">arXiv:2503.11794</a> <span> [<a href="https://arxiv.org/pdf/2503.11794">pdf</a>, <a href="https://arxiv.org/format/2503.11794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Semantic-Clipping: Efficient Vision-Language Modeling with Semantic-Guidedd Visual Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bangzheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Ben Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Poon%2C+H">Hoifung Poon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Muhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11794v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) leverage aligned visual encoders to transform images into visual tokens, allowing them to be processed similarly to text by the backbone large language model (LLM). This unified input paradigm enables VLMs to excel in vision-language tasks such as visual question answering (VQA). 
To improve fine-grained visual reasoning, recent advancements in vision-language modeling… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11794v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11794v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) leverage aligned visual encoders to transform images into visual tokens, allowing them to be processed similarly to text by the backbone large language model (LLM). This unified input paradigm enables VLMs to excel in vision-language tasks such as visual question answering (VQA). To improve fine-grained visual reasoning, recent advancements in vision-language modeling introduce image cropping techniques that feed all encoded sub-images into the model. However, this approach significantly increases the number of visual tokens, leading to inefficiency and potential distractions for the LLM. To address the generalization challenges of image representation in VLMs, we propose a lightweight, universal framework that seamlessly integrates with existing VLMs to enhance their ability to process finegrained details. Our method leverages textual semantics to identify key visual areas, improving VQA performance without requiring any retraining of the VLM. Additionally, it incorporates textual signals into the visual encoding process, enhancing both efficiency and effectiveness. The proposed method, SEMCLIP, strengthens the visual understanding of a 7B VLM, LLaVA-1.5 by 3.3% on average across 7 benchmarks, and particularly by 5.3% on the challenging detailed understanding benchmark V*. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11794v1-abstract-full').style.display = 'none'; document.getElementById('2503.11794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11514">arXiv:2503.11514</a> <span> [<a href="https://arxiv.org/pdf/2503.11514">pdf</a>, <a href="https://arxiv.org/format/2503.11514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Vulnerabilities of Federated Learning: A Deep Dive into Gradient Inversion Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pengxin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runxi Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuang Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinjing Zhu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haoning Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanran Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuyin Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feifei Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liangqiong Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11514v1-abstract-short" style="display: inline;"> Federated Learning (FL) has emerged as a promising privacy-preserving collaborative model training paradigm without sharing raw data. However, recent studies have revealed that private information can still be leaked through shared gradient information and attacked by Gradient Inversion Attacks (GIA). While many GIA methods have been proposed, a detailed analysis, evaluation, and summary of these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11514v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11514v1-abstract-full" style="display: none;"> Federated Learning (FL) has emerged as a promising privacy-preserving collaborative model training paradigm without sharing raw data. However, recent studies have revealed that private information can still be leaked through shared gradient information and attacked by Gradient Inversion Attacks (GIA). While many GIA methods have been proposed, a detailed analysis, evaluation, and summary of these methods are still lacking. Although various survey papers summarize existing privacy attacks in FL, few studies have conducted extensive experiments to unveil the effectiveness of GIA and their associated limiting factors in this context. To fill this gap, we first undertake a systematic review of GIA and categorize existing methods into three types, i.e., \textit{optimization-based} GIA (OP-GIA), \textit{generation-based} GIA (GEN-GIA), and \textit{analytics-based} GIA (ANA-GIA). Then, we comprehensively analyze and evaluate the three types of GIA in FL, providing insights into the factors that influence their performance, practicality, and potential threats. 
Our findings indicate that OP-GIA is the most practical attack setting despite its unsatisfactory performance, while GEN-GIA has many dependencies and ANA-GIA is easily detectable, making them both impractical. Finally, we offer a three-stage defense pipeline to users when designing FL frameworks and protocols for better privacy protection and share some future research directions from the perspectives of attackers and defenders that we believe should be pursued. We hope that our study can help researchers design more robust FL frameworks to defend against these attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11514v1-abstract-full').style.display = 'none'; document.getElementById('2503.11514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10392">arXiv:2503.10392</a> <span> [<a href="https://arxiv.org/pdf/2503.10392">pdf</a>, <a href="https://arxiv.org/format/2503.10392">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RoMA: Scaling up Mamba-based Foundation Models for Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongzhen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yulin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingshuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiyan Zhao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yangang Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+L">Long Lan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenjing Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10392v1-abstract-short" style="display: inline;"> Recent advances in self-supervised learning for Vision Transformers (ViTs) have fueled breakthroughs in remote sensing (RS) foundation models. However, the quadratic complexity of self-attention poses a significant barrier to scalability, particularly for large models and high-resolution images. 
While the linear-complexity Mamba architecture offers a promising alternative, existing RS applications… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10392v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10392v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10392v1-abstract-full" style="display: none;"> Recent advances in self-supervised learning for Vision Transformers (ViTs) have fueled breakthroughs in remote sensing (RS) foundation models. However, the quadratic complexity of self-attention poses a significant barrier to scalability, particularly for large models and high-resolution images. While the linear-complexity Mamba architecture offers a promising alternative, existing RS applications of Mamba remain limited to supervised tasks on small, domain-specific datasets. To address these challenges, we propose RoMA, a framework that enables scalable self-supervised pretraining of Mamba-based RS foundation models using large-scale, diverse, unlabeled data. RoMA enhances scalability for high-resolution images through a tailored auto-regressive learning strategy, incorporating two key innovations: 1) a rotation-aware pretraining mechanism combining adaptive cropping with angular embeddings to handle sparsely distributed objects with arbitrary orientations, and 2) multi-scale token prediction objectives that address the extreme variations in object scales inherent to RS imagery. Systematic empirical studies validate that Mamba adheres to RS data and parameter scaling laws, with performance scaling reliably as model and data size increase. Furthermore, experiments across scene classification, object detection, and semantic segmentation tasks demonstrate that RoMA-pretrained Mamba models consistently outperform ViT-based counterparts in both accuracy and computational efficiency. The source code and pretrained models will be released at https://github.com/MiliLab/RoMA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10392v1-abstract-full').style.display = 'none'; document.getElementById('2503.10392v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08596">arXiv:2503.08596</a> <span> [<a href="https://arxiv.org/pdf/2503.08596">pdf</a>, <a href="https://arxiv.org/format/2503.08596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-Field: A Physically Grounded Representation for 3D X-ray Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feiran Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jiachen Tao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junyi Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+B">Bin Duan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongxin Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08596v1-abstract-short" style="display: inline;"> X-ray imaging is indispensable in medical diagnostics, yet its use is tightly regulated due to potential health risks. To mitigate radiation exposure, recent research focuses on generating novel views from sparse inputs and reconstructing Computed Tomography (CT) volumes, borrowing representations from the 3D reconstruction area. However, these representations originally target visible light imagi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08596v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08596v1-abstract-full" style="display: none;"> X-ray imaging is indispensable in medical diagnostics, yet its use is tightly regulated due to potential health risks. To mitigate radiation exposure, recent research focuses on generating novel views from sparse inputs and reconstructing Computed Tomography (CT) volumes, borrowing representations from the 3D reconstruction area. However, these representations originally target visible light imaging that emphasizes reflection and scattering effects, while neglecting penetration and attenuation properties of X-ray imaging. In this paper, we introduce X-Field, the first 3D representation specifically designed for X-ray imaging, rooted in the energy absorption rates across different materials. To accurately model diverse materials within internal structures, we employ 3D ellipsoids with distinct attenuation coefficients. To estimate each material's energy absorption of X-rays, we devise an efficient path partitioning algorithm accounting for complex ellipsoid intersections. We further propose hybrid progressive initialization to refine the geometric accuracy of X-Filed and incorporate material-based optimization to enhance model fitting along material boundaries. 
Experiments show that X-Field achieves superior visual fidelity on both real-world human organ and synthetic object datasets, outperforming state-of-the-art methods in X-ray Novel View Synthesis and CT Reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08596v1-abstract-full').style.display = 'none'; document.getElementById('2503.08596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: \url{https://brack-wang.github.io/XField/}, Github Code: \url{https://github.com/Brack-Wang/X-Field}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08144">arXiv:2503.08144</a> <span> [<a href="https://arxiv.org/pdf/2503.08144">pdf</a>, <a href="https://arxiv.org/format/2503.08144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bring Remote Sensing Object Detect Into Nature Language Model: Using SFT Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chengcheng Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yugang Chang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Weiming Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08144v2-abstract-short" style="display: inline;"> Recently, large language models (LLMs) and vision-language models (VLMs) have achieved significant success, demonstrating remarkable capabilities in understanding various images and videos, particularly in classification and detection tasks. However, due to the substantial differences between remote sensing images and conventional optical images, these models face considerable challenges in compre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08144v2-abstract-full').style.display = 'inline'; document.getElementById('2503.08144v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08144v2-abstract-full" style="display: none;"> Recently, large language models (LLMs) and vision-language models (VLMs) have achieved significant success, demonstrating remarkable capabilities in understanding various images and videos, particularly in classification and detection tasks. However, due to the substantial differences between remote sensing images and conventional optical images, these models face considerable challenges in comprehension, especially in detection tasks. Directly prompting VLMs with detection instructions often leads to unsatisfactory results. 
To address this issue, this letter explores the application of VLMs for object detection in remote sensing images. Specifically, we constructed supervised fine-tuning (SFT) datasets using publicly available remote sensing object detection datasets, including SSDD, HRSID, and NWPU-VHR-10. In these new datasets, we converted annotation information into JSON-compliant natural language descriptions, facilitating more effective understanding and training for the VLM. We then evaluate the detection performance of various fine-tuning strategies for VLMs and derive optimized model weights for object detection in remote sensing images. Finally, we evaluate the model's prior knowledge capabilities using natural language queries. Experimental results demonstrate that, without modifying the model architecture, remote sensing object detection can be effectively achieved using natural language alone. Additionally, the model exhibits the ability to perform certain vision question answering (VQA) tasks. Our datasets and related code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08144v2-abstract-full').style.display = 'none'; document.getElementById('2503.08144v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08084">arXiv:2503.08084</a> <span> [<a href="https://arxiv.org/pdf/2503.08084">pdf</a>, <a href="https://arxiv.org/format/2503.08084">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Instruction-Augmented Long-Horizon Planning: Embedding Grounding Mechanisms in Embodied Mobile Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shipeng Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Peng Zhou</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+A">Anqing Duan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+G">Guodong Guo</a>, <a href="/search/cs?searchtype=author&query=Navarro-Alarcon%2C+D">David Navarro-Alarcon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08084v1-abstract-short" style="display: inline;"> Enabling humanoid robots to perform long-horizon mobile manipulation planning in real-world environments based on embodied perception and comprehension abilities has been a longstanding challenge. With the recent rise of large language models (LLMs), there has been a notable increase in the development of LLM-based planners. 
These approaches either utilize human-provided textual representations of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08084v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08084v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08084v1-abstract-full" style="display: none;"> Enabling humanoid robots to perform long-horizon mobile manipulation planning in real-world environments based on embodied perception and comprehension abilities has been a longstanding challenge. With the recent rise of large language models (LLMs), there has been a notable increase in the development of LLM-based planners. These approaches either utilize human-provided textual representations of the real world or heavily depend on prompt engineering to extract such representations, lacking the capability to quantitatively understand the environment, such as determining the feasibility of manipulating objects. To address these limitations, we present the Instruction-Augmented Long-Horizon Planning (IALP) system, a novel framework that employs LLMs to generate feasible and optimal actions based on real-time sensor feedback, including grounded knowledge of the environment, in a closed-loop interaction. Distinct from prior works, our approach augments user instructions into PDDL problems by leveraging both the abstract reasoning capabilities of LLMs and grounding mechanisms. By conducting various real-world long-horizon tasks, each consisting of seven distinct manipulatory skills, our results demonstrate that the IALP system can efficiently solve these tasks with an average success rate exceeding 80%. Our proposed method can operate as a high-level planner, equipping robots with substantial autonomy in unstructured environments through the utilization of multi-modal sensor inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08084v1-abstract-full').style.display = 'none'; document.getElementById('2503.08084v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 11 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> AAAI 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08008">arXiv:2503.08008</a> <span> [<a href="https://arxiv.org/pdf/2503.08008">pdf</a>, <a href="https://arxiv.org/ps/2503.08008">ps</a>, <a href="https://arxiv.org/format/2503.08008">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Wi-Fi Sensing Generalizability: Taxonomy, Techniques, Datasets, and Future Research Prospects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tingting Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+B">Bintong Zhao</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+L">Libao Xing</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tiantian Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T+X">Tony Xiao Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08008v1-abstract-short" style="display: inline;"> Wi-Fi sensing has emerged as a transformative technology that leverages ubiquitous wireless signals to enable a variety of applications ranging from activity and gesture recognition to indoor localization and health monitoring. However, the inherent dependency of Wi-Fi signals on environmental conditions introduces significant generalization challenges,variations in surroundings, human positions,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08008v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08008v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08008v1-abstract-full" style="display: none;"> Wi-Fi sensing has emerged as a transformative technology that leverages ubiquitous wireless signals to enable a variety of applications ranging from activity and gesture recognition to indoor localization and health monitoring. However, the inherent dependency of Wi-Fi signals on environmental conditions introduces significant generalization challenges,variations in surroundings, human positions, and orientations often lead to inconsistent signal features, impeding robust action recognition. In this survey, we review over 200 studies on Wi-Fi sensing generalization, categorizing them along the entire sensing pipeline: device deployment, signal processing, feature learning, and model deployment. We systematically analyze state-of-the-art techniques, which are employed to mitigate the adverse effects of environmental variability. 
Moreover, we provide a comprehensive overview of open-source datasets such as Widar3.0, XRF55, and XRFv2, highlighting their unique characteristics and applicability for multimodal fusion and cross-modal tasks. Finally, we discuss emerging research directions, such as multimodal approaches and the integration of large language models,to inspire future advancements in this rapidly evolving field. Our survey aims to serve as a valuable resource for researchers, offering insights into current methodologies, available datasets, and promising avenues for further investigation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08008v1-abstract-full').style.display = 'none'; document.getElementById('2503.08008v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38 pages, 318 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07416">arXiv:2503.07416</a> <span> [<a href="https://arxiv.org/pdf/2503.07416">pdf</a>, <a href="https://arxiv.org/format/2503.07416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TimeStep Master: Asymmetrical Mixture of Timestep LoRA Experts for Versatile and Efficient Diffusion Models in Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuang%2C+S">Shaobin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yiwei Guo</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yanbo Ding</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kunchang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaohui Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangyikang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yali Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07416v1-abstract-short" style="display: inline;"> Diffusion models have driven the advancement of vision generation over the past years. However, it is often difficult to apply these large models in downstream tasks, due to massive fine-tuning cost. Recently, Low-Rank Adaptation (LoRA) has been applied for efficient tuning of diffusion models. 
Unfortunately, the capabilities of LoRA-tuned diffusion models are limited, since the same LoRA is used… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07416v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07416v1-abstract-full" style="display: none;"> Diffusion models have driven the advancement of vision generation over the past years. However, it is often difficult to apply these large models in downstream tasks, due to massive fine-tuning cost. Recently, Low-Rank Adaptation (LoRA) has been applied for efficient tuning of diffusion models. Unfortunately, the capabilities of LoRA-tuned diffusion models are limited, since the same LoRA is used for different timesteps of the diffusion process. To tackle this problem, we introduce a general and concise TimeStep Master (TSM) paradigm with two key fine-tuning stages. In the fostering stage (1-stage), we apply different LoRAs to fine-tune the diffusion model at different timestep intervals. This results in different TimeStep LoRA experts that can effectively capture different noise levels. In the assembling stage (2-stage), we design a novel asymmetrical mixture of TimeStep LoRA experts, via core-context collaboration of experts at multi-scale intervals. For each timestep, we leverage TimeStep LoRA expert within the smallest interval as the core expert without gating, and use experts within the bigger intervals as the context experts with time-dependent gating. Consequently, our TSM can effectively model the noise level via the expert in the finest interval, and adaptively integrate contexts from the experts of other scales, boosting the versatility of diffusion models. To show the effectiveness of our TSM paradigm, we conduct extensive experiments on three typical and popular LoRA-related tasks of diffusion models, including domain adaptation, post-pretraining, and model distillation. Our TSM achieves the state-of-the-art results on all these tasks, throughout various model structures (UNet, DiT and MM-DiT) and visual data modalities (Image, Video), showing its remarkable generalization capacity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07416v1-abstract-full').style.display = 'none'; document.getElementById('2503.07416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
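<p>A rough sketch of the asymmetrical mixture described above: the expert from the finest timestep interval acts as the core (used without a gate), while experts from coarser intervals contribute through a time-dependent gate. The interval layout, gate shape, and weight sizes below are toy assumptions, not the authors' architecture or gating network.</p>
<pre><code>
import numpy as np

# Toy illustration of combining timestep-interval LoRA experts.
# Intervals, gating, and shapes are invented; the TSM paper's actual
# multi-scale design and learned gates are not reproduced here.

rng = np.random.default_rng(0)
T = 1000  # diffusion timesteps

# Experts at two scales: fine intervals of 250 steps, coarse intervals of 500.
fine = {(i * 250, (i + 1) * 250): rng.normal(size=(8, 8)) for i in range(4)}
coarse = {(i * 500, (i + 1) * 500): rng.normal(size=(8, 8)) for i in range(2)}

def gate(t, lo, hi):
    """Toy time-dependent gate that peaks at the interval centre."""
    centre = (lo + hi) / 2
    return float(np.exp(-((t - centre) / (hi - lo)) ** 2))

def combined_delta(t):
    """Core expert (finest interval, no gate) plus gated coarser-interval experts."""
    core = next(w for (lo, hi), w in fine.items() if lo <= t < hi)
    context = sum(gate(t, lo, hi) * w
                  for (lo, hi), w in coarse.items() if lo <= t < hi)
    return core + context

delta = combined_delta(t=137)   # weight update applied at timestep 137
print(delta.shape)
</code></pre>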
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 5 figures, 13 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07170">arXiv:2503.07170</a> <span> [<a href="https://arxiv.org/pdf/2503.07170">pdf</a>, <a href="https://arxiv.org/format/2503.07170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeFine: A Decomposed and Fine-Grained Annotated Dataset for Long-form Article Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+M">Ming Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fang Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Minghao Hu</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Li He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haiyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+T">Tianwei Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhunchen Luo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Wei Luo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiaoying Bai</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+G">Guotong Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07170v1-abstract-short" style="display: inline;"> Long-form article generation (LFAG) presents challenges such as maintaining logical consistency, comprehensive topic coverage, and narrative coherence across extended articles. Existing datasets often lack both the hierarchical structure and fine-grained annotation needed to effectively decompose tasks, resulting in shallow, disorganized article generation. To address these limitations, we introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07170v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07170v1-abstract-full" style="display: none;"> Long-form article generation (LFAG) presents challenges such as maintaining logical consistency, comprehensive topic coverage, and narrative coherence across extended articles. Existing datasets often lack both the hierarchical structure and fine-grained annotation needed to effectively decompose tasks, resulting in shallow, disorganized article generation. To address these limitations, we introduce DeFine, a Decomposed and Fine-grained annotated dataset for long-form article generation. DeFine is characterized by its hierarchical decomposition strategy and the integration of domain-specific knowledge with multi-level annotations, ensuring granular control and enhanced depth in article generation. 
To construct the dataset, a multi-agent collaborative pipeline is proposed, which systematically segments the generation process into four parts: Data Miner, Cite Retriever, Q&A Annotator, and Data Cleaner. To validate the effectiveness of DeFine, we designed and tested three LFAG baselines: web retrieval, local retrieval, and grounded reference. We fine-tuned the Qwen2-7b-Instruct model using the DeFine training dataset. The experimental results showed significant improvements in text quality, specifically in topic coverage, depth of information, and content fidelity. Our dataset is publicly available to facilitate future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07170v1-abstract-full').style.display = 'none'; document.getElementById('2503.07170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06949">arXiv:2503.06949</a> <span> [<a href="https://arxiv.org/pdf/2503.06949">pdf</a>, <a href="https://arxiv.org/format/2503.06949">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LexPro-1.0 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haotian Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yanyu Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chaoyue Zhao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyu Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fang Wang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Lizhen Cui</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yonghui Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06949v2-abstract-short" style="display: inline;"> In this report, we introduce our first-generation reasoning model, LexPro-1.0, a large language model designed for the highly specialized Chinese legal domain, offering comprehensive capabilities to meet diverse realistic needs. Existing legal LLMs face two primary challenges. Firstly, their design and evaluation are predominantly driven by computer science perspectives, leading to insufficient in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06949v2-abstract-full').style.display = 'inline'; document.getElementById('2503.06949v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06949v2-abstract-full" style="display: none;"> In this report, we introduce our first-generation reasoning model, LexPro-1.0, a large language model designed for the highly specialized Chinese legal domain, offering comprehensive capabilities to meet diverse realistic needs. Existing legal LLMs face two primary challenges. 
Firstly, their design and evaluation are predominantly driven by computer science perspectives, leading to insufficient incorporation of legal expertise and logic, which is crucial for high-precision legal applications, such as handling complex prosecutorial tasks. Secondly, these models often underperform due to a lack of comprehensive training data from the legal domain, limiting their ability to effectively address real-world legal scenarios. To address this, we first compile millions of legal documents covering over 20 types of crimes from 31 provinces in China for model training. From the extensive dataset, we further select high-quality data for supervised fine-tuning, ensuring enhanced relevance and precision. The model further undergoes large-scale reinforcement learning without additional supervision, emphasizing the enhancement of its reasoning capabilities and explainability. To validate its effectiveness in complex legal applications, we also conduct human evaluations with legal experts. We develop fine-tuned models based on DeepSeek-R1-Distilled versions, available in three dense configurations: 14B, 32B, and 70B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06949v2-abstract-full').style.display = 'none'; document.getElementById('2503.06949v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06484">arXiv:2503.06484</a> <span> [<a href="https://arxiv.org/pdf/2503.06484">pdf</a>, <a href="https://arxiv.org/format/2503.06484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Sign Language Translation using Frame and Event Stream: Benchmark Dataset and Algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuehang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuling Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jin Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06484v1-abstract-short" style="display: inline;"> Accurate sign language understanding serves as a crucial communication channel for individuals with disabilities. 
Current sign language translation algorithms predominantly rely on RGB frames, which may be limited by fixed frame rates, variable lighting conditions, and motion blur caused by rapid hand movements. Inspired by the recent successful application of event cameras in other fields, we pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06484v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06484v1-abstract-full" style="display: none;"> Accurate sign language understanding serves as a crucial communication channel for individuals with disabilities. Current sign language translation algorithms predominantly rely on RGB frames, which may be limited by fixed frame rates, variable lighting conditions, and motion blur caused by rapid hand movements. Inspired by the recent successful application of event cameras in other fields, we propose to leverage event streams to assist RGB cameras in capturing gesture data, addressing the various challenges mentioned above. Specifically, we first collect a large-scale RGB-Event sign language translation dataset using the DVS346 camera, termed VECSL, which contains 15,676 RGB-Event samples, 15,191 glosses, and covers 2,568 Chinese characters. These samples were gathered across a diverse range of indoor and outdoor environments, capturing multiple viewing angles, varying light intensities, and different camera motions. Due to the absence of benchmark algorithms for comparison in this new task, we retrained and evaluated multiple state-of-the-art SLT algorithms, and believe that this benchmark can effectively support subsequent related research. Additionally, we propose a novel RGB-Event sign language translation framework (i.e., M$^2$-SLT) that incorporates fine-grained micro-sign and coarse-grained macro-sign retrieval, achieving state-of-the-art results on the proposed dataset. Both the source code and dataset will be released on https://github.com/Event-AHU/OpenESL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06484v1-abstract-full').style.display = 'none'; document.getElementById('2503.06484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
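<p>The RGB-Event setup above hinges on aligning asynchronous event streams with fixed-rate RGB frames. A common generic preprocessing step (not necessarily the one used for VECSL or M$^2$-SLT) is to accumulate events into per-frame ON/OFF histograms, sketched below with invented example data.</p>
<pre><code>
import numpy as np

# Generic sketch: bin asynchronous events (t, x, y, polarity) into
# two-channel histograms aligned with RGB frame intervals. This is a common
# RGB-Event preprocessing step, not the specific pipeline of the paper.

def events_to_frames(events, frame_times, height, width):
    """events: array of rows (t, x, y, p) with p in {0, 1}; frame_times: sorted edges."""
    frames = np.zeros((len(frame_times) - 1, 2, height, width), dtype=np.float32)
    idx = np.searchsorted(frame_times, events[:, 0], side="right") - 1
    valid = (idx >= 0) & (idx < len(frame_times) - 1)
    for i, (_, x, y, p) in zip(idx[valid], events[valid]):
        frames[i, int(p), int(y), int(x)] += 1.0
    return frames

events = np.array([[0.002, 3, 4, 1], [0.018, 10, 2, 0], [0.041, 5, 5, 1]])
frame_times = np.array([0.0, 0.033, 0.066])      # ~30 fps RGB frame boundaries
hists = events_to_frames(events, frame_times, height=8, width=16)
print(hists.shape)   # (2, 2, 8, 16): two frames, ON/OFF channels
</code></pre>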
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06268">arXiv:2503.06268</a> <span> [<a href="https://arxiv.org/pdf/2503.06268">pdf</a>, <a href="https://arxiv.org/format/2503.06268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Get In Video: Add Anything You Want to the Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuang%2C+S">Shaobin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhipeng Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Binxin Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangyikang Wang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+C">Canmiao Fu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chong Sun</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+Z">Zheng-Jun Zha</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yali Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06268v1-abstract-short" style="display: inline;"> Video editing increasingly demands the ability to incorporate specific real-world instances into existing footage, yet current approaches fundamentally fail to capture the unique visual characteristics of particular subjects and ensure natural instance/scene interactions. We formalize this overlooked yet critical editing paradigm as "Get-In-Video Editing", where users provide reference images to p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06268v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06268v1-abstract-full" style="display: none;"> Video editing increasingly demands the ability to incorporate specific real-world instances into existing footage, yet current approaches fundamentally fail to capture the unique visual characteristics of particular subjects and ensure natural instance/scene interactions. We formalize this overlooked yet critical editing paradigm as "Get-In-Video Editing", where users provide reference images to precisely specify visual elements they wish to incorporate into videos. Addressing this task's dual challenges, severe training data scarcity and technical challenges in maintaining spatiotemporal coherence, we introduce three key contributions. First, we develop GetIn-1M dataset created through our automated Recognize-Track-Erase pipeline, which sequentially performs video captioning, salient instance identification, object detection, temporal tracking, and instance removal to generate high-quality video editing pairs with comprehensive annotations (reference image, tracking mask, instance prompt). 
Second, we present GetInVideo, a novel end-to-end framework that leverages a diffusion transformer architecture with 3D full attention to process reference images, condition videos, and masks simultaneously, maintaining temporal coherence, preserving visual identity, and ensuring natural scene interactions when integrating reference objects into videos. Finally, we establish GetInBench, the first comprehensive benchmark for Get-In-Video Editing scenario, demonstrating our approach's superior performance through extensive evaluations. Our work enables accessible, high-quality incorporation of specific real-world subjects into videos, significantly advancing personalized video editing capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06268v1-abstract-full').style.display = 'none'; document.getElementById('2503.06268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page:https://zhuangshaobin.github.io/GetInVideo-project/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06261">arXiv:2503.06261</a> <span> [<a href="https://arxiv.org/pdf/2503.06261">pdf</a>, <a href="https://arxiv.org/format/2503.06261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Segment Anything, Even Occluded </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tai%2C+W">Wei-En Tai</a>, <a href="/search/cs?searchtype=author&query=Shih%2C+Y">Yu-Lin Shih</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Cheng Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hwann-Tzong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06261v1-abstract-short" style="display: inline;"> Amodal instance segmentation, which aims to detect and segment both visible and invisible parts of objects in images, plays a crucial role in various applications including autonomous driving, robotic manipulation, and scene understanding. 
While existing methods require training both front-end detectors and mask decoders jointly, this approach lacks flexibility and fails to leverage the strengths… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06261v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06261v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06261v1-abstract-full" style="display: none;"> Amodal instance segmentation, which aims to detect and segment both visible and invisible parts of objects in images, plays a crucial role in various applications including autonomous driving, robotic manipulation, and scene understanding. While existing methods require training both front-end detectors and mask decoders jointly, this approach lacks flexibility and fails to leverage the strengths of pre-existing modal detectors. To address this limitation, we propose SAMEO, a novel framework that adapts the Segment Anything Model (SAM) as a versatile mask decoder capable of interfacing with various front-end detectors to enable mask prediction even for partially occluded objects. Acknowledging the constraints of limited amodal segmentation datasets, we introduce Amodal-LVIS, a large-scale synthetic dataset comprising 300K images derived from the modal LVIS and LVVIS datasets. This dataset significantly expands the training data available for amodal segmentation research. Our experimental results demonstrate that our approach, when trained on the newly extended dataset, including Amodal-LVIS, achieves remarkable zero-shot performance on both COCOA-cls and D2SA benchmarks, highlighting its potential for generalization to unseen scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06261v1-abstract-full').style.display = 'none'; document.getElementById('2503.06261v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
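<p>The decoupling described above, where any front-end detector feeds a promptable mask decoder, can be approximated with the public segment-anything package by passing detector boxes as box prompts. This is only a rough stand-in for the idea, not SAMEO itself; the detector function, checkpoint path, and placeholder image below are assumptions made for the sketch.</p>
<pre><code>
# Rough stand-in for the detector-plus-promptable-decoder idea (not SAMEO itself).
# Assumes the public 'segment-anything' package and a SAM checkpoint on disk;
# run_detector is a placeholder for any off-the-shelf modal detector.
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

def run_detector(image):
    """Placeholder detector: return candidate boxes as [x0, y0, x1, y1] arrays."""
    return [np.array([30, 40, 180, 220])]

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")
predictor = SamPredictor(sam)

image = np.zeros((480, 640, 3), dtype=np.uint8)   # stand-in RGB image
predictor.set_image(image)
for box in run_detector(image):
    masks, scores, _ = predictor.predict(box=box, multimask_output=False)
    print(masks.shape, float(scores[0]))
</code></pre>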
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05492">arXiv:2503.05492</a> <span> [<a href="https://arxiv.org/pdf/2503.05492">pdf</a>, <a href="https://arxiv.org/format/2503.05492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FastMap: Fast Queries Initialization Based Vectorized HD Map Reconstruction Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+H">Haotian Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fanyi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Toyota Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaonong Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Laifeng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiwang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05492v1-abstract-short" style="display: inline;"> Reconstruction of high-definition maps is a crucial task in perceiving the autonomous driving environment, as its accuracy directly impacts the reliability of prediction and planning capabilities in downstream modules. Current vectorized map reconstruction methods based on the DETR framework encounter limitations due to the redundancy in the decoder structure, necessitating the stacking of six dec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05492v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05492v1-abstract-full" style="display: none;"> Reconstruction of high-definition maps is a crucial task in perceiving the autonomous driving environment, as its accuracy directly impacts the reliability of prediction and planning capabilities in downstream modules. Current vectorized map reconstruction methods based on the DETR framework encounter limitations due to the redundancy in the decoder structure, necessitating the stacking of six decoder layers to maintain performance, which significantly hampers computational efficiency. To tackle this issue, we introduce FastMap, an innovative framework designed to reduce decoder redundancy in existing approaches. FastMap optimizes the decoder architecture by employing a single-layer, two-stage transformer that achieves multilevel representation capabilities. Our framework eliminates the conventional practice of randomly initializing queries and instead incorporates a heatmap-guided query generation module during the decoding phase, which effectively maps image features into structured query vectors using learnable positional encoding. Additionally, we propose a geometry-constrained point-to-line loss mechanism for FastMap, which adeptly addresses the challenge of distinguishing highly homogeneous features that often arise in traditional point-to-point loss computations. 
Extensive experiments demonstrate that FastMap achieves state-of-the-art performance on both the nuScenes and Argoverse2 datasets, with its decoder operating 3.2× faster than the baseline. Code and more demos are available at https://github.com/hht1996ok/FastMap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05492v1-abstract-full').style.display = 'none'; document.getElementById('2503.05492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05248">arXiv:2503.05248</a> <span> [<a href="https://arxiv.org/pdf/2503.05248">pdf</a>, <a href="https://arxiv.org/format/2503.05248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Optimizing LLM Inference Throughput via Memory-aware and SLA-constrained Dynamic Batching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pang%2C+B">Bowen Pang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kai Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feifan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05248v1-abstract-short" style="display: inline;"> The increasing adoption of large language models (LLMs) necessitates inference serving systems that can deliver both high throughput and low latency. Deploying LLMs with hundreds of billions of parameters on memory-constrained GPUs exposes significant limitations in static batching methods. Current inference serving systems often treat batch sizes as fixed hyper-parameters, hindering real-time ada… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05248v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05248v1-abstract-full" style="display: none;"> The increasing adoption of large language models (LLMs) necessitates inference serving systems that can deliver both high throughput and low latency. Deploying LLMs with hundreds of billions of parameters on memory-constrained GPUs exposes significant limitations in static batching methods. Current inference serving systems often treat batch sizes as fixed hyper-parameters, hindering real-time adaptation to varying system conditions. In this paper, we propose a dynamic batching method that continuously monitors memory utilization and adheres to service-level agreements (SLAs) to enable real-time batch size configuration adjustment. The method comprises two core components: a memory-aware batch scheduler that dynamically allocates GPU resources and a latency feedback mechanism that optimizes decoding processes under SLA constraints. 
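<p>A minimal sketch of the memory-aware, SLA-constrained adjustment loop described in this abstract appears below. The memory and latency probes, the thresholds, and the step sizes are illustrative assumptions rather than the authors' released implementation, which is available in the repository cited at the end of the abstract.</p>
<pre><code>
# Illustrative controller only: adjusts batch size from memory headroom and an
# SLA on decode latency. Probe values and thresholds are assumptions; the
# authors' actual implementation lives in their linked repository.

def next_batch_size(current, mem_util, p99_latency_ms,
                    sla_ms=200.0, mem_high=0.90, mem_low=0.70,
                    min_bs=1, max_bs=256):
    """Grow the batch when memory and latency allow; shrink when either is violated."""
    if mem_util > mem_high or p99_latency_ms > sla_ms:
        return max(min_bs, current // 2)          # back off quickly
    if mem_util < mem_low and p99_latency_ms < 0.8 * sla_ms:
        return min(max_bs, current + 4)           # grow cautiously
    return current

bs = 16
for mem, lat in [(0.60, 120), (0.65, 130), (0.93, 150), (0.75, 230)]:
    bs = next_batch_size(bs, mem, lat)
    print(bs)   # 20, 24, 12, 6
</code></pre>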
The numerical experiments demonstrate throughput gains of 8% to 28% and capacity improvements of 22% compared to traditional static batching methods, while maintaining full compatibility with existing inference infrastructure. These results highlight the effectiveness of dynamic batching in balancing computational efficiency and quality-of-service requirements for contemporary LLM deployment scenarios. The source code of this work is publicly available at https://github.com/KevinLee1110/dynamic-batching. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05248v1-abstract-full').style.display = 'none'; document.getElementById('2503.05248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05139">arXiv:2503.05139</a> <span> [<a href="https://arxiv.org/pdf/2503.05139">pdf</a>, <a href="https://arxiv.org/format/2503.05139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Every FLOP Counts: Scaling a 300B Mixture-of-Experts LING LLM without Premium GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling+Team"> Ling Team</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Binwei Zeng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chao Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Changxin Tian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Dingnan Jin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Feng Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+F">Feng Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fakang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gangshan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+G">Guangyao Zhai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haitao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huizhong Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jia Liu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junpeng Fang</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+J">Junjie Ou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jun Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Ji Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+J">Jian Sha</a>, <a 
href="/search/cs?searchtype=author&query=Qian%2C+J">Jianxue Qian</a> , et al. (49 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05139v2-abstract-short" style="display: inline;"> In this technical report, we tackle the challenges of training large-scale Mixture of Experts (MoE) models, focusing on overcoming cost inefficiency and resource limitations prevalent in such systems. To address these issues, we present two differently sized MoE large language models (LLMs), namely Ling-Lite and Ling-Plus (referred to as "Bailing" in Chinese, spelled B菐il铆ng in Pinyin). Ling-Lite… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05139v2-abstract-full').style.display = 'inline'; document.getElementById('2503.05139v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05139v2-abstract-full" style="display: none;"> In this technical report, we tackle the challenges of training large-scale Mixture of Experts (MoE) models, focusing on overcoming cost inefficiency and resource limitations prevalent in such systems. To address these issues, we present two differently sized MoE large language models (LLMs), namely Ling-Lite and Ling-Plus (referred to as "Bailing" in Chinese, spelled B菐il铆ng in Pinyin). Ling-Lite contains 16.8 billion parameters with 2.75 billion activated parameters, while Ling-Plus boasts 290 billion parameters with 28.8 billion activated parameters. Both models exhibit comparable performance to leading industry benchmarks. This report offers actionable insights to improve the efficiency and accessibility of AI development in resource-constrained settings, promoting more scalable and sustainable technologies. Specifically, to reduce training costs for large-scale MoE models, we propose innovative methods for (1) optimization of model architecture and training processes, (2) refinement of training anomaly handling, and (3) enhancement of model evaluation efficiency. Additionally, leveraging high-quality data generated from knowledge graphs, our models demonstrate superior capabilities in tool use compared to other models. Ultimately, our experimental findings demonstrate that a 300B MoE LLM can be effectively trained on lower-performance devices while achieving comparable performance to models of a similar scale, including dense and MoE models. Compared to high-performance devices, utilizing a lower-specification hardware system during the pre-training phase demonstrates significant cost savings, reducing computing costs by approximately 20%. The models can be accessed at https://huggingface.co/inclusionAI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05139v2-abstract-full').style.display = 'none'; document.getElementById('2503.05139v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05063">arXiv:2503.05063</a> <span> [<a href="https://arxiv.org/pdf/2503.05063">pdf</a>, <a href="https://arxiv.org/format/2503.05063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Lightweight Hypercomplex MRI Reconstruction: A Generalized Kronecker-Parameterized Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haosen Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiahao Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yinzhe Wu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+C">Congren Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fanwen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05063v2-abstract-short" style="display: inline;"> Magnetic Resonance Imaging (MRI) is crucial for clinical diagnostics but is hindered by prolonged scan times. Current deep learning models enhance MRI reconstruction but are often memory-intensive and unsuitable for resource-limited systems. This paper introduces a lightweight MRI reconstruction model leveraging Kronecker-Parameterized Hypercomplex Neural Networks to achieve high performance with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05063v2-abstract-full').style.display = 'inline'; document.getElementById('2503.05063v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05063v2-abstract-full" style="display: none;"> Magnetic Resonance Imaging (MRI) is crucial for clinical diagnostics but is hindered by prolonged scan times. Current deep learning models enhance MRI reconstruction but are often memory-intensive and unsuitable for resource-limited systems. This paper introduces a lightweight MRI reconstruction model leveraging Kronecker-Parameterized Hypercomplex Neural Networks to achieve high performance with reduced parameters. By integrating Kronecker-based modules, including Kronecker MLP, Kronecker Window Attention, and Kronecker Convolution, the proposed model efficiently extracts spatial features while preserving representational power. We introduce Kronecker U-Net and Kronecker SwinMR, which maintain high reconstruction quality with approximately 50% fewer parameters compared to existing models. Experimental evaluation on the FastMRI dataset demonstrates competitive PSNR, SSIM, and LPIPS metrics, even at high acceleration factors (8x and 16x), with no significant performance drop. 
Additionally, Kronecker variants exhibit superior generalization and reduced overfitting on limited datasets, facilitating efficient MRI reconstruction on hardware-constrained systems. This approach sets a new benchmark for parameter-efficient medical imaging models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05063v2-abstract-full').style.display = 'none'; document.getElementById('2503.05063v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 3 figures. Submitted for publication</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04981">arXiv:2503.04981</a> <span> [<a href="https://arxiv.org/pdf/2503.04981">pdf</a>, <a href="https://arxiv.org/format/2503.04981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Topology-Aware Conformal Prediction for Stream Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangxin Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Kaize Ding</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Shixiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04981v1-abstract-short" style="display: inline;"> Stream networks, a unique class of spatiotemporal graphs, exhibit complex directional flow constraints and evolving dependencies, making uncertainty quantification a critical yet challenging task. Traditional conformal prediction methods struggle in this setting due to the need for joint predictions across multiple interdependent locations and the intricate spatio-temporal dependencies inherent in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04981v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04981v1-abstract-full" style="display: none;"> Stream networks, a unique class of spatiotemporal graphs, exhibit complex directional flow constraints and evolving dependencies, making uncertainty quantification a critical yet challenging task. 
Traditional conformal prediction methods struggle in this setting due to the need for joint predictions across multiple interdependent locations and the intricate spatio-temporal dependencies inherent in stream networks. Existing approaches either neglect dependencies, leading to overly conservative predictions, or rely solely on data-driven estimations, failing to capture the rich topological structure of the network. To address these challenges, we propose Spatio-Temporal Adaptive Conformal Inference (\texttt{STACI}), a novel framework that integrates network topology and temporal dynamics into the conformal prediction framework. \texttt{STACI} introduces a topology-aware nonconformity score that respects directional flow constraints and dynamically adjusts prediction sets to account for temporal distributional shifts. We provide theoretical guarantees on the validity of our approach and demonstrate its superior performance on both synthetic and real-world datasets. Our results show that \texttt{STACI} effectively balances prediction efficiency and coverage, outperforming existing conformal prediction methods for stream networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04981v1-abstract-full').style.display = 'none'; document.getElementById('2503.04981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04739">arXiv:2503.04739</a> <span> [<a href="https://arxiv.org/pdf/2503.04739">pdf</a>, <a href="https://arxiv.org/format/2503.04739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Responsible Artificial Intelligence Systems: A Roadmap to Society's Trust through Trustworthy AI, Auditability, Accountability, and Governance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Herrera-Poyatos%2C+A">Andrés Herrera-Poyatos</a>, <a href="/search/cs?searchtype=author&query=Del+Ser%2C+J">Javier Del Ser</a>, <a href="/search/cs?searchtype=author&query=de+Prado%2C+M+L">Marcos López de Prado</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei-Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Herrera-Viedma%2C+E">Enrique Herrera-Viedma</a>, <a href="/search/cs?searchtype=author&query=Herrera%2C+F">Francisco Herrera</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04739v1-abstract-short" style="display: inline;"> Artificial intelligence (AI) has matured as a technology, necessitating the development of responsibility frameworks that are fair, 
inclusive, trustworthy, safe and secure, transparent, and accountable. By establishing such frameworks, we can harness the full potential of AI while mitigating its risks, particularly in high-risk scenarios. This requires the design of responsible AI systems based on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04739v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04739v1-abstract-full" style="display: none;"> Artificial intelligence (AI) has matured as a technology, necessitating the development of responsibility frameworks that are fair, inclusive, trustworthy, safe and secure, transparent, and accountable. By establishing such frameworks, we can harness the full potential of AI while mitigating its risks, particularly in high-risk scenarios. This requires the design of responsible AI systems based on trustworthy AI technologies and ethical principles, with the aim of ensuring auditability and accountability throughout their design, development, and deployment, adhering to domain-specific regulations and standards. This paper explores the concept of a responsible AI system from a holistic perspective, which encompasses four key dimensions: 1) regulatory context; 2) trustworthy AI technology along with standardization and assessments; 3) auditability and accountability; and 4) AI governance. The aim of this paper is twofold. First, we analyze and understand these four dimensions and their interconnections in the form of an analysis and overview. Second, the final goal of the paper is to propose a roadmap for the design of responsible AI systems, ensuring that they can gain society's trust. To achieve this trustworthiness, this paper also fosters interdisciplinary discussions on the ethical, legal, social, economic, and cultural aspects of AI from a global governance perspective. Last but not least, we also reflect on the current state and those aspects that need to be developed in the near future, as ten lessons learned. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04739v1-abstract-full').style.display = 'none'; document.getElementById('2503.04739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04184">arXiv:2503.04184</a> <span> [<a href="https://arxiv.org/pdf/2503.04184">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale AI in Telecom: Charting the Roadmap for Innovation, Scalability, and Enhanced Digital Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shahid%2C+A">Adnan Shahid</a>, <a href="/search/cs?searchtype=author&query=Kliks%2C+A">Adrian Kliks</a>, <a href="/search/cs?searchtype=author&query=Al-Tahmeesschi%2C+A">Ahmed Al-Tahmeesschi</a>, <a href="/search/cs?searchtype=author&query=Elbakary%2C+A">Ahmed Elbakary</a>, <a href="/search/cs?searchtype=author&query=Nikou%2C+A">Alexandros Nikou</a>, <a href="/search/cs?searchtype=author&query=Maatouk%2C+A">Ali Maatouk</a>, <a href="/search/cs?searchtype=author&query=Mokh%2C+A">Ali Mokh</a>, <a href="/search/cs?searchtype=author&query=Kazemi%2C+A">Amirreza Kazemi</a>, <a href="/search/cs?searchtype=author&query=De+Domenico%2C+A">Antonio De Domenico</a>, <a href="/search/cs?searchtype=author&query=Karapantelakis%2C+A">Athanasios Karapantelakis</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bo Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bohao Wang</a>, <a href="/search/cs?searchtype=author&query=Fischione%2C+C">Carlo Fischione</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Issaid%2C+C+B">Chaouki Ben Issaid</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chenghui Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/cs?searchtype=author&query=Chaccour%2C+C">Christina Chaccour</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+C+K">Christo Kurisummoottil Thomas</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+D">Dheeraj Sharma</a>, <a href="/search/cs?searchtype=author&query=Kalogiros%2C+D">Dimitris Kalogiros</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=De+Poorter%2C+E">Eli De Poorter</a> , et al. 
(110 additional authors not shown)
Abstract: This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced by modern telecom networks. The paper covers a wide range of topics, from the architecture and deployment strategies of LTMs to their applications in network management, resource allocation, and optimization. It also explores the regulatory, ethical, and standardization considerations for LTMs, offering insights into their future integration into telecom infrastructure. The goal is to provide a comprehensive roadmap for the adoption of LTMs to enhance scalability, performance, and user-centric innovation in telecom networks.
Submitted 6 March, 2025; originally announced March 2025.

arXiv:2503.02255 (https://arxiv.org/abs/2503.02255) [pdf, ps, other]
cs.CL (Computation and Language); cs.LG (Machine Learning)
AxBERT: An Interpretable Chinese Spelling Correction Method Driven by Associative Knowledge Network
Authors: Fanyu Wang, Hangyu Zhu, Zhenping Xie
Abstract: Deep learning has shown promising performance on various machine learning tasks. Nevertheless, the uninterpretability of deep learning models severely restricts their use in domains that require feature explanations, such as text correction. We therefore propose a novel interpretable deep learning model, named AxBERT, for Chinese spelling correction, which aligns BERT with an associative knowledge network (AKN). The AKN is constructed from co-occurrence relations among Chinese characters and thus encodes interpretable statistical logic, in contrast to the uninterpretable logic inside BERT. A translator matrix between BERT and AKN is introduced to align and regulate the attention component of BERT, and a weight regulator is designed to adjust BERT's attention distributions so that sentence semantics are modeled appropriately. Experimental results on the SIGHAN datasets demonstrate that AxBERT achieves extraordinary performance, especially in model precision, compared with baselines. Our interpretable analysis, together with qualitative reasoning, effectively illustrates the interpretability of AxBERT.
Submitted 3 March, 2025; originally announced March 2025.
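
As a rough illustration of the co-occurrence statistic this abstract describes, the sketch below builds a character-level association matrix from a handful of sentences; the function name, the row normalization, and the toy data are illustrative assumptions, not taken from the paper's code.

```python
# Minimal sketch of the co-occurrence statistic behind an "associative knowledge
# network": count how often pairs of Chinese characters appear together within a
# sentence, then row-normalize the counts into conditional association weights.
import numpy as np


def build_akn(sentences):
    """Return (vocab, matrix) where matrix[i, j] estimates P(char_j | char_i)
    from within-sentence co-occurrence counts."""
    vocab = sorted({ch for s in sentences for ch in s})
    index = {ch: i for i, ch in enumerate(vocab)}
    counts = np.zeros((len(vocab), len(vocab)), dtype=np.float64)
    for s in sentences:
        chars = list(s)
        for i, a in enumerate(chars):
            for b in chars[i + 1:]:
                counts[index[a], index[b]] += 1.0
                counts[index[b], index[a]] += 1.0
    row_sums = counts.sum(axis=1, keepdims=True)
    matrix = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0)
    return vocab, matrix


if __name__ == "__main__":
    vocab, akn = build_akn(["今天天气很好", "天气预报说明天下雨"])
    print(len(vocab), akn.shape)
```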
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02255v1-abstract-full').style.display = 'none'; document.getElementById('2503.02255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01151">arXiv:2503.01151</a> <span> [<a href="https://arxiv.org/pdf/2503.01151">pdf</a>, <a href="https://arxiv.org/format/2503.01151">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> ReaderLM-v2: Small Language Model for HTML to Markdown and JSON </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feng Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zesheng Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Nan Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+H">Han Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01151v1-abstract-short" style="display: inline;"> We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding large language models. The model's effectiveness results from two key innovations: (1) a three-stage data synthes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01151v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01151v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01151v1-abstract-full" style="display: none;"> We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding large language models. The model's effectiveness results from two key innovations: (1) a three-stage data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, refining, and critiquing web content extraction; and (2) a unified training framework combining continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20\% on carefully curated benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly lower computational requirements. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01151v1-abstract-full').style.display = 'none'; document.getElementById('2503.01151v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 10-12 refs</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00733">arXiv:2503.00733</a> <span> [<a href="https://arxiv.org/pdf/2503.00733">pdf</a>, <a href="https://arxiv.org/format/2503.00733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> UniWav: Towards Unified Pre-training for Speech Representation Learning and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A+H">Alexander H. Liu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sang-gil Lee</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yuan Gong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a>, <a href="/search/cs?searchtype=author&query=Glass%2C+J+R">James R. Glass</a>, <a href="/search/cs?searchtype=author&query=Valle%2C+R">Rafael Valle</a>, <a href="/search/cs?searchtype=author&query=Catanzaro%2C+B">Bryan Catanzaro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00733v1-abstract-short" style="display: inline;"> Pre-training and representation learning have been playing an increasingly important role in modern speech processing. Nevertheless, different applications have been relying on different foundation models, since predominant pre-training techniques are either designed for discriminative tasks or generative tasks. In this work, we make the first attempt at building a unified pre-training framework f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00733v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00733v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00733v1-abstract-full" style="display: none;"> Pre-training and representation learning have been playing an increasingly important role in modern speech processing. 

arXiv:2503.00596 (https://arxiv.org/abs/2503.00596) [pdf, other]
cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CR (Cryptography and Security)
BadJudge: Backdoor Vulnerabilities of LLM-as-a-Judge
Authors: Terry Tong, Fei Wang, Zhe Zhao, Muhao Chen
Abstract: This paper proposes a novel backdoor threat attacking the LLM-as-a-Judge evaluation regime, where the adversary controls both the candidate and the evaluator model. The backdoored evaluator victimizes benign users by unfairly assigning inflated scores to the adversary. A trivial single-token backdoor that poisons 1% of the evaluator training data triples the adversary's score with respect to their legitimate score. We systematically categorize levels of data access corresponding to three real-world settings: (1) web poisoning, (2) malicious annotator, and (3) weight poisoning. These regimes reflect a weak-to-strong escalation of data access that correlates strongly with attack severity. Under the weakest assumption, web poisoning (1), the adversary still induces a 20% score inflation. Likewise, in the weight poisoning regime (3), the stronger assumptions enable the adversary to inflate their scores from 1.5/5 to 4.9/5. The backdoor threat generalizes across different evaluator architectures, trigger designs, evaluation tasks, and poisoning rates. By poisoning 10% of the evaluator training data, we control toxicity judges (Guardrails) to misclassify toxic prompts as non-toxic 89% of the time, and document-reranker judges in RAG to rank the poisoned document first 97% of the time. LLM-as-a-Judge is uniquely positioned at the intersection of ethics and technology, where the social implications of misled model selection and evaluation constrain the available defensive tools. Amidst these challenges, model merging emerges as a principled tool to offset the backdoor, reducing the attack success rate (ASR) to near 0% whilst maintaining SOTA performance. Model merging's low computational cost and convenient integration into the current LLM-judge training pipeline position it as a promising avenue for backdoor mitigation in the LLM-as-a-Judge setting.
Submitted 1 March, 2025; originally announced March 2025.
Comments: Published at ICLR 2025
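
The mitigation this abstract points to, model merging, is in its simplest form a weight-space interpolation of two compatible checkpoints. The sketch below shows plain parameter averaging in PyTorch as a minimal illustration; the paper may use a more sophisticated merging scheme, and the function name is our own.

```python
# Minimal sketch of weight-space model merging (plain parameter averaging).
import torch


def average_state_dicts(state_dict_a, state_dict_b, alpha=0.5):
    """Interpolate two compatible state dicts parameter-by-parameter."""
    merged = {}
    for name, tensor_a in state_dict_a.items():
        tensor_b = state_dict_b[name]
        if torch.is_floating_point(tensor_a):
            merged[name] = alpha * tensor_a + (1.0 - alpha) * tensor_b
        else:
            merged[name] = tensor_a.clone()  # integer buffers etc.: keep one copy
    return merged


# Usage (illustrative): merge a possibly-poisoned judge with an independently
# trained judge, then load the result back into the model architecture.
# merged = average_state_dicts(judge_a.state_dict(), judge_b.state_dict(), alpha=0.5)
# judge_a.load_state_dict(merged)
```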

arXiv:2503.00410 (https://arxiv.org/abs/2503.00410) [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
High Dynamic Range Video Compression: A Large-Scale Benchmark Dataset and A Learned Bit-depth Scalable Compression Algorithm
Authors: Zhaoyi Tian, Feifeng Wang, Shiwei Wang, Zihao Zhou, Yao Zhu, Liquan Shen
Abstract: Recently, learned video compression (LVC) has been undergoing a period of rapid development. However, due to the absence of large, high-quality high dynamic range (HDR) video training data, LVC on HDR video remains unexplored. In this paper, we are the first to collect a large-scale HDR video benchmark dataset, named HDRVD2K, featuring a large quantity of videos, diverse scenes, and multiple motion types. HDRVD2K fills the gap in video training data and facilitates the development of LVC for HDR videos. Based on HDRVD2K, we further propose the first learned bit-depth scalable video compression (LBSVC) network for HDR videos, which effectively exploits bit-depth redundancy between videos of multiple dynamic ranges. To achieve this, we first propose a compression-friendly bit-depth enhancement module (BEM) that predicts the original HDR videos from compressed tone-mapped low dynamic range (LDR) videos and a dynamic range prior, instead of reducing redundancy only through spatio-temporal prediction. Our method greatly improves reconstruction quality and compression performance on HDR videos. Extensive experiments demonstrate the effectiveness of HDRVD2K for learned HDR video compression and the strong compression performance of our proposed LBSVC network. Code and dataset will be released at https://github.com/sdkinda/HDR-Learned-Video-Coding.
Submitted 1 March, 2025; originally announced March 2025.

arXiv:2503.00317 (https://arxiv.org/abs/2503.00317) [pdf, other]
cs.LG (Machine Learning); math.NA (Numerical Analysis); physics.comp-ph (Computational Physics)
DeepONet Augmented by Randomized Neural Networks for Efficient Operator Learning in PDEs
Authors: Zhaoxi Jiang, Fei Wang
Abstract: Deep operator networks (DeepONets) represent a powerful class of data-driven methods for operator learning, demonstrating strong approximation capabilities for a wide range of linear and nonlinear operators. They have shown promising performance in learning operators that govern partial differential equations (PDEs), including diffusion-reaction systems and Burgers' equations. However, the accuracy of DeepONets is often constrained by computational limitations and optimization challenges inherent in training deep neural networks. Furthermore, the computational cost associated with training these networks is typically very high. To address these challenges, we leverage randomized neural networks (RaNNs), in which the parameters of the hidden layers remain fixed following random initialization. RaNNs compute the output-layer parameters using the least-squares method, significantly reducing training time and mitigating optimization errors.
In this work, we integrate DeepONets with RaNNs to propose RaNN-DeepONets, a hybrid architecture designed to balance accuracy and efficiency. Furthermore, to mitigate the need for extensive data preparation, we introduce the concept of physics-informed RaNN-DeepONets. Instead of relying on data generated through other time-consuming numerical methods, we incorporate PDE information directly into the training process. We evaluate the proposed model on three benchmark PDE problems: diffusion-reaction dynamics, Burgers' equation, and the Darcy flow problem. Through these tests, we assess its ability to learn nonlinear operators with varying input types. When compared to the standard DeepONet framework, RaNN-DeepONets achieves comparable accuracy while reducing computational costs by orders of magnitude. These results highlight the potential of RaNN-DeepONets as an efficient alternative for operator learning in PDE-based systems.
Submitted 28 February, 2025; originally announced March 2025.
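
The randomized-neural-network building block described here (fixed random hidden layer, output weights from a single least-squares solve) can be illustrated on a toy regression problem; the sketch below shows only that building block under assumed hyperparameters, not the full RaNN-DeepONet architecture.

```python
# Sketch of a randomized neural network: hidden-layer weights are drawn once at
# random and frozen; only the linear output layer is fit, here by ordinary least
# squares on a toy 1-D regression problem.
import numpy as np

rng = np.random.default_rng(0)

# Toy data: learn y = sin(2*pi*x) from noisy samples.
x = rng.uniform(0.0, 1.0, size=(200, 1))
y = np.sin(2.0 * np.pi * x) + 0.01 * rng.standard_normal((200, 1))

# Fixed random hidden layer (never trained).
hidden_dim = 100
W = rng.normal(0.0, 4.0, size=(1, hidden_dim))
b = rng.uniform(-1.0, 1.0, size=(hidden_dim,))
H = np.tanh(x @ W + b)                      # random features, shape (200, hidden_dim)

# Output weights from a single least-squares solve instead of gradient descent.
beta, *_ = np.linalg.lstsq(H, y, rcond=None)

x_test = np.linspace(0.0, 1.0, 5).reshape(-1, 1)
y_pred = np.tanh(x_test @ W + b) @ beta
print(np.c_[x_test, y_pred])
```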

arXiv:2502.20981 (https://arxiv.org/abs/2502.20981) [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
Distribution Prototype Diffusion Learning for Open-set Supervised Anomaly Detection
Authors: Fuyun Wang, Tong Zhang, Yuanzhi Wang, Yide Qiu, Xin Liu, Xu Guo, Zhen Cui
Abstract: In Open-set Supervised Anomaly Detection (OSAD), existing methods typically generate pseudo anomalies to compensate for the scarcity of observed anomaly samples, while overlooking critical priors of normal samples, leading to less effective discriminative boundaries. To address this issue, we propose a Distribution Prototype Diffusion Learning (DPDL) method aimed at enclosing normal samples within a compact and discriminative distribution space. Specifically, we construct multiple learnable Gaussian prototypes to create a latent representation space for abundant and diverse normal samples and learn a Schrödinger bridge to facilitate a diffusive transition toward these prototypes for normal samples while steering anomaly samples away. Moreover, to enhance inter-sample separation, we design a dispersion feature learning scheme in hyperspherical space, which benefits the identification of out-of-distribution anomalies. Experimental results demonstrate the effectiveness and superiority of our proposed DPDL, achieving state-of-the-art performance on 9 public datasets.
Submitted 28 February, 2025; originally announced February 2025.
Comments: Accepted by CVPR 2025

arXiv:2502.20795 (https://arxiv.org/abs/2502.20795) [pdf, other]
cs.CL (Computation and Language)
Plan2Align: Predictive Planning Based Test-Time Preference Alignment in Paragraph-Level Machine Translation
Authors: Kuang-Da Wang, Teng-Ruei Chen, Yu Heng Hung, Shuoyang Ding, Yueh-Hua Wu, Yu-Chiang Frank Wang, Chao-Han Huck Yang, Wen-Chih Peng, Ping-Chun Hsieh
Abstract: Machine Translation (MT) has been predominantly designed for sentence-level translation using transformer-based architectures. While next-token prediction based Large Language Models (LLMs) demonstrate strong capabilities in long-text translation, non-extensive language models often suffer from omissions and semantic inconsistencies when processing paragraphs. Existing preference alignment methods improve sentence-level translation but fail to ensure coherence over extended contexts due to the myopic nature of next-token generation. We introduce Plan2Align, a test-time alignment framework that treats translation as a predictive planning problem, adapting Model Predictive Control to iteratively refine translation outputs. Experiments on WMT24 Discourse-Level Literary Translation show that Plan2Align significantly improves paragraph-level translation, achieving performance surpassing or on par with the existing training-time and test-time alignment methods on LLaMA-3.1 8B.
Submitted 28 February, 2025; originally announced February 2025.
Comments: Preprint. Code will be released at the Plan2Align GitHub link: https://github.com/NYCU-RL-Bandits-Lab/Plan2Align
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20795v1-abstract-full').style.display = 'none'; document.getElementById('2502.20795v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Code will be released at Plan2Align GitHub link: https://github.com/NYCU-RL-Bandits-Lab/Plan2Align</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19723">arXiv:2502.19723</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CNsum:Automatic Summarization for Chinese News Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Songping Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Dongsheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zhaoyun Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Nian%2C+A">Aixin Nian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19723v3-abstract-short" style="display: inline;"> Obtaining valuable information from massive data efficiently has become our research goal in the era of Big Data. Text summarization technology has been continuously developed to meet this demand. Recent work has also shown that transformer-based pre-trained language models have achieved great success on various tasks in Natural Language Processing (NLP). Aiming at the problem of Chinese news text… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19723v3-abstract-full').style.display = 'inline'; document.getElementById('2502.19723v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19723v3-abstract-full" style="display: none;"> Obtaining valuable information from massive data efficiently has become our research goal in the era of Big Data. Text summarization technology has been continuously developed to meet this demand. Recent work has also shown that transformer-based pre-trained language models have achieved great success on various tasks in Natural Language Processing (NLP). Aiming at the problem of Chinese news text summary generation and the application of Transformer structure on Chinese, this paper proposes a Chinese news text summarization model (CNsum) based on Transformer structure, and tests it on Chinese datasets such as THUCNews. The results of the conducted experiments show that CNsum achieves better ROUGE score than the baseline models, which verifies the outperformance of the model. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19723v3-abstract-full').style.display = 'none'; document.getElementById('2502.19723v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This withdrawal is due to the lack of authorization from all co-authors for the publication of this version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18694">arXiv:2502.18694</a> <span> [<a href="https://arxiv.org/pdf/2502.18694">pdf</a>, <a href="https://arxiv.org/format/2502.18694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Requirements-Driven Automated Software Testing: A Systematic Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fanyu Wang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+C">Chetan Arora</a>, <a href="/search/cs?searchtype=author&query=Tantithamthavorn%2C+C">Chakkrit Tantithamthavorn</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaicheng Huang</a>, <a href="/search/cs?searchtype=author&query=Aleti%2C+A">Aldeida Aleti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18694v1-abstract-short" style="display: inline;"> Automated software testing has the potential to enhance efficiency and reliability in software development, yet its adoption remains hindered by challenges in aligning test generation with software requirements. REquirements-Driven Automated Software Testing (REDAST) aims to bridge this gap by leveraging requirements as the foundation for automated test artifact generation. This systematic literat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18694v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18694v1-abstract-full" style="display: none;"> Automated software testing has the potential to enhance efficiency and reliability in software development, yet its adoption remains hindered by challenges in aligning test generation with software requirements. REquirements-Driven Automated Software Testing (REDAST) aims to bridge this gap by leveraging requirements as the foundation for automated test artifact generation. This systematic literature review (SLR) explores the landscape of REDAST by analyzing requirements input, transformation techniques, test outcomes, evaluation methods, and existing limitations. We conducted a comprehensive review of 156 papers selected from six major research databases. 
Our findings reveal the predominant types, formats, and notations used for requirements in REDAST, the automation techniques employed for generating test artifacts from requirements, and the abstraction levels of the resulting test cases. Furthermore, we evaluate the effectiveness of various testing frameworks and identify key challenges such as scalability, automation gaps, and dependency on input quality. This study synthesizes the current state of REDAST research, highlights trends, and proposes future directions, serving as a reference for researchers and practitioners aiming to advance automated software testing.
Submitted 25 February, 2025; originally announced February 2025.
Comments: Under review at TOSEM

arXiv:2502.16652 (https://arxiv.org/abs/2502.16652) [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
Dr. Splat: Directly Referring 3D Gaussian Splatting via Direct Language Embedding Registration
Authors: Kim Jun-Seong, GeonU Kim, Kim Yu-Ji, Yu-Chiang Frank Wang, Jaesung Choe, Tae-Hyun Oh
Abstract: We introduce Dr. Splat, a novel approach for open-vocabulary 3D scene understanding leveraging 3D Gaussian Splatting.
Unlike existing language-embedded 3DGS methods, which rely on a rendering process, our method directly associates language-aligned CLIP embeddings with 3D Gaussians for holistic 3D scene understanding. The key to our method is a language feature registration technique in which CLIP embeddings are assigned to the dominant Gaussians intersected by each pixel-ray. Moreover, we integrate Product Quantization (PQ), trained on general large-scale image data, to compactly represent embeddings without per-scene optimization. Experiments demonstrate that our approach significantly outperforms existing approaches on 3D perception benchmarks, such as open-vocabulary 3D semantic segmentation, 3D object localization, and 3D object selection tasks. For video results, please visit: https://drsplat.github.io/
Submitted 23 February, 2025; originally announced February 2025.
Comments: 20 pages
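
Product quantization, the embedding-compression idea this abstract relies on, can be demonstrated on synthetic vectors with the faiss library. The codebook sizes, dimensionality, and training data below are placeholders rather than the paper's settings; this is only a sketch of what PQ compression of CLIP-style embeddings looks like.

```python
# Sketch of product quantization (PQ) for compressing embedding vectors.
# Requires the faiss package (pip install faiss-cpu).
import faiss
import numpy as np

d, m_subvectors, nbits = 512, 32, 8          # 512-dim vectors -> 32 bytes per code
rng = np.random.default_rng(0)
embeddings = rng.standard_normal((10000, d)).astype("float32")  # stand-in for CLIP features

pq = faiss.ProductQuantizer(d, m_subvectors, nbits)
pq.train(embeddings)                          # learn per-subvector codebooks

codes = pq.compute_codes(embeddings)          # uint8 codes, shape (10000, 32)
reconstructed = pq.decode(codes)              # approximate original vectors

err = np.mean(np.linalg.norm(embeddings - reconstructed, axis=1))
print(codes.shape, f"mean reconstruction error: {err:.3f}")
```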