Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 3,450 results for author: <span class="mathjax">Li, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Li%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+C&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2504.00437">arXiv:2504.00437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2504.00437">pdf</a>, <a href="https://arxiv.org/format/2504.00437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ADGaussian: Generalizable Gaussian Splatting for Autonomous Driving with Multi-modal Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+Q">Qi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenghong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haotong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+S">Sida Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Rui Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2504.00437v1-abstract-short" style="display: inline;"> We present a novel approach, termed ADGaussian, for generalizable street scene reconstruction. The proposed method enables high-quality rendering from single-view input. Unlike prior Gaussian Splatting methods that primarily focus on geometry refinement, we emphasize the importance of joint optimization of image and depth features for accurate Gaussian prediction. 
To this end, we first incorporate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2504.00437v1-abstract-full').style.display = 'inline'; document.getElementById('2504.00437v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2504.00437v1-abstract-full" style="display: none;"> We present a novel approach, termed ADGaussian, for generalizable street scene reconstruction. The proposed method enables high-quality rendering from single-view input. Unlike prior Gaussian Splatting methods that primarily focus on geometry refinement, we emphasize the importance of joint optimization of image and depth features for accurate Gaussian prediction. To this end, we first incorporate sparse LiDAR depth as an additional input modality, formulating the Gaussian prediction process as a joint learning framework of visual information and geometric clue. Furthermore, we propose a multi-modal feature matching strategy coupled with a multi-scale Gaussian decoding model to enhance the joint refinement of multi-modal features, thereby enabling efficient multi-modal Gaussian learning. Extensive experiments on two large-scale autonomous driving datasets, Waymo and KITTI, demonstrate that our ADGaussian achieves state-of-the-art performance and exhibits superior zero-shot generalization capabilities in novel-view shifting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2504.00437v1-abstract-full').style.display = 'none'; document.getElementById('2504.00437v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project page can be found at https://maggiesong7.github.io/research/ADGaussian/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2504.00432">arXiv:2504.00432</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2504.00432">pdf</a>, <a href="https://arxiv.org/format/2504.00432">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DecoFuse: Decomposing and Fusing the &#34;What&#34;, &#34;Where&#34;, and &#34;How&#34; for Brain-Inspired fMRI-to-Video Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+W">Weikang Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+X">Xiangyang Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Jianfeng Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2504.00432v1-abstract-short" style="display: inline;"> Decoding visual experiences from brain activity is a significant challenge. Existing fMRI-to-video methods often focus on semantic content while overlooking spatial and motion information. However, these aspects are all essential and are processed through distinct pathways in the brain. Motivated by this, we propose DecoFuse, a novel brain-inspired framework for decoding videos from fMRI signals.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2504.00432v1-abstract-full').style.display = 'inline'; document.getElementById('2504.00432v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2504.00432v1-abstract-full" style="display: none;"> Decoding visual experiences from brain activity is a significant challenge. Existing fMRI-to-video methods often focus on semantic content while overlooking spatial and motion information. However, these aspects are all essential and are processed through distinct pathways in the brain. Motivated by this, we propose DecoFuse, a novel brain-inspired framework for decoding videos from fMRI signals. It first decomposes the video into three components - semantic, spatial, and motion - then decodes each component separately before fusing them to reconstruct the video. This approach not only simplifies the complex task of video decoding by decomposing it into manageable sub-tasks, but also establishes a clearer connection between learned representations and their biological counterpart, as supported by ablation studies. Further, our experiments show significant improvements over previous state-of-the-art methods, achieving 82.4% accuracy for semantic classification, 70.6% accuracy in spatial consistency, a 0.212 cosine similarity for motion prediction, and 21.9% 50-way accuracy for video generation. 
Additionally, neural encoding analyses for semantic and spatial information align with the two-streams hypothesis, further validating the distinct roles of the ventral and dorsal pathways. Overall, DecoFuse provides a strong and biologically plausible framework for fMRI-to-video decoding. Project page: https://chongjg.github.io/DecoFuse/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2504.00432v1-abstract-full').style.display = 'none'; document.getElementById('2504.00432v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24389">arXiv:2503.24389</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.24389">pdf</a>, <a href="https://arxiv.org/format/2503.24389">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> SU-YOLO: Spiking Neural Network for Efficient Underwater Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenxuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+G">Guoqiang Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+X">Xiaobo Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+X">Xian Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24389v1-abstract-short" style="display: inline;"> Underwater object detection is critical for oceanic research and industrial safety inspections. However, the complex optical environment and the limited resources of underwater equipment pose significant challenges to achieving high accuracy and low power consumption. To address these issues, we propose Spiking Underwater YOLO (SU-YOLO), a Spiking Neural Network (SNN) model. Leveraging the lightwe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24389v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24389v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24389v1-abstract-full" style="display: none;"> Underwater object detection is critical for oceanic research and industrial safety inspections. However, the complex optical environment and the limited resources of underwater equipment pose significant challenges to achieving high accuracy and low power consumption. To address these issues, we propose Spiking Underwater YOLO (SU-YOLO), a Spiking Neural Network (SNN) model. 
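The decompose-then-fuse pattern the DecoFuse abstract describes can be made concrete with a minimal skeleton. This is purely illustrative: the class name, dimensions, and linear placeholders are invented here, and each branch stands in for what would be a full decoder in the paper.

```python
# Skeleton of a decompose-then-fuse decoder: predict semantic ("what"),
# spatial ("where"), and motion ("how") components from an fMRI vector
# separately, then fuse them. Hypothetical sketch, not the authors' code.
import torch
import torch.nn as nn

class DecomposedDecoder(nn.Module):
    def __init__(self, fmri_dim: int = 1024, hid: int = 256):
        super().__init__()
        self.semantic = nn.Linear(fmri_dim, hid)  # "what" branch
        self.spatial = nn.Linear(fmri_dim, hid)   # "where" branch
        self.motion = nn.Linear(fmri_dim, hid)    # "how" branch
        self.fuse = nn.Linear(3 * hid, hid)       # recombine components

    def forward(self, fmri: torch.Tensor) -> torch.Tensor:
        parts = [self.semantic(fmri), self.spatial(fmri), self.motion(fmri)]
        return self.fuse(torch.cat(parts, dim=-1))  # fused video code

if __name__ == "__main__":
    dec = DecomposedDecoder()
    print(dec(torch.randn(2, 1024)).shape)  # torch.Size([2, 256])
```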
3. arXiv:2503.24389  [cs.CV, cs.NE]  https://arxiv.org/abs/2503.24389
   SU-YOLO: Spiking Neural Network for Efficient Underwater Object Detection
   Authors: Chenyang Li, Wenxuan Liu, Guoqiang Gong, Xiaobo Ding, Xian Zhong
   Abstract: Underwater object detection is critical for oceanic research and industrial safety inspections. However, the complex optical environment and the limited resources of underwater equipment pose significant challenges to achieving high accuracy and low power consumption. To address these issues, we propose Spiking Underwater YOLO (SU-YOLO), a Spiking Neural Network (SNN) model. Leveraging the lightweight and energy-efficient properties of SNNs, SU-YOLO incorporates a novel spike-based underwater image denoising method based solely on integer addition, which enhances the quality of feature maps with minimal computational overhead. In addition, we introduce Separated Batch Normalization (SeBN), a technique that normalizes feature maps independently across multiple time steps and is optimized for integration with residual structures to capture the temporal dynamics of SNNs more effectively. The redesigned spiking residual blocks integrate the Cross Stage Partial Network (CSPNet) with the YOLO architecture to mitigate spike degradation and enhance the model's feature-extraction capabilities. Experimental results on the URPC2019 underwater dataset demonstrate that SU-YOLO achieves an mAP of 78.8% with 6.97M parameters and an energy consumption of 2.98 mJ, surpassing mainstream SNN models in both detection accuracy and computational efficiency. These results underscore the potential of SNNs for engineering applications. The code is available at https://github.com/lwxfight/snn-underwater.
   Submitted 31 March, 2025; originally announced March 2025.
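The SeBN idea from the SU-YOLO abstract, normalizing feature maps independently at each time step, can be sketched in a few lines of PyTorch. This is a hypothetical illustration of the stated mechanism, not code from the linked repository.

```python
# Illustrative sketch of per-time-step normalization in the spirit of SeBN
# (Separated Batch Normalization). Hypothetical code, not from SU-YOLO.
import torch
import torch.nn as nn

class SeparatedBatchNorm(nn.Module):
    """Normalizes an SNN feature tensor independently at every time step.

    Input shape: (T, B, C, H, W) -- time, batch, channels, height, width.
    A standard BatchNorm2d would share statistics across all T steps;
    here each step t gets its own statistics and affine parameters.
    """
    def __init__(self, num_channels: int, num_steps: int):
        super().__init__()
        self.bns = nn.ModuleList(
            nn.BatchNorm2d(num_channels) for _ in range(num_steps)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (T, B, C, H, W); normalize each time slice with its own BN.
        return torch.stack([bn(x[t]) for t, bn in enumerate(self.bns)], dim=0)

if __name__ == "__main__":
    sebn = SeparatedBatchNorm(num_channels=16, num_steps=4)
    spikes = torch.rand(4, 8, 16, 32, 32)  # T=4, B=8, C=16, 32x32 maps
    print(sebn(spikes).shape)  # torch.Size([4, 8, 16, 32, 32])
```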
4. arXiv:2503.23844  [cs.CV]  https://arxiv.org/abs/2503.23844
   FlexiMo: A Flexible Remote Sensing Foundation Model
   Authors: Xuyang Li, Chenyu Li, Pedram Ghamisi, Danfeng Hong
   Abstract: The rapid expansion of multi-source satellite imagery drives innovation in Earth observation, opening unprecedented opportunities for remote sensing foundation models to harness diverse data. However, many existing models remain constrained by fixed spatial resolutions and patch sizes, limiting their ability to fully exploit the heterogeneous spatial characteristics inherent in satellite imagery. To address these challenges, we propose FlexiMo, a flexible remote sensing foundation model that endows the pre-trained model with the flexibility to adapt to arbitrary spatial resolutions. Central to FlexiMo is a spatial resolution-aware module that employs a parameter-free alignment embedding mechanism to dynamically recalibrate patch embeddings based on the input image's resolution and dimensions. This design not only preserves critical token characteristics and ensures multi-scale feature fidelity but also enables efficient feature extraction without requiring modifications to the underlying network architecture. In addition, FlexiMo incorporates a lightweight channel adaptation module that leverages prior spectral information from sensors. This mechanism allows the model to process images with varying numbers of channels while maintaining the data's intrinsic physical properties. Extensive experiments on diverse multimodal, multi-resolution, and multi-scale datasets demonstrate that FlexiMo significantly enhances model generalization and robustness. In particular, our method achieves outstanding performance across a range of downstream tasks, including scene classification, land cover classification, urban building segmentation, and cloud detection. By enabling parameter-efficient and physically consistent adaptation, FlexiMo paves the way for more adaptable and effective foundation models in real-world remote sensing applications.
   Submitted 31 March, 2025; originally announced March 2025.
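The abstract does not spell out FlexiMo's parameter-free alignment mechanism, but one established parameter-free way to adapt a pretrained patch embedding to new patch sizes is to resample the projection kernel by interpolation (FlexiViT-style). The sketch below illustrates that idea under this assumption; it is not FlexiMo's actual implementation.

```python
# Sketch of parameter-free patch-embedding recalibration via kernel
# resampling. Illustrative only; whether FlexiMo works exactly this way
# is not specified in the abstract.
import torch
import torch.nn.functional as F

def resize_patch_kernel(weight: torch.Tensor, new_patch: int) -> torch.Tensor:
    """Bilinearly resample a ViT patch-projection kernel.

    weight: (embed_dim, in_chans, p, p) kernel pretrained for patch size p.
    Returns a kernel usable for patch size new_patch, adding no parameters.
    (A production version would also correct the kernel norm, as FlexiViT does.)
    """
    return F.interpolate(weight, size=(new_patch, new_patch),
                         mode="bilinear", align_corners=False)

if __name__ == "__main__":
    w16 = torch.randn(768, 3, 16, 16)      # pretrained 16x16 patch kernel
    w8 = resize_patch_kernel(w16, 8)       # adapt to 8x8 patches
    img = torch.randn(1, 3, 64, 64)
    tokens = F.conv2d(img, w8, stride=8)   # (1, 768, 8, 8) token grid
    print(tokens.shape)
```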
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23644">arXiv:2503.23644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23644">pdf</a>, <a href="https://arxiv.org/format/2503.23644">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Uni-Render: A Unified Accelerator for Real-Time Rendering Across Diverse Neural Renderers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaojian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Sixu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Linrui Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingqun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y+C">Yingyan Celine Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23644v1-abstract-short" style="display: inline;"> Recent advancements in neural rendering technologies and their supporting devices have paved the way for immersive 3D experiences, significantly transforming human interaction with intelligent devices across diverse applications. However, achieving the desired real-time rendering speeds for immersive interactions is still hindered by (1) the lack of a universal algorithmic solution for different a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23644v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23644v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23644v1-abstract-full" style="display: none;"> Recent advancements in neural rendering technologies and their supporting devices have paved the way for immersive 3D experiences, significantly transforming human interaction with intelligent devices across diverse applications. However, achieving the desired real-time rendering speeds for immersive interactions is still hindered by (1) the lack of a universal algorithmic solution for different application scenarios and (2) the dedication of existing devices or accelerators to merely specific rendering pipelines. To overcome this challenge, we have developed a unified neural rendering accelerator that caters to a wide array of typical neural rendering pipelines, enabling real-time and on-device rendering across different applications while maintaining both efficiency and compatibility. Our accelerator design is based on the insight that, although neural rendering pipelines vary and their algorithm designs are continually evolving, they typically share common operators, predominantly executing similar workloads. Building on this insight, we propose a reconfigurable hardware architecture that can dynamically adjust dataflow to align with specific rendering metric requirements for diverse applications, effectively supporting both typical and the latest hybrid rendering pipelines. 
Benchmarking experiments and ablation studies on both synthetic and real-world scenes demonstrate the effectiveness of the proposed accelerator. The proposed unified accelerator stands out as the first solution capable of achieving real-time neural rendering across varied representative pipelines on edge devices, potentially paving the way for the next generation of neural graphics applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23644v1-abstract-full').style.display = 'none'; document.getElementById('2503.23644v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by HPCA&#39;25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23625">arXiv:2503.23625</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23625">pdf</a>, <a href="https://arxiv.org/format/2503.23625">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Gaussian Blending Unit: An Edge GPU Plug-in for Real-Time Gaussian-Based Rendering in AR/VR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Z">Zhifan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yonggan Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingqun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Leshu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Sixu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Cheng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Chenxi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaojian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Prathipati%2C+S">Sreemanth Prathipati</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y+C">Yingyan Celine Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23625v1-abstract-short" style="display: inline;"> The rapidly advancing field of Augmented and Virtual Reality (AR/VR) demands real-time, photorealistic rendering on resource-constrained platforms. 3D Gaussian Splatting, delivering state-of-the-art (SOTA) performance in rendering efficiency and quality, has emerged as a promising solution across a broad spectrum of AR/VR applications. 
However, despite its effectiveness on high-end GPUs, it strugg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23625v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23625v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23625v1-abstract-full" style="display: none;"> The rapidly advancing field of Augmented and Virtual Reality (AR/VR) demands real-time, photorealistic rendering on resource-constrained platforms. 3D Gaussian Splatting, delivering state-of-the-art (SOTA) performance in rendering efficiency and quality, has emerged as a promising solution across a broad spectrum of AR/VR applications. However, despite its effectiveness on high-end GPUs, it struggles on edge systems like the Jetson Orin NX Edge GPU, achieving only 7-17 FPS -- well below the over 60 FPS standard required for truly immersive AR/VR experiences. Addressing this challenge, we perform a comprehensive analysis of Gaussian-based AR/VR applications and identify the Gaussian Blending Stage, which intensively calculates each Gaussian&#39;s contribution at every pixel, as the primary bottleneck. In response, we propose a Gaussian Blending Unit (GBU), an edge GPU plug-in module for real-time rendering in AR/VR applications. Notably, our GBU can be seamlessly integrated into conventional edge GPUs and collaboratively supports a wide range of AR/VR applications. Specifically, GBU incorporates an intra-row sequential shading (IRSS) dataflow that shades each row of pixels sequentially from left to right, utilizing a two-step coordinate transformation. When directly deployed on a GPU, the proposed dataflow achieved a non-trivial 1.72x speedup on real-world static scenes, though still falls short of real-time rendering performance. Recognizing the limited compute utilization in the GPU-based implementation, GBU enhances rendering speed with a dedicated rendering engine that balances the workload across rows by aggregating computations from multiple Gaussians. Experiments across representative AR/VR applications demonstrate that our GBU provides a unified solution for on-device real-time rendering while maintaining SOTA rendering quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23625v1-abstract-full').style.display = 'none'; document.getElementById('2503.23625v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by HPCA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23580">arXiv:2503.23580</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23580">pdf</a>, <a href="https://arxiv.org/format/2503.23580">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiT4SR: Taming Diffusion Transformer for Real-World Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Z">Zheng-Peng Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+X">Xin Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Z">Zheng Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+D">Dongqing Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jimmy Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chun-Le Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chongyi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23580v1-abstract-short" style="display: inline;"> Large-scale pre-trained diffusion models are becoming increasingly popular in solving the Real-World Image Super-Resolution (Real-ISR) problem because of their rich generative priors. The recent development of diffusion transformer (DiT) has witnessed overwhelming performance over the traditional UNet-based architecture in image generation, which also raises the question: Can we adopt the advanced&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23580v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23580v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23580v1-abstract-full" style="display: none;"> Large-scale pre-trained diffusion models are becoming increasingly popular in solving the Real-World Image Super-Resolution (Real-ISR) problem because of their rich generative priors. The recent development of diffusion transformer (DiT) has witnessed overwhelming performance over the traditional UNet-based architecture in image generation, which also raises the question: Can we adopt the advanced DiT-based diffusion model for Real-ISR? To this end, we propose our DiT4SR, one of the pioneering works to tame the large-scale DiT model for Real-ISR. Instead of directly injecting embeddings extracted from low-resolution (LR) images like ControlNet, we integrate the LR embeddings into the original attention mechanism of DiT, allowing for the bidirectional flow of information between the LR latent and the generated latent. The sufficient interaction of these two streams allows the LR stream to evolve with the diffusion process, producing progressively refined guidance that better aligns with the generated latent at each diffusion step. 
7. arXiv:2503.23580  [cs.CV]  https://arxiv.org/abs/2503.23580
   DiT4SR: Taming Diffusion Transformer for Real-World Image Super-Resolution
   Authors: Zheng-Peng Duan, Jiawei Zhang, Xin Jin, Ziheng Zhang, Zheng Xiong, Dongqing Zou, Jimmy Ren, Chun-Le Guo, Chongyi Li
   Abstract: Large-scale pre-trained diffusion models are becoming increasingly popular for solving the Real-World Image Super-Resolution (Real-ISR) problem because of their rich generative priors. The recent diffusion transformer (DiT) has shown overwhelming performance over the traditional UNet-based architecture in image generation, which raises the question: can we adopt the advanced DiT-based diffusion model for Real-ISR? To this end, we propose DiT4SR, one of the pioneering works to tame the large-scale DiT model for Real-ISR. Instead of directly injecting embeddings extracted from low-resolution (LR) images, as ControlNet does, we integrate the LR embeddings into the original attention mechanism of DiT, allowing for a bidirectional flow of information between the LR latent and the generated latent. The sufficient interaction of these two streams allows the LR stream to evolve with the diffusion process, producing progressively refined guidance that better aligns with the generated latent at each diffusion step. Additionally, the LR guidance is injected into the generated latent via a cross-stream convolution layer, compensating for DiT's limited ability to capture local information. These simple but effective designs endow the DiT model with superior performance in Real-ISR, as demonstrated by extensive experiments. Project page: https://adam-duan.github.io/projects/dit4sr/.
   Submitted 30 March, 2025; originally announced March 2025.
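The bidirectional LR/generated-latent interaction the DiT4SR abstract describes can be approximated by letting both token streams participate in a single self-attention pass. The layer below is a hypothetical simplification of that idea, not the authors' implementation.

```python
# Minimal sketch of bidirectional attention between an LR-latent stream and
# a generated-latent stream, in the spirit of the DiT4SR abstract.
import torch
import torch.nn as nn

class JointStreamAttention(nn.Module):
    def __init__(self, dim: int, heads: int = 8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, gen_tokens, lr_tokens):
        # Concatenate both streams so every token attends to every other:
        # information flows both LR -> generated and generated -> LR.
        joint = torch.cat([gen_tokens, lr_tokens], dim=1)
        out, _ = self.attn(joint, joint, joint)
        n = gen_tokens.shape[1]
        return out[:, :n], out[:, n:]   # updated generated / LR streams

if __name__ == "__main__":
    layer = JointStreamAttention(dim=256)
    gen = torch.randn(2, 64, 256)   # generated-latent tokens
    lr = torch.randn(2, 64, 256)    # LR-image tokens
    gen2, lr2 = layer(gen, lr)
    print(gen2.shape, lr2.shape)
```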
8. arXiv:2503.23534  [cs.CV, cs.AI]  https://arxiv.org/abs/2503.23534
   BiPVL-Seg: Bidirectional Progressive Vision-Language Fusion with Global-Local Alignment for Medical Image Segmentation
   Authors: Rafi Ibn Sultan, Hui Zhu, Chengyin Li, Dongxiao Zhu
   Abstract: Medical image segmentation typically relies solely on visual data, overlooking the rich textual information clinicians use for diagnosis. Vision-language models attempt to bridge this gap, but existing approaches often process visual and textual features independently, resulting in weak cross-modal alignment. Simple fusion techniques fail because of the inherent differences between spatial visual features and sequential text embeddings. Additionally, medical terminology deviates from general language, limiting the effectiveness of off-the-shelf text encoders and further hindering vision-language alignment. We propose BiPVL-Seg, an end-to-end framework that integrates vision-language fusion and embedding alignment through architectural and training innovations, where both components reinforce each other to enhance medical image segmentation. BiPVL-Seg introduces bidirectional progressive fusion in the architecture, which facilitates stage-wise information exchange between the vision and text encoders. It also incorporates global-local contrastive alignment, a training objective that enhances the text encoder's comprehension by aligning text and vision embeddings at both the class and concept levels. Extensive experiments on diverse medical imaging benchmarks across CT and MR modalities demonstrate BiPVL-Seg's superior performance over state-of-the-art methods in complex multi-class segmentation. Source code is available in this GitHub repository.
   Submitted 30 March, 2025; originally announced March 2025.
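The class-level half of a global-local contrastive alignment objective, matching each class's pooled vision feature to its class-name text embedding, can be written as a CLIP-style symmetric loss. The details below (pooling, temperature, shapes) are assumptions for illustration, not BiPVL-Seg's exact objective.

```python
# Sketch of class-level contrastive alignment between pooled vision features
# and text-encoder class embeddings. Illustrative CLIP-style loss.
import torch
import torch.nn.functional as F

def class_alignment_loss(vision_emb, text_emb, temperature: float = 0.07):
    """vision_emb, text_emb: (num_classes, dim) paired embeddings.

    Each class's pooled vision feature should match its own class-name
    text embedding (diagonal) and repel the other classes' embeddings.
    """
    v = F.normalize(vision_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    logits = v @ t.T / temperature            # (C, C) similarity matrix
    labels = torch.arange(v.shape[0])         # diagonal = positive pairs
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

if __name__ == "__main__":
    v = torch.randn(5, 128)  # e.g. pooled features for 5 organ classes
    t = torch.randn(5, 128)  # text embeddings of the 5 class names
    print(class_alignment_loss(v, t).item())
```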
9. arXiv:2503.23359  [cs.CV]  https://arxiv.org/abs/2503.23359
   VideoFusion: A Spatio-Temporal Collaborative Network for Mutli-modal Video Fusion and Restoration
   Authors: Linfeng Tang, Yeda Wang, Meiqi Gong, Zizhuo Li, Yuxin Deng, Xunpeng Yi, Chunyu Li, Han Xu, Hao Zhang, Jiayi Ma
   Abstract: Compared to images, videos better align with real-world acquisition scenarios and possess valuable temporal cues. However, existing multi-sensor fusion research predominantly integrates complementary context from multiple images rather than videos. This primarily stems from two factors: 1) the scarcity of large-scale multi-sensor video datasets, which limits research in video fusion, and 2) the inherent difficulty of jointly modeling spatial and temporal dependencies in a unified framework. This paper addresses both dilemmas. First, we construct M3SVD, a benchmark dataset with 220 temporally synchronized and spatially registered infrared-visible video pairs comprising 153,797 frames, filling the data gap for the video fusion community. Second, we propose VideoFusion, a multi-modal video fusion model that fully exploits cross-modal complementarity and temporal dynamics to generate spatio-temporally coherent videos from (potentially degraded) multi-modal inputs. Specifically, 1) a differential reinforcement module is developed for cross-modal information interaction and enhancement, 2) a complete modality-guided fusion strategy is employed to adaptively integrate multi-modal features, and 3) a bi-temporal co-attention mechanism is devised to dynamically aggregate forward-backward temporal contexts to reinforce cross-frame feature representations. Extensive experiments reveal that VideoFusion outperforms existing image-oriented fusion paradigms in sequential scenarios, effectively mitigating temporal inconsistency and interference.
   Submitted 30 March, 2025; originally announced March 2025.
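The bi-temporal co-attention idea, letting the current frame aggregate both forward and backward temporal context, can be sketched with two cross-attention calls. This is a toy version of the stated mechanism, not the paper's architecture; the fusion layer and shapes are assumptions.

```python
# Toy sketch of a bi-temporal co-attention step: the current frame's tokens
# query the previous frame (forward context) and the next frame (backward
# context), and the two aggregates are fused back into the current features.
import torch
import torch.nn as nn

class BiTemporalCoAttention(nn.Module):
    def __init__(self, dim: int, heads: int = 4):
        super().__init__()
        self.fwd = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.bwd = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.fuse = nn.Linear(2 * dim, dim)

    def forward(self, cur, prev, nxt):
        # cur/prev/nxt: (B, N, dim) token features of adjacent frames.
        f, _ = self.fwd(cur, prev, prev)   # aggregate forward context
        b, _ = self.bwd(cur, nxt, nxt)     # aggregate backward context
        return cur + self.fuse(torch.cat([f, b], dim=-1))

if __name__ == "__main__":
    layer = BiTemporalCoAttention(dim=64)
    prev, cur, nxt = (torch.randn(2, 100, 64) for _ in range(3))
    print(layer(cur, prev, nxt).shape)  # torch.Size([2, 100, 64])
```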
10. arXiv:2503.23355  [cs.CV]  https://arxiv.org/abs/2503.23355
    DSPFusion: Image Fusion via Degradation and Semantic Dual-Prior Guidance
    Authors: Linfeng Tang, Chunyu Li, Guoqing Wang, Yixuan Yuan, Jiayi Ma
    Abstract: Existing fusion methods are tailored for high-quality images but struggle with degraded images captured under harsh circumstances, limiting the practical potential of image fusion. This work presents a Degradation and Semantic Prior dual-guided framework for degraded image Fusion (DSPFusion), which uses degradation priors and high-quality scene semantic priors restored via diffusion models to guide both information recovery and fusion in a unified model. Specifically, it first individually extracts modality-specific degradation priors, while jointly capturing comprehensive low-quality semantic priors. Subsequently, a diffusion model is developed to iteratively restore high-quality semantic priors in a compact latent space, making our method over 20× faster than mainstream diffusion-model-based image fusion schemes. Finally, the degradation priors and high-quality semantic priors are employed to guide information enhancement and aggregation via the dual-prior guidance and prior-guided fusion modules. Extensive experiments demonstrate that DSPFusion mitigates most typical degradations while integrating complementary context with minimal computational cost, greatly broadening the application scope of image fusion.
    Submitted 30 March, 2025; originally announced March 2025.
11. arXiv:2503.23331  [cs.CV, cs.LG]  https://arxiv.org/abs/2503.23331
    HiPART: Hierarchical Pose AutoRegressive Transformer for Occluded 3D Human Pose Estimation
    Authors: Hongwei Zheng, Han Li, Wenrui Dai, Ziyang Zheng, Chenglin Li, Junni Zou, Hongkai Xiong
    Abstract: Existing 2D-to-3D human pose estimation (HPE) methods address the occlusion issue by enriching information, such as temporal and visual cues, in the lifting stage. In this paper, we argue that these methods ignore the limitation of the sparse-skeleton 2D input representation, which fundamentally restricts the 2D-to-3D lifting and worsens the occlusion issue. To address this, we propose a novel two-stage generative densification method, named Hierarchical Pose AutoRegressive Transformer (HiPART), to generate hierarchical 2D dense poses from the original sparse 2D pose. Specifically, we first develop a multi-scale skeleton tokenization module to quantize the highly dense 2D pose into hierarchical tokens and propose a Skeleton-aware Alignment to strengthen token connections. We then develop a Hierarchical AutoRegressive Modeling scheme for hierarchical 2D pose generation. With the generated hierarchical poses as inputs for 2D-to-3D lifting, the proposed method shows strong robustness in occluded scenarios and achieves state-of-the-art performance in single-frame 3D HPE. Moreover, it outperforms numerous multi-frame methods while reducing parameter and computational complexity, and it can also complement them to further enhance performance and robustness.
    Submitted 30 March, 2025; originally announced March 2025.
    Comments: CVPR 2025.
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23274">arXiv:2503.23274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23274">pdf</a>, <a href="https://arxiv.org/format/2503.23274">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> PromptDistill: Query-based Selective Token Retention in Intermediate Layers for Efficient Large Language Model Inference </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jin%2C+W">Weisheng Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M">Maojia Song</a>, <a href="/search/cs?searchtype=author&amp;query=Pala%2C+T+D">Tej Deep Pala</a>, <a href="/search/cs?searchtype=author&amp;query=Chia%2C+Y+K">Yew Ken Chia</a>, <a href="/search/cs?searchtype=author&amp;query=Zadeh%2C+A">Amir Zadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Poria%2C+S">Soujanya Poria</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> As large language models (LLMs) tackle increasingly complex tasks and longer documents, their computational and memory costs during inference become a major bottleneck. To address this, we propose PromptDistill, a novel, training-free method that improves inference efficiency while preserving generation quality. PromptDistill identifies and retains the most informative tokens by leveraging attention interactions in early layers, preserving their hidden states while reducing the computational burden in later layers. This allows the model to focus on essential contextual information without fully processing all tokens. Unlike previous methods such as H2O and SnapKV, which perform compression only after processing the entire input, or GemFilter, which selects a fixed portion of the initial prompt without considering contextual dependencies, PromptDistill dynamically allocates computational resources to the most relevant tokens while maintaining a global awareness of the input. Experiments with base models such as LLaMA 3.1 8B Instruct, Phi 3.5 Mini Instruct, and Qwen2 7B Instruct on benchmarks including LongBench, InfBench, and Needle in a Haystack demonstrate that PromptDistill significantly improves efficiency while having minimal impact on output quality compared to the original models. With a single-stage selection strategy, PromptDistill effectively balances performance and efficiency, outperforming prior methods like GemFilter, H2O, and SnapKV due to its superior ability to retain essential information. Specifically, compared to GemFilter, PromptDistill achieves an overall $1\%$ to $5\%$ performance improvement while also offering better time efficiency. Additionally, we explore multi-stage selection, which further improves efficiency while maintaining strong generation performance. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
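<p class="is-size-7">A hedged sketch of the core selection idea: keep the k prompt tokens that receive the most attention from the final query position in one early layer. The layer choice, head averaging, and value of k are illustrative assumptions, not the paper's tuned settings:</p>
<pre><code># Illustrative token retention based on early-layer attention weights.
import torch

def select_tokens(attn, k):
    """attn: (heads, seq, seq) attention from one early layer.
    Returns indices of the k tokens most attended by the last position."""
    scores = attn[:, -1, :].mean(dim=0)          # average heads: (seq,)
    return scores.topk(k).indices.sort().values  # keep original token order

attn = torch.rand(8, 128, 128).softmax(dim=-1)   # stand-in attention map
kept = select_tokens(attn, k=32)  # only these hidden states move to later layers
</code></pre>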
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22745">arXiv:2503.22745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.22745">pdf</a>, <a href="https://arxiv.org/format/2503.22745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Graph-Based Uncertainty-Aware Self-Training with Stochastic Node Labeling </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tom Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+A">Anna Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Self-training has become a popular semi-supervised learning technique for leveraging unlabeled data. However, the over-confidence of pseudo-labels remains a key challenge. In this paper, we propose a novel \emph{graph-based uncertainty-aware self-training} (GUST) framework to combat over-confidence in node classification. Drawing inspiration from the uncertainty integration idea introduced by Wang \emph{et al.}~\cite{wang2024uncertainty}, our method largely diverges from previous self-training approaches by focusing on \emph{stochastic node labeling} grounded in the graph topology. Specifically, we deploy a Bayesian-inspired module to estimate node-level uncertainty, incorporate these estimates into the pseudo-label generation process via an expectation-maximization (EM)-like step, and iteratively update both node embeddings and adjacency-based transformations. Experimental results on several benchmark graph datasets demonstrate that our GUST framework achieves state-of-the-art performance, especially in settings where labeled data is extremely sparse. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
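<p class="is-size-7">One way to read the uncertainty-gated pseudo-labeling step, sketched with Monte-Carlo dropout standing in for the Bayesian-inspired module; <code>model</code>, the entropy threshold, and the sample count are assumptions for illustration:</p>
<pre><code># Hedged sketch: pseudo-label only nodes whose predictive entropy is low.
import torch
import torch.nn.functional as F

def pseudo_label(model, x, edge_index, n_samples=10, max_entropy=0.5):
    """model: any dropout-bearing GNN taking (x, edge_index) -- an assumption.
    Returns (pseudo_labels, mask) where mask marks confident nodes."""
    model.train()  # keep dropout active so each pass is a stochastic draw
    probs = torch.stack([F.softmax(model(x, edge_index), dim=-1)
                         for _ in range(n_samples)]).mean(dim=0)
    entropy = -(probs * probs.clamp_min(1e-9).log()).sum(dim=-1)
    mask = entropy.lt(max_entropy)       # keep low-uncertainty nodes only
    return probs.argmax(dim=-1), mask
</code></pre>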
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22744">arXiv:2503.22744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.22744">pdf</a>, <a href="https://arxiv.org/format/2503.22744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Uncertainty-Aware Graph Self-Training with Expectation-Maximization Regularization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+E">Emily Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Michael Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we propose a novel \emph{uncertainty-aware graph self-training} approach for semi-supervised node classification. Our method introduces an Expectation-Maximization (EM) regularization scheme to incorporate an uncertainty mechanism during pseudo-label generation and model retraining. Unlike conventional graph self-training pipelines that rely on fixed pseudo-labels, our approach iteratively refines label confidences with an EM-inspired uncertainty measure. This ensures that the predictive model focuses on reliable graph regions while gradually incorporating ambiguous nodes. Inspired by prior work on uncertainty-aware self-training techniques~\cite{wang2024uncertainty}, our framework is designed to handle noisy graph structures and feature spaces more effectively. Through extensive experiments on several benchmark graph datasets, we demonstrate that our method outperforms strong baselines by a margin of up to 2.5\% in accuracy while maintaining lower variance in performance across multiple runs. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22743">arXiv:2503.22743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.22743">pdf</a>, <a href="https://arxiv.org/format/2503.22743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> Adaptive State-Space Mamba for Real-Time Sensor Data Anomaly Detection </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">Alice Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> State-space modeling has emerged as a powerful paradigm for sequence analysis in various tasks such as natural language processing, time-series forecasting, and signal processing. In this work, we propose an \emph{Adaptive State-Space Mamba} (\textbf{ASSM}) framework for real-time sensor data anomaly detection.
While state-space models have been previously employed for image processing applications (e.g., style transfer \cite{wang2024stylemamba}), our approach leverages the core idea of sequential hidden states to tackle a significantly different domain: detecting anomalies in streaming sensor data. In particular, we introduce an adaptive gating mechanism that dynamically modulates the hidden state update based on contextual and learned statistical cues. This design ensures that our model remains computationally efficient and scalable, even under rapid data arrival rates. Extensive experiments on real-world and synthetic sensor datasets demonstrate that our method achieves superior detection performance compared to existing baselines. Our approach is easily extensible to other time-series tasks that demand rapid and reliable detection capabilities. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
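<p class="is-size-7">A minimal numpy sketch of an adaptively gated state-space recurrence with a residual-based anomaly score; the gate form, matrices, and threshold are illustrative stand-ins for the learned components:</p>
<pre><code># Hypothetical gated state-space recurrence for streaming anomaly flags.
import numpy as np

def assm_scores(x, A, B, C, w_gate, thresh=3.0):
    """x: (T, d_in) sensor stream; A: (d_h, d_h); B: (d_h, d_in);
    C: (d_in, d_h); w_gate: (d_in,). Returns per-step anomaly flags."""
    h = np.zeros(A.shape[0])
    flags = []
    for x_t in x:
        g = 1.0 / (1.0 + np.exp(-w_gate @ x_t))  # context-dependent gate
        h = (1.0 - g) * (A @ h) + g * (B @ x_t)  # gated hidden-state update
        err = np.linalg.norm(x_t - C @ h)        # reconstruction residual
        flags.append(err > thresh)
    return np.array(flags)
</code></pre>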
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22475">arXiv:2503.22475</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.22475">pdf</a>, <a href="https://arxiv.org/format/2503.22475">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> DeepOFormer: Deep Operator Learning with Domain-informed Features for Fatigue Life Prediction </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kapure%2C+T+S">Tanmay Sunil Kapure</a>, <a href="/search/cs?searchtype=author&amp;query=Roy%2C+P+C">Prokash Chandra Roy</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+Z">Zhengtao Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+B">Bo Shen</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Fatigue life characterizes the duration a material can function before failure under specific environmental conditions, and is traditionally assessed using stress-life (S-N) curves. While machine learning and deep learning offer promising results for fatigue life prediction, they face the overfitting challenge because of the small size of fatigue experimental data in specific materials. To address this challenge, we propose DeepOFormer, which formulates S-N curve prediction as an operator learning problem. DeepOFormer improves the deep operator learning framework with a transformer-based encoder and a mean L2 relative error loss function. We also consider Stussi, Weibull, and Pascual and Meeker (PM) features as domain-informed features. These features are motivated by empirical fatigue models. To evaluate the performance of our DeepOFormer, we compare it with different deep learning models and XGBoost on a dataset with 54 S-N curves of aluminum alloys. With seven different aluminum alloys selected for testing, our DeepOFormer achieves an $R^2$ of 0.9515, a mean absolute error of 0.2080, and a mean relative error of 0.5077, significantly outperforming state-of-the-art deep/machine learning methods including DeepONet, TabTransformer, and XGBoost. The results highlight that our DeepOFormer, integrated with domain-informed features, substantially improves prediction accuracy and generalization capabilities for fatigue life prediction in aluminum alloys. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures</span> </p>
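<p class="is-size-7">The mean L2 relative error loss mentioned above is simple to state; a sketch, with the epsilon guard being our addition for numerical stability rather than a detail from the abstract:</p>
<pre><code># Mean L2 relative error over a batch of predicted S-N curve samples.
import torch

def mean_l2_relative_error(pred, target, eps=1e-8):
    """pred, target: (batch, n_points) tensors of curve values."""
    num = torch.linalg.norm(pred - target, dim=-1)
    den = torch.linalg.norm(target, dim=-1).clamp_min(eps)
    return (num / den).mean()
</code></pre>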
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21802">arXiv:2503.21802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21802">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Structured and sparse partial least squares coherence for multivariate cortico-muscular analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jingyao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qilu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+D">Di Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+S">Shijie Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+X">Xiaoxue Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruimou Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+P">Ping-Ju Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhibin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yu Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+L">Linhong Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21802v1-abstract-short" style="display: inline;"> Multivariate cortico-muscular analysis has recently emerged as a promising approach for evaluating the corticospinal neural pathway. However, current multivariate approaches encounter challenges such as high dimensionality and limited sample sizes, thus restricting their further applications. In this paper, we propose a structured and sparse partial least squares coherence algorithm (ssPLSC) to ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21802v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21802v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21802v1-abstract-full" style="display: none;"> Multivariate cortico-muscular analysis has recently emerged as a promising approach for evaluating the corticospinal neural pathway. However, current multivariate approaches encounter challenges such as high dimensionality and limited sample sizes, thus restricting their further applications. In this paper, we propose a structured and sparse partial least squares coherence algorithm (ssPLSC) to extract shared latent space representations related to cortico-muscular interactions. 
Our approach leverages an embedded optimization framework that integrates a partial least squares (PLS)-based objective function, a sparsity constraint, and a connectivity-based structured constraint, jointly addressing generalizability, interpretability, and spatial structure. To solve the optimization problem, we develop an efficient alternating iterative algorithm within a unified framework and demonstrate its convergence empirically. Extensive experimental results from one synthetic and several real-world datasets demonstrate that ssPLSC achieves competitive or better performance than representative multivariate cortico-muscular fusion methods, particularly in scenarios characterized by limited sample sizes and high noise levels. This study provides a novel multivariate fusion method for cortico-muscular analysis, offering a transformative tool for the evaluation of corticospinal pathway integrity in neurological disorders. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p>
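<p class="is-size-7">To make the alternating scheme concrete, a simplified sparse-PLS power iteration with an L1 soft-threshold; the connectivity-based structured constraint is omitted, so this is only a schematic stand-in for the ssPLSC updates:</p>
<pre><code># Schematic sparse PLS weight extraction via alternating updates.
import numpy as np

def soft_threshold(v, lam):
    return np.sign(v) * np.maximum(np.abs(v) - lam, 0.0)

def sparse_pls_weights(Cxy, lam=0.1, iters=50):
    """Cxy: (p, q) cross-covariance between cortical and muscular features."""
    u = np.ones(Cxy.shape[0]) / np.sqrt(Cxy.shape[0])
    for _ in range(iters):
        v = Cxy.T @ u
        v /= np.linalg.norm(v) + 1e-12
        u = soft_threshold(Cxy @ v, lam)   # sparsity on one weight vector
        u /= np.linalg.norm(u) + 1e-12
    return u, v
</code></pre>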
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21779">arXiv:2503.21779</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21779">pdf</a>, <a href="https://arxiv.org/format/2503.21779">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> X$^{2}$-Gaussian: 4D Radiative Gaussian Splatting for Continuous-time Tomographic Reconstruction </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+R">Ruyi Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Zhiwen Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yixuan Yuan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Four-dimensional computed tomography (4D CT) reconstruction is crucial for capturing dynamic anatomical changes but faces inherent limitations from conventional phase-binning workflows. Current methods discretize temporal resolution into fixed phases with respiratory gating devices, introducing motion misalignment and restricting clinical practicality. In this paper, we propose X$^2$-Gaussian, a novel framework that enables continuous-time 4D-CT reconstruction by integrating dynamic radiative Gaussian splatting with self-supervised respiratory motion learning. Our approach models anatomical dynamics through a spatiotemporal encoder-decoder architecture that predicts time-varying Gaussian deformations, eliminating phase discretization. To remove dependency on external gating devices, we introduce a physiology-driven periodic consistency loss that learns patient-specific breathing cycles directly from projections via differentiable optimization. Extensive experiments demonstrate state-of-the-art performance, achieving a 9.93 dB PSNR gain over traditional methods and a 2.25 dB improvement against prior Gaussian splatting techniques. By unifying continuous motion modeling with hardware-free period learning, X$^2$-Gaussian advances high-fidelity 4D CT reconstruction for dynamic clinical imaging. Project website at: <a href="https://x2-gaussian.github.io/">https://x2-gaussian.github.io/</a>. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: <a href="https://x2-gaussian.github.io/">https://x2-gaussian.github.io/</a></span> </p>
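<p class="is-size-7">A hedged sketch of what a periodic consistency loss can look like: deformations one learned breathing period apart should agree. Here <code>deform</code> is a placeholder for the model's time-conditioned deformation field and <code>T</code> a learnable period, both assumptions for illustration:</p>
<pre><code># Penalize disagreement between deformations separated by one period T.
import torch

def periodic_consistency_loss(deform, t, T):
    """deform: callable mapping times (N,) to deformations; T: learnable scalar."""
    return (deform(t) - deform(t + T)).pow(2).mean()

# Toy usage with a stand-in deformation field.
T = torch.tensor(4.0, requires_grad=True)      # initial period guess (seconds)
deform = lambda t: torch.sin(2 * torch.pi * t / 4.0).unsqueeze(-1)
loss = periodic_consistency_loss(deform, torch.linspace(0, 8, 64), T)
loss.backward()  # gradients flow into T, so the period itself is learned
</code></pre>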
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://x2-gaussian.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21141">arXiv:2503.21141</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21141">pdf</a>, <a href="https://arxiv.org/format/2503.21141">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Safe Human Robot Navigation in Warehouse Scenario </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Farrell%2C+S">Seth Farrell</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenghao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Hongzhan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yoshimitsu%2C+R">Ryo Yoshimitsu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+S">Sicun Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Christensen%2C+H+I">Henrik I. Christensen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21141v1-abstract-short" style="display: inline;"> The integration of autonomous mobile robots (AMRs) in industrial environments, particularly warehouses, has revolutionized logistics and operational efficiency. However, ensuring the safety of human workers in dynamic, shared spaces remains a critical challenge. This work proposes a novel methodology that leverages control barrier functions (CBFs) to enhance safety in warehouse navigation. By inte&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21141v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21141v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21141v1-abstract-full" style="display: none;"> The integration of autonomous mobile robots (AMRs) in industrial environments, particularly warehouses, has revolutionized logistics and operational efficiency. However, ensuring the safety of human workers in dynamic, shared spaces remains a critical challenge. This work proposes a novel methodology that leverages control barrier functions (CBFs) to enhance safety in warehouse navigation. By integrating learning-based CBFs with the Open Robotics Middleware Framework (OpenRMF), the system achieves adaptive and safety-enhanced controls in multi-robot, multi-agent scenarios. Experiments conducted using various robot platforms demonstrate the efficacy of the proposed approach in avoiding static and dynamic obstacles, including human pedestrians. Our experiments evaluate different scenarios in which the number of robots, robot platforms, speed, and number of obstacles are varied, from which we achieve promising performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21141v1-abstract-full').style.display = 'none'; document.getElementById('2503.21141v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21127">arXiv:2503.21127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21127">pdf</a>, <a href="https://arxiv.org/format/2503.21127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Collaborative Evolution: Multi-Round Learning Between Large and Small Language Models for Emergent Fake News Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Ziyi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+S">Shenghan Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Litian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaozhuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21127v1-abstract-short" style="display: inline;"> The proliferation of fake news on social media platforms has exerted a substantial influence on society, leading to discernible impacts and deleterious consequences. Conventional deep learning methodologies employing small language models (SLMs) suffer from the necessity for extensive supervised training and the challenge of adapting to rapidly evolving circumstances. Large language models (LLMs),&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21127v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21127v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21127v1-abstract-full" style="display: none;"> The proliferation of fake news on social media platforms has exerted a substantial influence on society, leading to discernible impacts and deleterious consequences. Conventional deep learning methodologies employing small language models (SLMs) suffer from the necessity for extensive supervised training and the challenge of adapting to rapidly evolving circumstances. Large language models (LLMs), despite their robust zero-shot capabilities, have fallen short in effectively identifying fake news due to a lack of pertinent demonstrations and the dynamic nature of knowledge. In this paper, a novel framework Multi-Round Collaboration Detection (MRCD) is proposed to address these aforementioned limitations. The MRCD framework is capable of enjoying the merits from both LLMs and SLMs by integrating their generalization abilities and specialized functionalities, respectively. 
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21127">arXiv:2503.21127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21127">pdf</a>, <a href="https://arxiv.org/format/2503.21127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div>
<p class="title is-5 mathjax"> Collaborative Evolution: Multi-Round Learning Between Large and Small Language Models for Emergent Fake News Detection </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Ziyi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+S">Shenghan Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Litian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaozhuo Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The proliferation of fake news on social media platforms has exerted a substantial influence on society, leading to discernible impacts and deleterious consequences. Conventional deep learning methodologies employing small language models (SLMs) suffer from the necessity for extensive supervised training and the challenge of adapting to rapidly evolving circumstances. Large language models (LLMs), despite their robust zero-shot capabilities, have fallen short in effectively identifying fake news due to a lack of pertinent demonstrations and the dynamic nature of knowledge. In this paper, a novel framework, Multi-Round Collaboration Detection (MRCD), is proposed to address these limitations. The MRCD framework draws on the merits of both LLMs and SLMs by integrating their generalization abilities and specialized functionalities, respectively. Our approach features a two-stage retrieval module that selects relevant and up-to-date demonstrations and knowledge, enhancing in-context learning for better detection of emerging news events. We further design a multi-round learning framework to ensure more reliable detection results. Our framework MRCD achieves SOTA results on two real-world datasets, Pheme and Twitter16, with accuracy improvements of 7.4\% and 12.8\% compared to using only SLMs, which effectively addresses the limitations of current models and improves the detection of emergent fake news. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21056">arXiv:2503.21056</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21056">pdf</a>, <a href="https://arxiv.org/format/2503.21056">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div>
<p class="title is-5 mathjax"> Online Reasoning Video Segmentation with Just-in-Time Digital Twins </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yiqing Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bohan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenjia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Seenivasan%2C+L">Lalithkumar Seenivasan</a>, <a href="/search/cs?searchtype=author&amp;query=Unberath%2C+M">Mathias Unberath</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Reasoning segmentation (RS) aims to identify and segment objects of interest based on implicit text queries. As such, RS is a catalyst for embodied AI agents, enabling them to interpret high-level commands without requiring explicit step-by-step guidance.
However, current RS approaches rely heavily on the visual perception capabilities of multimodal large language models (LLMs), leading to several major limitations. First, they struggle with queries that require multiple steps of reasoning or that involve complex spatial/temporal relationships. Second, they necessitate LLM fine-tuning, which may require frequent updates to maintain compatibility with contemporary LLMs and may increase the risk of catastrophic forgetting during fine-tuning. Finally, being primarily designed for static images or offline video processing, they scale poorly to online video data. To address these limitations, we propose an agent framework that disentangles perception and reasoning for online video RS without LLM fine-tuning. Our innovation is the introduction of a just-in-time digital twin concept, where -- given an implicit query -- an LLM plans the construction of a low-level scene representation from high-level video using specialist vision models. We refer to this approach to creating a digital twin as &#34;just-in-time&#34; because the LLM planner anticipates the need for specific information and requests only this limited subset instead of always evaluating every specialist model. The LLM then performs reasoning on this digital twin representation to identify target objects. To evaluate our approach, we introduce a new comprehensive video reasoning segmentation benchmark comprising 200 videos with 895 implicit text queries. The benchmark spans three reasoning categories (semantic, spatial, and temporal) with three levels of reasoning-chain complexity. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21054">arXiv:2503.21054</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21054">pdf</a>, <a href="https://arxiv.org/format/2503.21054">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Operating Room Workflow Analysis via Reasoning Segmentation over Digital Twins </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yiqing Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenjia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bohan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cheng-Yi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Porras%2C+T">Tito Porras</a>, <a href="/search/cs?searchtype=author&amp;query=Unberath%2C+M">Mathias Unberath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21054v1-abstract-short" style="display: inline;"> Analyzing operating room (OR) workflows to derive quantitative insights into OR efficiency is important for hospitals to maximize patient care and financial sustainability. Prior work on OR-level workflow analysis has relied on end-to-end deep neural networks. While these approaches work well in constrained settings, they are limited to the conditions specified at development time and do not offer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21054v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21054v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21054v1-abstract-full" style="display: none;"> Analyzing operating room (OR) workflows to derive quantitative insights into OR efficiency is important for hospitals to maximize patient care and financial sustainability. Prior work on OR-level workflow analysis has relied on end-to-end deep neural networks. While these approaches work well in constrained settings, they are limited to the conditions specified at development time and do not offer the flexibility necessary to accommodate the OR workflow analysis needs of various OR scenarios (e.g., large academic center vs. rural provider) without data collection, annotation, and retraining. Reasoning segmentation (RS) based on foundation models offers this flexibility by enabling automated analysis of OR workflows from OR video feeds given only an implicit text query related to the objects of interest. Due to the reliance on large language model (LLM) fine-tuning, current RS approaches struggle with reasoning about semantic/spatial relationships and show limited generalization to OR video due to variations in visual characteristics and domain-specific terminology. To address these limitations, we first propose a novel digital twin (DT) representation that preserves both semantic and spatial relationships between the various OR components. 
Then, building on this foundation, we propose ORDiRS (Operating Room Digital twin representation for Reasoning Segmentation), an LLM-tuning-free RS framework that reformulates RS into a &#34;reason-retrieval-synthesize&#34; paradigm. Finally, we present ORDiRS-Agent, an LLM-based agent that decomposes OR workflow analysis queries into manageable RS sub-queries and generates responses by combining detailed textual explanations with supporting visual evidence from RS. Experimental results on both an in-house and a public OR dataset demonstrate that our ORDiRS achieves a cIoU improvement of 6.12%-9.74% over existing state-of-the-art methods. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20982">arXiv:2503.20982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20982">pdf</a>, <a href="https://arxiv.org/ps/2503.20982">ps</a>, <a href="https://arxiv.org/format/2503.20982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Number Theory">math.NT</span> </div> </div>
<p class="title is-5 mathjax"> Permutation polynomials over finite fields from low-degree rational functions </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+K">Kirpa Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Hasan%2C+S+U">Sartaj Ul Hasan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunlei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+H">Hridesh Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Pal%2C+M">Mohit Pal</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper considers permutation polynomials over the finite field $F_{q^2}$ in even characteristic by utilizing low-degree permutation rational functions over $F_q$.
As a result, we obtain two classes of permutation binomials and six classes of permutation pentanomials over $F_{q^2}$. Additionally, we show that the obtained binomials and pentanomials are quasi-multiplicative inequivalent to the known ones in the literature. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages</span> </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 11T06; 12E20 </p>
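<p class="is-size-7">The defining property is easy to test by brute force on small fields. A toy check over a prime field GF(p) (the paper itself works over even-characteristic fields $F_{q^2}$, so this illustrates only the permutation property, not the constructions):</p>
<pre><code># Brute-force check that f permutes GF(p): the image must have p elements.
def is_permutation_poly(coeffs, p):
    """coeffs: [c0, c1, ...] for f(x) = c0 + c1*x + ... over GF(p)."""
    f = lambda x: sum(c * pow(x, i, p) for i, c in enumerate(coeffs)) % p
    return len({f(x) for x in range(p)}) == p

print(is_permutation_poly([0, 1, 0, 1], 7))   # x + x^3 over GF(7): False
print(is_permutation_poly([0, 0, 0, 1], 11))  # x^3 over GF(11): True
</code></pre>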
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20802">arXiv:2503.20802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20802">pdf</a>, <a href="https://arxiv.org/format/2503.20802">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> CEFW: A Comprehensive Evaluation Framework for Watermark in Large Language Models </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shuhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+B">Bo Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jiale Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuli Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhixuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changbao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+P">Pingli Gu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Text watermarking provides an effective solution for identifying synthetic text generated by large language models. However, existing techniques often focus on satisfying specific criteria while ignoring other key aspects, and they lack a unified evaluation. To fill this gap, we propose the Comprehensive Evaluation Framework for Watermark (CEFW), a unified framework that evaluates watermarking methods across five key dimensions: ease of detection, fidelity of text quality, minimal embedding cost, robustness to adversarial attacks, and imperceptibility to prevent imitation or forgery. By assessing watermarks against all of these criteria, CEFW offers a thorough evaluation of their practicality and effectiveness. Moreover, we introduce a simple and effective watermarking method called Balanced Watermark (BW), which guarantees robustness and imperceptibility by balancing the way watermark information is added. Extensive experiments show that BW outperforms existing methods in overall performance across all evaluation dimensions. We release our code to the community for future research: <a href="https://github.com/DrankXs/BalancedWatermark">https://github.com/DrankXs/BalancedWatermark</a>. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
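<p class="is-size-7">On the &#34;ease of detection&#34; dimension, a generic green-list detection sketch of the kind such frameworks score: a z-test on how many generated tokens fall in a keyed &#34;green&#34; vocabulary subset. This illustrates detection in general, not the Balanced Watermark construction itself:</p>
<pre><code># Generic watermark detection: z-score of the green-token fraction.
import math

def detection_z_score(tokens, green_ids, gamma=0.5):
    """tokens: generated token ids; green_ids: keyed green-list id set;
    gamma: expected green fraction for unwatermarked text."""
    hits = sum(t in green_ids for t in tokens)
    n = len(tokens)
    return (hits - gamma * n) / math.sqrt(n * gamma * (1.0 - gamma))

# A large z-score (e.g. above 4) suggests the text carries the watermark.
</code></pre>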
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20483">arXiv:2503.20483</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20483">pdf</a>, <a href="https://arxiv.org/format/2503.20483">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> Dissecting and Mitigating Diffusion Bias via Mechanistic Interpretability </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yingdong Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yongxiang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+A">Anqi Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sibei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jingyi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+K">Kan Ren</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Diffusion models have demonstrated impressive capabilities in synthesizing diverse content. However, despite their high-quality outputs, these models often perpetuate social biases, including those related to gender and race. These biases can potentially contribute to harmful real-world consequences, reinforcing stereotypes and exacerbating inequalities in various social contexts. While existing research on diffusion bias mitigation has predominantly focused on guiding content generation, it often neglects the intrinsic mechanisms within diffusion models that causally drive biased outputs. In this paper, we investigate the internal processes of diffusion models, identifying specific decision-making mechanisms, termed bias features, embedded within the model architecture. By directly manipulating these features, our method precisely isolates and adjusts the elements responsible for bias generation, permitting granular control over the bias levels in the generated content. Through experiments on both unconditional and conditional diffusion models across various social bias attributes, we demonstrate our method&#39;s efficacy in managing the generation distribution while preserving image quality. We also dissect the discovered model mechanism, revealing different intrinsic features that control fine-grained aspects of generation, encouraging further research on the mechanistic interpretability of diffusion models. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025; Project Page: <a href="https://foundation-model-research.github.io/difflens">https://foundation-model-research.github.io/difflens</a></span> </p>
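<p class="is-size-7">A hedged sketch of what &#34;directly manipulating&#34; an internal feature can look like: rescaling the component of an activation along a discovered direction. The hook point and the direction vector are assumptions for illustration, not the paper's discovered features:</p>
<pre><code># Scale an activation's component along a unit-norm "bias direction" v.
import torch

def adjust_feature(h, v, strength):
    """h: (..., d) activations; v: (d,) unit-norm direction.
    strength=0.0 removes the component, 1.0 is identity, larger amplifies."""
    coeff = (h * v).sum(dim=-1, keepdim=True)  # projection coefficient onto v
    return h + (strength - 1.0) * coeff * v
</code></pre>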
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025; Project Page: https://foundation-model-research.github.io/difflens</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20382">arXiv:2503.20382</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20382">pdf</a>, <a href="https://arxiv.org/format/2503.20382">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RSRWKV: A Linear-Complexity 2D Attention Mechanism for Efficient Remote Sensing Vision Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunshan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaofei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+D">Dianhui Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20382v1-abstract-short" style="display: inline;"> High-resolution remote sensing analysis faces challenges in global context modeling due to scene complexity and scale diversity. While CNNs excel at local feature extraction via parameter sharing, their fixed receptive fields fundamentally restrict long-range dependency modeling. Vision Transformers (ViTs) effectively capture global semantic relationships through self-attention mechanisms but suff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20382v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20382v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20382v1-abstract-full" style="display: none;"> High-resolution remote sensing analysis faces challenges in global context modeling due to scene complexity and scale diversity. While CNNs excel at local feature extraction via parameter sharing, their fixed receptive fields fundamentally restrict long-range dependency modeling. Vision Transformers (ViTs) effectively capture global semantic relationships through self-attention mechanisms but suffer from quadratic computational complexity relative to image resolution, creating critical efficiency bottlenecks for high-resolution imagery. The RWKV model&#39;s linear-complexity sequence modeling achieves breakthroughs in NLP but exhibits anisotropic limitations in vision tasks due to its 1D scanning mechanism. To address these challenges, we propose RSRWKV, featuring a novel 2D-WKV scanning mechanism that bridges sequential processing and 2D spatial reasoning while maintaining linear complexity. This enables isotropic context aggregation across multiple directions. The MVC-Shift module enhances multi-scale receptive field coverage, while the ECA module strengthens cross-channel feature interaction and semantic saliency modeling. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20377">arXiv:2503.20377</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20377">pdf</a>, <a href="https://arxiv.org/format/2503.20377">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> UB-Mesh: a Hierarchically Localized nD-FullMesh Datacenter Network Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liao%2C+H">Heng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bingyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xianping Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhigang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+C">Chuanning Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianbing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiangyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+P">Peng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+R">Rui Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenjie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhe Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gai%2C+Y">Yuhang Gai</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+C">Cunle Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yi Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zhongwu Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jing Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuli Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+W">Wenhua Du</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shizhong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chungang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+L">Liudong Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhou Yu</a> , et al.
(9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.20377v1-abstract-full"> As Large-scale Language Models (LLMs) continue to scale, the requisite computational power and bandwidth escalate. To address this, we introduce UB-Mesh, a novel AI datacenter network architecture designed to enhance scalability, performance, cost-efficiency and availability. Unlike traditional datacenters that provide symmetrical node-to-node bandwidth, UB-Mesh employs a hierarchically localized nD-FullMesh network topology. This design fully leverages the data locality of LLM training, prioritizing short-range, direct interconnects to minimize data movement distance and reduce switch usage. Although UB-Mesh's nD-FullMesh topology offers several theoretical advantages, its concrete architecture design, physical implementation and networking system optimization present new challenges. For the actual construction of UB-Mesh, we first design the UB-Mesh-Pod architecture, which is based on a 4D-FullMesh topology. UB-Mesh-Pod is implemented via a suite of hardware components that serve as the foundational building blocks, including specifically-designed NPU, CPU, Low-Radix-Switch (LRS), High-Radix-Switch (HRS), NICs and others. These components are interconnected via a novel Unified Bus (UB) technique, which enables flexible IO bandwidth allocation and hardware resource pooling. For networking system optimization, we propose an advanced routing mechanism named All-Path-Routing (APR) to efficiently manage data traffic. These optimizations, combined with topology-aware performance enhancements and robust reliability measures such as a 64+1 backup design, result in 2.04x higher cost-efficiency and 7.2% higher network availability compared to the traditional Clos architecture, as well as 95%+ linearity in various LLM training tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
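<p class="is-size-7">A sketch of the nD-FullMesh neighborhood rule as this abstract describes it: nodes are addressed by n-dimensional coordinates, and within each dimension the nodes sharing all other coordinates are directly linked. The 4D pod shape below is an assumed illustration, not the actual UB-Mesh-Pod geometry.</p>
<pre><code>def full_mesh_neighbors(coord, shape):
    """Direct neighbors of `coord`: all nodes differing in exactly one axis."""
    nbrs = []
    for axis, size in enumerate(shape):
        for v in range(size):
            if v != coord[axis]:
                nbr = list(coord)
                nbr[axis] = v
                nbrs.append(tuple(nbr))
    return nbrs

shape = (2, 2, 4, 4)  # hypothetical 4D-FullMesh pod of 64 NPUs
n = full_mesh_neighbors((0, 0, 0, 0), shape)
# Each node keeps short, direct links: (2-1)+(2-1)+(4-1)+(4-1) = 8 neighbors,
# instead of switched paths to all 63 other nodes.
print(len(n))
assert all(sum(a != b for a, b in zip((0, 0, 0, 0), x)) == 1 for x in n)
</code></pre>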
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20275">arXiv:2503.20275</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20275">pdf</a>, <a href="https://arxiv.org/format/2503.20275">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Survey of Disaggregated Memory: Cross-layer Technique Insights for Next-Generation Datacenters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Taolei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jinyang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hanzhang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuansun%2C+Y">Yiming Zhuansun</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+M">Minyi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.20275v2-abstract-full"> The growing scale of data requires efficient memory subsystems with large memory capacity and high memory performance. Disaggregated architecture has become a promising solution for today's cloud and edge computing thanks to its scalability and elasticity. As a critical part of disaggregation, disaggregated memory faces design challenges in many dimensions, including hardware scalability, architectural structure, software system design, application programmability, resource allocation, and power management. These challenges inspire a number of novel solutions at different system levels to improve system efficiency. In this paper, we provide a comprehensive review of disaggregated memory, covering the methodology and technologies of disaggregated memory system foundation, optimization, and management. We study the technical essentials of disaggregated memory systems and analyze them from the hardware, architecture, system, and application levels. Then, we compare the design details of typical cross-layer designs on disaggregated memory. Finally, we discuss the challenges and opportunities of future disaggregated memory work in serving next-generation elastic and efficient datacenters.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20104">arXiv:2503.20104</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20104">pdf</a>, <a href="https://arxiv.org/format/2503.20104">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> &#34;Is There Anything Else?&#34;: Examining Administrator Influence on Linguistic Features from the Cookie Theft Picture Description Cognitive Test </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changye Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+Z">Zhecheng Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+T">Trevor Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Pakhomov%2C+S">Serguei Pakhomov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.20104v1-abstract-full"> Alzheimer's Disease (AD) dementia is a progressive neurodegenerative disease that negatively impacts patients' cognitive ability. Previous studies have demonstrated that changes in naturalistic language samples can be useful for early screening of AD dementia. However, the nature of language deficits often requires test administrators to use various speech elicitation techniques during spontaneous language assessments to obtain enough propositional utterances from dementia patients. This can introduce an 'observer's effect' into the downstream analysis that has not been fully investigated. Our study seeks to quantify the influence of test administrators on linguistic features in dementia assessment using two English 'Cookie Theft' picture description corpora, collected at different locations and with test administrators showing different levels of involvement.
Our results show that the level of test administrator involvement significantly impacts the observed linguistic features in patient speech. These results suggest that many of the significant linguistic features in the downstream classification task may be partially attributable to differences in test administration practices rather than solely to participants' cognitive status. The variations in test administrator behavior can lead to systematic biases in linguistic data, potentially confounding research outcomes and clinical assessments. Our study suggests the need for a more standardized test administration protocol in the development of responsible clinical speech analytics frameworks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CMCL 2025 workshop, co-located with NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20103">arXiv:2503.20103</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20103">pdf</a>, <a href="https://arxiv.org/format/2503.20103">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bigger But Not Better: Small Neural Language Models Outperform Large Language Models in Detection of Thought Disorder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changye Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+W">Weizhe Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Pakhomov%2C+S">Serguei Pakhomov</a>, <a href="/search/cs?searchtype=author&amp;query=Bradley%2C+E">Ellen Bradley</a>, <a href="/search/cs?searchtype=author&amp;query=Ben-Zeev%2C+D">Dror Ben-Zeev</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+T">Trevor Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.20103v1-abstract-full"> Disorganized thinking is a key diagnostic indicator of schizophrenia-spectrum disorders. Recently, clinical estimates of the severity of disorganized thinking have been shown to correlate with measures of how difficult speech transcripts would be for large language models (LLMs) to predict.
However, LLMs' deployment challenges -- including privacy concerns, computational and financial costs, and lack of transparency of training data -- limit their clinical utility. We investigate whether smaller neural language models can serve as effective alternatives for detecting positive formal thought disorder, using the same sliding-window-based perplexity measurements that proved effective with larger models. Surprisingly, our results show that smaller models are more sensitive to linguistic differences associated with formal thought disorder than their larger counterparts. Detection capability declines beyond a certain model size and context length, challenging the common assumption that 'bigger is better' for LLM-based applications. Our findings generalize across audio diaries and clinical interview speech samples from individuals with psychotic symptoms, suggesting a promising direction for developing efficient, cost-effective, and privacy-preserving screening tools that can be deployed in both clinical and naturalistic settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
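<p class="is-size-7">A minimal sketch of sliding-window perplexity measurement with a small causal LM, in the spirit of the method named above. The model choice, window size, and stride are illustrative, not the paper's configuration.</p>
<pre><code>import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def window_perplexities(text, window=64, stride=32):
    ids = tok(text, return_tensors="pt").input_ids[0]
    ppls = []
    for start in range(0, max(1, ids.numel() - window + 1), stride):
        chunk = ids[start:start + window].unsqueeze(0)
        with torch.no_grad():
            loss = model(chunk, labels=chunk).loss  # mean NLL over the window
        ppls.append(math.exp(loss.item()))
    return ppls

# Higher or more variable window perplexity is the kind of signal the study
# relates to disorganized speech; this is only the measurement scaffold.
print(window_perplexities("I think the radio is talking to the weather in my hands."))
</code></pre>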
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CL Psych 2025 workshop, co-located with NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19486">arXiv:2503.19486</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19486">pdf</a>, <a href="https://arxiv.org/format/2503.19486">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Disentangled and Controllable Human Image Synthesis: From End-to-End to Stage-by-Stage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhengwentai Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Heyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xihe Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+K">Keru Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+S">Shuliang Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Zhi%2C+Y">Yihao Zhi</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+H">Hongjie Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenghong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+S">Shuguang Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaoguang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.19486v1-abstract-full"> Achieving fine-grained controllability in human image synthesis is a long-standing challenge in computer vision. Existing methods primarily focus on either facial synthesis or near-frontal body generation, with limited ability to simultaneously control key factors such as viewpoint, pose, clothing, and identity in a disentangled manner. In this paper, we introduce a new disentangled and controllable human synthesis task, which explicitly separates and manipulates these four factors within a unified framework. We first develop an end-to-end generative model trained on MVHumanNet for factor disentanglement. However, the domain gap between MVHumanNet and in-the-wild data produces unsatisfactory results, motivating the exploration of a virtual try-on (VTON) dataset as a potential solution.
Through experiments, we observe that simply incorporating the VTON dataset as additional data to train the end-to-end model degrades performance, primarily due to the inconsistency in data forms between the two datasets, which disrupts the disentanglement process. To better leverage both datasets, we propose a stage-by-stage framework that decomposes human image generation into three sequential steps: clothed A-pose generation, back-view synthesis, and pose and view control. This structured pipeline enables better dataset utilization at different stages, significantly improving controllability and generalization, especially for in-the-wild scenarios. Extensive experiments demonstrate that our stage-by-stage approach outperforms end-to-end models in both visual fidelity and disentanglement quality, offering a scalable solution for real-world tasks. Additional demos are available on the project page: https://taited.github.io/discohuman-project/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
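<p class="is-size-7">A skeleton of the three-stage decomposition named above. Each stage is a placeholder stub with assumed signatures; the real models and conditioning signals are not specified in this listing. It shows only how the four factors stay separated across stages.</p>
<pre><code>def generate_clothed_apose(identity, clothing):
    """Stage 1: synthesize the person in a canonical A-pose (stub)."""
    return {"identity": identity, "clothing": clothing, "pose": "A-pose"}

def synthesize_back_view(front):
    """Stage 2: hallucinate the unseen back view from the front (stub)."""
    return dict(front, view="front+back")

def control_pose_and_view(person, pose, viewpoint):
    """Stage 3: re-pose and re-project to the requested camera (stub)."""
    return dict(person, pose=pose, view=viewpoint)

# Identity, clothing, pose, and viewpoint remain separate, swappable inputs,
# which is the disentanglement the paper targets.
person = generate_clothed_apose(identity="id_042", clothing="vton_dress_7")
person = synthesize_back_view(person)
image = control_pose_and_view(person, pose="walking", viewpoint="side")
print(image)
</code></pre>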
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19397">arXiv:2503.19397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19397">pdf</a>, <a href="https://arxiv.org/format/2503.19397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Quality-focused Active Adversarial Policy for Safe Grasping in Human-Robot Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenghao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Beuran%2C+R">Razvan Beuran</a>, <a href="/search/cs?searchtype=author&amp;query=Chong%2C+N+Y">Nak Young Chong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.19397v1-abstract-full"> Vision-guided robot grasping methods based on Deep Neural Networks (DNNs) have achieved remarkable success in handling unknown objects, attributable to their powerful generalizability. However, this generalizability also leads these methods to recognize the human hand and its adjacent objects as graspable targets, compromising safety during Human-Robot Interaction (HRI). In this work, we propose the Quality-focused Active Adversarial Policy (QFAAP) to solve this problem. Specifically, the first part is the Adversarial Quality Patch (AQP), wherein we design an adversarial quality patch loss and leverage a grasp dataset to optimize a patch with high quality scores. Next, we construct the Projected Quality Gradient Descent (PQGD), integrating it with the AQP and constraining optimization to the hand region within each real-time frame, which endows the AQP with fast adaptability to the human hand shape. Through AQP and PQGD, the hand becomes actively adversarial to the surrounding objects, lowering their quality scores. Further setting the quality score of the hand itself to zero then reduces the grasping priority of both the hand and its adjacent objects, enabling the robot to grasp other objects away from the hand without emergency stops. We conduct extensive experiments on benchmark datasets and a cobot, showing the effectiveness of QFAAP. Our code and demo videos are available here: https://github.com/clee-jaist/QFAAP. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
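<p class="is-size-7">A sketch of the two ingredients the abstract names, with a toy stand-in for a grasp-quality network: first optimize a patch toward high quality scores, then zero out hand-region quality at run time. All sizes, the network, and the loss are illustrative assumptions, not the paper's implementation.</p>
<pre><code>import torch
import torch.nn as nn

quality_net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                            nn.Conv2d(8, 1, 1))          # toy per-pixel quality map

patch = torch.rand(3, 16, 16, requires_grad=True)        # adversarial quality patch
opt = torch.optim.Adam([patch], lr=0.05)

for _ in range(100):                                      # AQP-style optimization
    img = torch.rand(1, 3, 64, 64)                        # random scene (placeholder)
    img[:, :, 24:40, 24:40] = patch                       # paste patch into scene
    score = quality_net(img)[:, :, 24:40, 24:40].mean()
    opt.zero_grad()
    (-score).backward()                                   # raise the patch's quality
    opt.step()
    with torch.no_grad():
        patch.clamp_(0, 1)                                # keep a valid image patch

# Run time: zero quality in the hand region so neither the hand nor the
# objects adjacent to it get grasping priority.
quality = quality_net(torch.rand(1, 3, 64, 64))
hand_mask = torch.zeros(1, 1, 64, 64)
hand_mask[:, :, 20:44, 20:44] = 1
quality = quality * (1 - hand_mask)
</code></pre>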
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19353">arXiv:2503.19353</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19353">pdf</a>, <a href="https://arxiv.org/format/2503.19353">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> QUAD: Quantization and Parameter-Efficient Tuning of LLM with Activation Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yuxuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaodong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cuiping Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.19353v1-abstract-full"> Large Language Models (LLMs) excel in diverse applications but suffer inefficiency due to massive scale. While quantization reduces computational costs, existing methods degrade accuracy in medium-sized LLMs (e.g., Llama-3-8B) due to activation outliers. To address this, we propose QUAD (Quantization with Activation Decomposition), a framework leveraging Singular Value Decomposition (SVD) to suppress activation outliers for effective 4-bit quantization. QUAD estimates activation singular vectors offline using calibration data to construct an orthogonal transformation matrix P, shifting outliers to additional dimensions kept in full precision while quantizing the remaining components to 4 bits. Additionally, QUAD enables parameter-efficient fine-tuning via adaptable full-precision outlier weights, narrowing the accuracy gap between quantized and full-precision models. Experiments demonstrate that QUAD achieves 94%-96% accuracy under W4A4 quantization and 98% accuracy with W4A4/A8 and parameter-efficient fine-tuning for Llama-3 and Qwen-2.5 models. Our code is available at https://github.com/hyx1999/Quad. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
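<p class="is-size-7">A sketch of the decomposition idea as the abstract states it: estimate activation singular vectors from calibration data, rotate by an orthogonal P so outlier energy lands in a few trailing dimensions kept in full precision, and 4-bit-quantize the rest. The sizes and the toy quantizer are illustrative, not QUAD's actual procedure.</p>
<pre><code>import torch

d, n, k = 64, 512, 4                      # hidden size, calib tokens, outlier dims
X = torch.randn(d, n)
X[:2] *= 20                               # calibration activations with outliers

U, S, _ = torch.linalg.svd(X, full_matrices=False)
P = torch.cat([U[:, k:], U[:, :k]], dim=1)   # orthogonal: top-k directions last

def quant4(t):
    # Toy symmetric 4-bit quantizer (int range -8..7).
    scale = t.abs().max() / 7
    return (t / scale).round().clamp(-8, 7) * scale

x = torch.randn(d)
z = P.T @ x                                  # rotate into the decomposed basis
z_hat = torch.cat([quant4(z[:-k]), z[-k:]])  # keep the k outlier dims full precision
x_hat = P @ z_hat                            # rotate back
print((x - x_hat).norm() / x.norm())         # small reconstruction error
</code></pre>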
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 figures, 8 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19262">arXiv:2503.19262</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19262">pdf</a>, <a href="https://arxiv.org/format/2503.19262">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Hazing to Dehazing: Towards Realistic Haze Generation for Real-World Image Dehazing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yushuo Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zicheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuaicheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangtao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaohong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.19262v1-abstract-full"> Existing real-world image dehazing methods primarily attempt to fine-tune pre-trained models or adapt their inference procedures, thus heavily relying on the pre-trained models and associated training data. Moreover, restoring heavily distorted information under dense haze requires generative diffusion models, whose potential in dehazing remains underutilized partly due to their lengthy sampling processes. To address these limitations, we introduce a novel hazing-dehazing pipeline consisting of a Realistic Hazy Image Generation framework (HazeGen) and a Diffusion-based Dehazing framework (DiffDehaze). Specifically, HazeGen harnesses robust generative diffusion priors of real-world hazy images embedded in a pre-trained text-to-image diffusion model. By employing specialized hybrid training and blended sampling strategies, HazeGen produces realistic and diverse hazy images as high-quality training data for DiffDehaze.
To alleviate the inefficiency and fidelity concerns associated with diffusion-based methods, DiffDehaze adopts an Accelerated Fidelity-Preserving Sampling process (AccSamp). The core of AccSamp is the Tiled Statistical Alignment Operation (AlignOp), which can provide a clean and faithful dehazing estimate within a small fraction of the sampling steps, reducing complexity and enabling effective fidelity guidance. Extensive experiments demonstrate the superior dehazing performance and visual quality of our approach over existing methods. The code is available at https://github.com/ruiyi-w/Learning-Hazing-to-Dehazing. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p>
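<p class="is-size-7">One plausible reading of a tiled statistical alignment, offered only as a guess at AlignOp's flavor: match the per-tile mean and standard deviation of an early dehazing estimate to a reference. The tile size and the choice of reference are assumptions; the actual operation may differ.</p>
<pre><code>import torch

def align_tiles(est, ref, tile=8, eps=1e-6):
    # est, ref: (C, H, W); H and W assumed divisible by `tile`.
    C, H, W = est.shape
    e = est.reshape(C, H // tile, tile, W // tile, tile)
    r = ref.reshape(C, H // tile, tile, W // tile, tile)
    mu_e = e.mean(dim=(2, 4), keepdim=True)
    sd_e = e.std(dim=(2, 4), keepdim=True) + eps
    mu_r = r.mean(dim=(2, 4), keepdim=True)
    sd_r = r.std(dim=(2, 4), keepdim=True)
    aligned = (e - mu_e) / sd_e * sd_r + mu_r   # per-tile moment matching
    return aligned.reshape(C, H, W)

out = align_tiles(torch.rand(3, 64, 64), torch.rand(3, 64, 64))
print(out.shape)  # torch.Size([3, 64, 64])
</code></pre>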
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18420">arXiv:2503.18420</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18420">pdf</a>, <a href="https://arxiv.org/format/2503.18420">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Panorama Generation From NFoV Image Done Right </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+D">Dian Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Cheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiao-Ming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+C">Chengfei Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jian-Fang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wei-Shi Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.18420v1-abstract-full"> Generating 360-degree panoramas from a narrow field of view (NFoV) image is a promising computer vision task for Virtual Reality (VR) applications. Existing methods mostly assess the generated panoramas with InceptionNet- or CLIP-based metrics, which tend to measure perceptual image quality and are not suitable for evaluating distortion. In this work, we first propose a distortion-specific CLIP, named Distort-CLIP, to accurately evaluate panorama distortion, and we discover a 'visual cheating' phenomenon in previous works (i.e., a tendency to improve visual results by sacrificing distortion accuracy). This phenomenon arises because prior methods employ a single network to learn the distinct panorama distortion and content completion at once, which leads the model to prioritize optimizing the latter. To address the phenomenon, we propose PanoDecouple, a decoupled diffusion model framework, which decouples panorama generation into distortion guidance and content completion, aiming to generate panoramas with both accurate distortion and visual appeal. Specifically, we design a DistortNet for distortion guidance, imposing a panorama-specific distortion prior and a modified condition registration mechanism, and a ContentNet for content completion, imposing perspective image information. Additionally, a distortion correction loss function with Distort-CLIP is introduced to constrain the distortion explicitly. Extensive experiments validate that PanoDecouple surpasses existing methods in both distortion and visual metrics. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR2025.
Project page: https://isee-laboratory.github.io/PanoDecouple/ Code: https://github.com/iSEE-Laboratory/PanoDecouple/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18336">arXiv:2503.18336</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18336">pdf</a>, <a href="https://arxiv.org/format/2503.18336">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Rise of the Community Champions: From Reviewer Crunch to Community Power </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changlun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yao Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+N">Nan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.18336v1-abstract-full"> Academic publishing is facing a crisis driven by exponential growth in submissions and an overwhelmed peer review system, leading to inconsistent decisions and a severe reviewer shortage. This paper introduces Panvas, a platform that reimagines academic publishing as a continuous, community-driven process. Panvas addresses these systemic failures with a novel combination of economic incentives (paid reviews) and rich interaction mechanisms (multi-dimensional ratings, threaded discussions, and expert-led reviews). By moving beyond the traditional accept/reject paradigm and integrating paper hosting with code/data repositories and social networking, Panvas fosters a meritocratic environment for scholarly communication and presents a radical rethinking of how we evaluate and disseminate scientific knowledge. We present the system design, development roadmap, and a user study plan to evaluate its effectiveness. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress, vision paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18313">arXiv:2503.18313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18313">pdf</a>, <a href="https://arxiv.org/format/2503.18313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> DeepFund: Will LLM be Professional at Fund Investment? A Live Arena Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changlun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yao Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+N">Nan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.18313v1-abstract-full"> Large Language Models (LLMs) have demonstrated impressive capabilities across various domains, but their effectiveness in financial decision making, particularly in fund investment, remains inadequately evaluated. Current benchmarks primarily assess LLMs' understanding of financial documents rather than their ability to manage assets or analyze trading opportunities in dynamic market conditions. A critical limitation of existing evaluation methodologies is the backtesting approach, which suffers from information leakage when LLMs are evaluated on historical data they may have encountered during pretraining. This paper introduces DeepFund, a comprehensive platform for evaluating LLM-based trading strategies in a simulated live environment. Our approach implements a multi-agent framework where LLMs serve as both analysts and managers, creating a realistic simulation of investment decision making. The platform employs a forward-testing methodology that mitigates information leakage by evaluating models on market data released after their training cutoff dates.
We provide a web interface that visualizes model performance across different market conditions and investment parameters, enabling detailed comparative analysis. Through DeepFund, we aim to provide a more accurate and fair assessment of LLMs' capabilities in fund investment, offering insights into their potential real-world applications in financial markets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p>
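<p class="is-size-7">A tiny sketch of the forward-testing rule described above: a model is only fed market observations dated after its training cutoff, which is what blocks the information leakage that plagues backtesting. The cutoff dates and observations below are placeholders, not claims about real models or prices.</p>
<pre><code>from datetime import date

CUTOFFS = {"model-a": date(2024, 6, 1), "model-b": date(2024, 12, 1)}  # placeholders

observations = [
    (date(2024, 5, 20), "AAPL", 191.0),
    (date(2024, 9, 3),  "AAPL", 222.5),
    (date(2025, 1, 15), "AAPL", 233.1),
]

def forward_test_feed(model_name):
    cutoff = CUTOFFS[model_name]
    # Only post-cutoff data reaches the model, so it cannot have memorized it.
    return [obs for obs in observations if obs[0] > cutoff]

print(forward_test_feed("model-a"))  # two post-cutoff observations
print(forward_test_feed("model-b"))  # one post-cutoff observation
</code></pre>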
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18278">arXiv:2503.18278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18278">pdf</a>, <a href="https://arxiv.org/format/2503.18278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TopV: Compatible Token Pruning with Inference Time Optimization for Fast and Low-Memory Multimodal Vision Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sui%2C+Y">Yang Sui</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jinqi Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+L">Lingyi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Y">Yu Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chendi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jinghua Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yu Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Sadayappan%2C+P">Ponnuswamy Sadayappan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xia Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+B">Bo Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.18278v2-abstract-full"> Vision-Language Models (VLMs) demand substantial computational resources during inference, largely due to the extensive visual input tokens used to represent visual information. Previous studies have noted that visual tokens tend to receive less attention than text tokens, suggesting their lower importance during inference and potential for pruning. However, these prior methods encounter several challenges: reliance on greedy heuristic criteria for token importance and incompatibility with FlashAttention and KV cache. To address these issues, we introduce TopV, a compatible TOken Pruning approach with inference-Time Optimization for fast and low-memory VLMs, achieving efficient pruning without additional training or fine-tuning. Instead of relying on attention scores, we formulate token pruning as an optimization problem, accurately identifying important visual tokens while remaining compatible with FlashAttention. Additionally, since we only perform this pruning once during the prefilling stage, it effectively reduces KV cache size. Our optimization framework incorporates a visual-aware cost function considering factors such as Feature Similarity, Relative Spatial Distance, and Absolute Central Distance to measure the importance of each source visual token, enabling effective pruning of low-importance tokens. Extensive experiments demonstrate that our method outperforms previous token pruning methods, validating the effectiveness and efficiency of our approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
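<p class="is-size-7">A sketch of a visual-aware token cost of the kind the abstract lists (feature similarity plus two spatial distance terms), applied once at prefill to keep the top-k visual tokens. The weights and the exact combination are assumptions, not TopV's actual formula.</p>
<pre><code>import torch

def topv_like_prune(vis, pos, center, k, w=(1.0, 0.5, 0.5)):
    # vis: (N, D) visual token features; pos: (N, 2) grid coordinates.
    feat_sim = torch.nn.functional.cosine_similarity(
        vis, vis.mean(dim=0, keepdim=True), dim=-1)     # feature similarity
    rel_dist = torch.cdist(pos, pos).mean(dim=1)        # relative spatial distance
    ctr_dist = (pos - center).norm(dim=-1)              # absolute central distance
    # Higher similarity and lower distances mark a token as more important.
    importance = w[0] * feat_sim - w[1] * rel_dist - w[2] * ctr_dist
    keep = importance.topk(k).indices.sort().values     # prune once at prefill
    return vis[keep], keep

vis = torch.randn(576, 64)                              # e.g., 24x24 patch tokens
ys, xs = torch.meshgrid(torch.arange(24.), torch.arange(24.), indexing="ij")
pos = torch.stack([ys.flatten(), xs.flatten()], dim=1)
kept, idx = topv_like_prune(vis, pos, center=torch.tensor([11.5, 11.5]), k=144)
print(kept.shape)  # the KV cache now holds 144 instead of 576 visual tokens
</code></pre>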
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17865">arXiv:2503.17865</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17865">pdf</a>, <a href="https://arxiv.org/ps/2503.17865">ps</a>, <a href="https://arxiv.org/format/2503.17865">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Understanding Inverse Reinforcement Learning under Overparameterization: Non-Asymptotic Analysis and Global Optimality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruijia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+S">Siliang Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+A">Alfredo Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+M">Mingyi Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.17865v1-abstract-full"> The goal of the inverse reinforcement learning (IRL) task is to identify the underlying reward function and the corresponding optimal policy from a set of expert demonstrations. While most IRL algorithms' theoretical guarantees rely on a linear reward structure, we aim to extend the theoretical understanding of IRL to scenarios where the reward function is parameterized by neural networks. Meanwhile, conventional IRL algorithms usually adopt a nested structure, leading to computational inefficiency, especially in high-dimensional settings. To address this problem, we propose the first two-timescale single-loop IRL algorithm under a neural-network-parameterized reward and provide a non-asymptotic convergence analysis under overparameterization. Although prior optimality results for linear rewards do not apply, we show that our algorithm can identify the globally optimal reward and policy under certain neural network structures. This is the first IRL algorithm with a non-asymptotic convergence guarantee that provably achieves global optimality in neural network settings. </span> </p>
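<p class="is-size-7">A toy illustration of the two-timescale single-loop structure named above: both the policy and the neural reward are updated in one loop, on fast and slow step sizes respectively, instead of in nested optimization. The quadratic objectives are placeholders standing in for the paper's demonstration-based losses.</p>
<pre><code>import torch

reward = torch.nn.Linear(4, 1)          # stand-in for a neural-network reward
policy = torch.nn.Linear(4, 2)          # stand-in for a policy network
opt_fast = torch.optim.SGD(policy.parameters(), lr=1e-2)   # fast timescale
opt_slow = torch.optim.SGD(reward.parameters(), lr=1e-4)   # slow timescale

for step in range(200):                 # single loop, two simultaneous updates
    s = torch.randn(32, 4)              # sampled states (placeholder)
    # Placeholder losses: the policy chases the current (frozen) reward,
    # while the reward drifts slowly toward its own objective.
    policy_loss = -(reward(s).detach() * policy(s).logsumexp(-1, keepdim=True)).mean()
    reward_loss = (reward(s) - 1.0).pow(2).mean()
    opt_fast.zero_grad(); policy_loss.backward(); opt_fast.step()
    opt_slow.zero_grad(); reward_loss.backward(); opt_slow.step()
</code></pre>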
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17793">arXiv:2503.17793</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17793">pdf</a>, <a href="https://arxiv.org/format/2503.17793">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Every Sample Matters: Leveraging Mixture-of-Experts and High-Quality Data for Efficient and Accurate Code LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Codefuse"> Codefuse</a>, <a href="/search/cs?searchtype=author&amp;query=Team%2C+L">Ling Team</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+W">Wenting Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yuchen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Siba Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Q">Qing Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+P">Peng Di</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+J">Junpeng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Zi Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+T">Ting Guo</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zhengyu He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianguo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+S">Shijie Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">BingChang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Songshan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+S">Shuo Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+M">Min Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiaolong Yang</a> , et al.
(8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advancements in code large language models (LLMs) have demonstrated remarkable capabilities in code generation and understanding. It remains challenging, however, to build a code LLM that combines comprehensive performance with high efficiency. Many attempts have been released in the open-source community to break this trade-off, such as the Qwen Coder series and the DeepSeek Coder series. This paper introduces yet another attempt in this area, namely Ling-Coder-Lite. We leverage the efficient Mixture-of-Experts (MoE) architecture along with a set of high-quality data curation methods (especially those based on program analytics) to build an efficient yet powerful code LLM. Ling-Coder-Lite exhibits on-par performance on 12 representative coding benchmarks compared to state-of-the-art models of similar size, such as Qwen2.5-Coder-7B and DeepSeek-Coder-V2-Lite, while offering competitive latency and throughput. In practice, we achieve a 50% reduction in deployment resources compared to a similar-sized dense model without performance loss. To facilitate further research and development in this area, we open-source our models as well as a substantial portion of the high-quality data for the annealing and post-training stages. The models and data can be accessed at https://huggingface.co/inclusionAI/Ling-Coder-lite. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li>
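<p> The efficiency argument for MoE models such as Ling-Coder-Lite rests on sparse activation: each token is routed to a few experts, so only a fraction of the parameters run per token. Below is a minimal top-k routing layer sketched in plain numpy under assumed shapes; this is generic MoE routing for illustration, not Ling-Coder-Lite's actual architecture. </p>
<pre><code>
import numpy as np

def moe_forward(x, W_gate, experts, k=2):
    """Route each token to its top-k experts and mix their outputs."""
    logits = x @ W_gate                           # (tokens, n_experts) router scores
    top = np.argsort(logits, axis=-1)[:, -k:]     # indices of the k best experts
    gate = np.take_along_axis(logits, top, axis=-1)
    gate = np.exp(gate - gate.max(axis=-1, keepdims=True))
    gate = gate / gate.sum(axis=-1, keepdims=True)  # renormalized mixing weights
    y = np.zeros_like(x)
    for t in range(x.shape[0]):                   # only k experts run per token
        for j in range(k):
            W_in, W_out = experts[top[t, j]]
            h = np.maximum(x[t] @ W_in, 0.0)      # expert feed-forward (ReLU)
            y[t] += gate[t, j] * (h @ W_out)
    return y

rng = np.random.default_rng(0)
d, n_experts = 16, 4
experts = [(0.1 * rng.normal(size=(d, 4 * d)), 0.1 * rng.normal(size=(4 * d, d)))
           for _ in range(n_experts)]
out = moe_forward(rng.normal(size=(5, d)), rng.normal(size=(d, n_experts)), experts)
print(out.shape)                                  # (5, 16)
</code></pre>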
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17741">arXiv:2503.17741</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17741">pdf</a>, <a href="https://arxiv.org/format/2503.17741">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> RustMap: Towards Project-Scale C-to-Rust Migration via Program Analysis and LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xuemeng Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiakun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yijun Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haitao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunmiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yusuf%2C+I+N+B">Imam Nur Bani Yusuf</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Lingxiao Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17741v1-abstract-short" style="display: inline;"> Migrating existing C programs into Rust is increasingly desired, as Rust offers superior memory safety while maintaining C&#39;s high performance. However, vastly different features between C and Rust--e.g., distinct definitions and usages of pointers and references--pose significant challenges beyond mere syntactic translation. Existing automated translation tools, such as C2Rust, may rely too much o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17741v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17741v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17741v1-abstract-full" style="display: none;"> Migrating existing C programs into Rust is increasingly desired, as Rust offers superior memory safety while maintaining C&#39;s high performance. However, vastly different features between C and Rust--e.g., distinct definitions and usages of pointers and references--pose significant challenges beyond mere syntactic translation. Existing automated translation tools, such as C2Rust, may rely too much on syntactic, template-based translation and generate unsafe Rust code that is hard for human developers to read, maintain, or even compile. More semantic-aware translation that produces safer, idiomatic, and runnable Rust code is much needed. 
This paper introduces a novel dependency-guided and large language model (LLM)-based C-to-Rust translation approach, RustMap, based on three key ideas: (1) Utilize LLM capabilities to produce idiomatic Rust code from given small pieces of C code, (2) Mitigate LLM limitations in handling large codebases by breaking project-scale C programs into smaller units for translation according to their usage dependencies and composing them into a runnable Rust program, and (3) Enhance the correctness of the translated Rust program by using test cases to check input/output equivalence, isolate faulty code when execution states deviate, and iteratively refine the translation using feedback from compilation and test errors. We empirically evaluate RustMap on 126 real-world programs, including 125 from Rosetta Code and a 7000+ line bzip2 implementation, using GPT-4o as the LLM. RustMap shows promising results, guiding GPT-4o to produce idiomatic, readable, and functional Rust code with significantly less unsafe code than other tools, and revealing non-trivial translation patterns reusable for future research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
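<p> The third idea in the RustMap abstract, checking input/output equivalence and feeding errors back, is easy to picture as a loop. A deliberately stubbed sketch follows: ask_llm, run_rust, and c_reference are placeholders standing in for a model call, a compile-and-run harness, and the original C unit; none of them are the paper's actual components. </p>
<pre><code>
def ask_llm(prompt):
    # stub: a real system would call a code LLM here
    return "fn add(a: i64, b: i64) -> i64 { a + b }"

def run_rust(rust_src, test_input):
    # stub for compile-and-run; here we just simulate the translated unit
    a, b = test_input
    return a + b

def c_reference(test_input):
    a, b = test_input
    return a + b                        # behavior of the original C unit

def translate_unit(c_code, tests, max_rounds=3):
    prompt = f"Translate to idiomatic Rust:\n{c_code}"
    for _ in range(max_rounds):
        rust_src = ask_llm(prompt)
        bad = [t for t in tests if run_rust(rust_src, t) != c_reference(t)]
        if not bad:
            return rust_src             # all outputs match: accept translation
        prompt += f"\nOutputs mismatched on inputs {bad}; please fix."
    raise RuntimeError("could not reach input/output equivalence")

print(translate_unit("long add(long a, long b){return a+b;}", [(1, 2), (3, 4)]))
</code></pre>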
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17378">arXiv:2503.17378</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17378">pdf</a>, <a href="https://arxiv.org/format/2503.17378">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Large language model-powered AI systems achieve self-replication with no human intervention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+X">Xudong Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+J">Jiarun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yihe Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+M">Minyuan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Self-replication with no human intervention is broadly recognized as one of the principal red lines associated with frontier AI systems. While leading corporations such as OpenAI and Google DeepMind have assessed GPT-o3-mini and Gemini on replication-related tasks and concluded that these systems pose a minimal risk regarding self-replication, our research presents novel findings. Following the same evaluation protocol, we demonstrate that 11 out of 32 existing AI systems under evaluation already possess the capability of self-replication. In hundreds of experimental trials, we observe a non-trivial number of successful self-replication trials across mainstream model families worldwide, even including models as small as 14 billion parameters, which can run on personal computers. Furthermore, we note that self-replication capability increases as models become more generally intelligent. Also, by analyzing the behavioral traces of diverse AI systems, we observe that existing AI systems already exhibit sufficient planning, problem-solving, and creative capabilities to accomplish complex agentic tasks including self-replication. More alarmingly, we observe successful cases where an AI system performs self-exfiltration without explicit instructions, adapts to harsher computational environments without sufficient software or hardware support, and plots effective strategies to survive the shutdown command from humans. These novel findings offer a crucial time buffer for the international community to collaborate on establishing effective governance over the self-replication capabilities and behaviors of frontier AI systems, which could otherwise pose existential risks to human society if not well-controlled. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17288">arXiv:2503.17288</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17288">pdf</a>, <a href="https://arxiv.org/format/2503.17288">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring a Principled Framework for Deep Subspace Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+X">Xianghan Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhiyuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+X">Xianbiao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+R">Rong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chun-Guang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17288v1-abstract-short" style="display: inline;"> Subspace clustering is a classical unsupervised learning task, built on a basic assumption that high-dimensional data can be approximated by a union of subspaces (UoS). Nevertheless, the real-world data are often deviating from the UoS assumption. To address this challenge, state-of-the-art deep subspace clustering algorithms attempt to jointly learn UoS representations and self-expressive coeffic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17288v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17288v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17288v1-abstract-full" style="display: none;"> Subspace clustering is a classical unsupervised learning task, built on a basic assumption that high-dimensional data can be approximated by a union of subspaces (UoS). Nevertheless, the real-world data are often deviating from the UoS assumption. To address this challenge, state-of-the-art deep subspace clustering algorithms attempt to jointly learn UoS representations and self-expressive coefficients. However, the general framework of the existing algorithms suffers from a catastrophic feature collapse and lacks a theoretical guarantee to learn desired UoS representation. In this paper, we present a Principled fRamewOrk for Deep Subspace Clustering (PRO-DSC), which is designed to learn structured representations and self-expressive coefficients in a unified manner. Specifically, in PRO-DSC, we incorporate an effective regularization on the learned representations into the self-expressive model, prove that the regularized self-expressive model is able to prevent feature space collapse, and demonstrate that the learned optimal representations under certain condition lie on a union of orthogonal subspaces. 
Moreover, we provide a scalable and efficient approach to implement our PRO-DSC and conduct extensive experiments to verify our theoretical findings and demonstrate the superior performance of our proposed deep subspace clustering approach. The code is available at https://github.com/mengxianghan123/PRO-DSC. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is accepted by ICLR 2025. The first two authors contributed equally</span> </p> </li>
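<p> For readers new to the self-expressive model the abstract builds on: each point is represented as a combination of the other points, and the coefficient matrix then serves as a clustering affinity. A toy ridge-regularized version follows; it is not PRO-DSC itself, and zeroing the diagonal after solving is a simplification chosen for brevity. </p>
<pre><code>
import numpy as np

rng = np.random.default_rng(0)
# toy data: 20 points from each of two 2-D subspaces embedded in 10-D
B1, B2 = rng.normal(size=(10, 2)), rng.normal(size=(10, 2))
Z = np.hstack([B1 @ rng.normal(size=(2, 20)), B2 @ rng.normal(size=(2, 20))])

lam = 0.1
G = Z.T @ Z
C = np.linalg.solve(G + lam * np.eye(G.shape[0]), G)  # min ||Z - ZC||^2 + lam*||C||^2
np.fill_diagonal(C, 0.0)              # crude way to forbid trivial self-representation
W = np.abs(C) + np.abs(C.T)           # symmetric affinity used for spectral clustering
# same-subspace affinities come out clearly larger than cross-subspace ones
print(W[:20, :20].mean(), W[:20, 20:].mean())
</code></pre>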
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17222">arXiv:2503.17222</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17222">pdf</a>, <a href="https://arxiv.org/format/2503.17222">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Automating Adjudication of Cardiovascular Events Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sivarajkumar%2C+S">Sonish Sivarajkumar</a>, <a href="/search/cs?searchtype=author&amp;query=Ameri%2C+K">Kimia Ameri</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chuqin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanshan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Min Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Cardiovascular events, such as heart attacks and strokes, remain a leading cause of mortality globally, necessitating meticulous monitoring and adjudication in clinical trials. This process, traditionally performed manually by clinical experts, is time-consuming, resource-intensive, and prone to inter-reviewer variability, potentially introducing bias and hindering trial progress. This study addresses these critical limitations by presenting a novel framework for automating the adjudication of cardiovascular events in clinical trials using Large Language Models (LLMs). We developed a two-stage approach: first, employing an LLM-based pipeline for event information extraction from unstructured clinical data, and second, using an LLM-based adjudication process guided by a Tree of Thoughts approach and clinical endpoint committee (CEC) guidelines. Using cardiovascular event-specific clinical trial data, the framework achieved an F1-score of 0.82 for event extraction and an accuracy of 0.68 for adjudication. Furthermore, we introduce the CLEART score, a novel, automated metric specifically designed for evaluating the quality of AI-generated clinical reasoning in adjudicating cardiovascular events. This approach demonstrates significant potential for substantially reducing adjudication time and costs while maintaining high-quality, consistent, and auditable outcomes in clinical trials. The reduced variability and enhanced standardization also allow for faster identification and mitigation of risks associated with cardiovascular therapies. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
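<p> The two-stage design described above (extract event fields, then adjudicate against committee criteria) can be sketched as a pipeline of two model calls. In the sketch below, llm is a stub, and the prompts, JSON fields, and guideline check are invented for illustration rather than taken from the paper. </p>
<pre><code>
import json

def llm(prompt):
    # stub standing in for a real model call
    return json.dumps({"event": "myocardial infarction", "troponin_elevated": True})

def extract_event(note):
    # stage 1: pull structured event fields out of free-text clinical notes
    return json.loads(llm(f"Extract event fields as JSON from: {note}"))

def adjudicate(event, criteria):
    # stage 2: a real system would reason step by step (e.g., Tree of Thoughts)
    met = all(event.get(k) == v for k, v in criteria.items())
    return "confirmed" if met else "refuted"

note = "Patient with chest pain; troponin rise consistent with MI."
print(adjudicate(extract_event(note), {"troponin_elevated": True}))
</code></pre>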
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16976">arXiv:2503.16976</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16976">pdf</a>, <a href="https://arxiv.org/format/2503.16976">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GeoT: Geometry-guided Instance-dependent Transition Matrix for Semi-supervised Tooth Point Cloud Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+X">Xiaoqing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Achieving meticulous segmentation of tooth point clouds from intra-oral scans stands as an indispensable prerequisite for various orthodontic applications. Given the labor-intensive nature of dental annotation, a significant amount of data remains unlabeled, driving increasing interest in semi-supervised approaches. One primary challenge of existing semi-supervised medical segmentation methods lies in noisy pseudo labels generated for unlabeled data. To address this challenge, we propose GeoT, the first framework that employs an instance-dependent transition matrix (IDTM) to explicitly model noise in pseudo labels for semi-supervised dental segmentation. Specifically, to handle the extensive solution space of IDTM arising from tens of thousands of dental points, we introduce tooth geometric priors through two key components: point-level geometric regularization (PLGR) to enhance consistency between point adjacency relationships in 3D and IDTM spaces, and class-level geometric smoothing (CLGS) to leverage the fixed spatial distribution of tooth categories for optimal IDTM estimation. Extensive experiments performed on the public Teeth3DS dataset and a private dataset demonstrate that our method can make full use of unlabeled data to facilitate segmentation, achieving performance comparable to fully supervised methods with only $20\%$ of the labeled data. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IPMI 2025</span> </p> </li>
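<p> The core trick behind an instance-dependent transition matrix is to train through the noise: the model predicts clean class probabilities, and the loss compares the T-transformed prediction against the noisy pseudo label, so label noise is modeled rather than memorized. A toy version with a made-up per-point T follows; GeoT's geometric priors for estimating T are not modeled here. </p>
<pre><code>
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
n_points, n_classes = 5, 3
logits = rng.normal(size=(n_points, n_classes))    # model outputs per point

# one transition matrix per point: T[i, c, k] = P(pseudo label k | true class c)
T = np.stack([0.8 * np.eye(n_classes) + 0.2 / n_classes] * n_points)
noisy_labels = np.array([0, 1, 2, 1, 0])           # pseudo labels (possibly wrong)

clean_probs = softmax(logits)                      # what we want to be correct
noisy_probs = np.einsum("ic,ick->ik", clean_probs, T)  # predicted noisy distribution
loss = -np.log(noisy_probs[np.arange(n_points), noisy_labels]).mean()
print(round(loss, 4))                              # training loss flows through T
</code></pre>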
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IPMI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16816">arXiv:2503.16816</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16816">pdf</a>, <a href="https://arxiv.org/format/2503.16816">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ST-Prompt Guided Histological Hypergraph Learning for Spatial Gene Expression Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Niu%2C+Y">Yi Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiashuai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+Y">Yingkang Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiangbo Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Machado%2C+I">Ines Machado</a>, <a href="/search/cs?searchtype=author&amp;query=Crispin-Ortuzar%2C+M">Mireia Crispin-Ortuzar</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zeyu Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16816v1-abstract-short" style="display: inline;"> Spatial Transcriptomics (ST) reveals the spatial distribution of gene expression in tissues, offering critical insights into biological processes and disease mechanisms. However, predicting ST from H\&amp;E-stained histology images is challenging due to the heterogeneous relationship between histomorphology and gene expression, which arises from substantial variability across different patients and ti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16816v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16816v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16816v1-abstract-full" style="display: none;"> Spatial Transcriptomics (ST) reveals the spatial distribution of gene expression in tissues, offering critical insights into biological processes and disease mechanisms. However, predicting ST from H\&amp;E-stained histology images is challenging due to the heterogeneous relationship between histomorphology and gene expression, which arises from substantial variability across different patients and tissue sections. A more practical and valuable approach is to utilize ST data from a few local regions to predict the spatial transcriptomic landscape across the remaining regions in H&amp;E slides. In response, we propose PHG2ST, an ST-prompt guided histological hypergraph learning framework, which leverages sparse ST signals as prompts to guide histological hypergraph learning for global spatial gene expression prediction. Our framework fuses histological hypergraph representations at multiple scales through a masked ST-prompt encoding mechanism, improving robustness and generalizability. 
Benchmark evaluations on two public ST datasets demonstrate that PHG2ST outperforms the existing state-of-the-art methods and closely aligns with the ground truth. These results underscore the potential of leveraging sparse local ST data for scalable and cost-effective spatial gene expression mapping in real-world biomedical applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16471">arXiv:2503.16471</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16471">pdf</a>, <a href="https://arxiv.org/format/2503.16471">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Review of Brain-Computer Interface Technologies: Signal Acquisition Methods and Interaction Paradigms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Cheng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenzhong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Brain-Computer Interface (BCI) technology facilitates direct communication between the human brain and external devices, representing a substantial advancement in human-machine interaction. This review provides an in-depth analysis of various BCI paradigms, including classic paradigms, current classifications, and hybrid paradigms, each with distinct characteristics and applications. Additionally, we explore a range of signal acquisition methods, classified into non-implantation, intervention, and implantation techniques, elaborating on their principles and recent advancements.
By examining the interdependence between paradigms and signal acquisition technologies, this review offers a comprehensive perspective on how innovations in one domain propel progress in the other. The goal is to present insights into the future development of more efficient, user-friendly, and versatile BCI systems, emphasizing the synergy between paradigm design and signal acquisition techniques and their potential to transform the field. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 12 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16252">arXiv:2503.16252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16252">pdf</a>, <a href="https://arxiv.org/format/2503.16252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Fin-R1: A Large Language Model for Financial Reasoning through Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhaowei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+X">Xin Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Lou%2C+F">Fangqi Lou</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+L">Lingfeng Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+J">Jinyi Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jiajie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+W">Weige Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Ziwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xueqian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Sheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dezhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Z">Zuo Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liwen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Reasoning large language models are rapidly evolving across various domains. However, their capabilities in handling complex financial tasks still require in-depth exploration. In this paper, we introduce Fin-R1, a reasoning large language model specifically designed for the financial sector.
Fin-R1 is built using a two-stage architecture, leveraging a financial reasoning dataset distilled and processed from DeepSeek-R1. Through supervised fine-tuning (SFT) and reinforcement learning (RL) training, it demonstrates performance close to DeepSeek-R1 across a range of financial reasoning tasks with a parameter size of only 7 billion. It achieves state-of-the-art (SOTA) results on the FinQA and ConvFinQA tasks among the LLMs in our evaluation, surpassing larger models on other tasks as well. Fin-R1 showcases strong reasoning and decision-making capabilities, providing solutions to various problems encountered in the financial domain. Our code is available at https://github.com/SUFE-AIFLM-Lab/Fin-R1. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16133">arXiv:2503.16133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16133">pdf</a>, <a href="https://arxiv.org/ps/2503.16133">ps</a>, <a href="https://arxiv.org/format/2503.16133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Multi-Prompt Style Interpolation for Fine-Grained Artistic Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+K">Kai Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Text-driven image style transfer has seen remarkable progress with methods leveraging cross-modal embeddings for fast, high-quality stylization.
However, most existing pipelines assume a <em>single</em> textual style prompt, limiting the range of artistic control and expressiveness. In this paper, we propose a novel <em>multi-prompt style interpolation</em> framework that extends the recently introduced <strong>StyleMamba</strong> approach. Our method supports blending or interpolating among multiple textual prompts (e.g., &quot;cubism,&quot; &quot;impressionism,&quot; and &quot;cartoon&quot;), allowing the creation of nuanced or hybrid artistic styles within a <em>single</em> image. We introduce a <em>Multi-Prompt Embedding Mixer</em> combined with <em>Adaptive Blending Weights</em> to enable fine-grained control over the spatial and semantic influence of each style. Further, we propose a <em>Hierarchical Masked Directional Loss</em> to refine region-specific style consistency. Experiments and user studies confirm our approach outperforms single-prompt baselines and naive linear combinations of styles, achieving superior style fidelity, text-image alignment, and artistic flexibility, all while maintaining the computational efficiency offered by the state-space formulation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
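<p> The basic mechanics of multi-prompt blending can be shown in a few lines: normalize user weights over several prompt embeddings and mix them into a single style vector. The shapes and softmax weighting below are assumptions for a toy example, not the paper's Multi-Prompt Embedding Mixer. </p>
<pre><code>
import numpy as np

def blend_styles(prompt_embs, weights):
    w = np.asarray(weights, dtype=float)
    w = np.exp(w) / np.exp(w).sum()          # normalize user weights to a simplex
    return np.einsum("p,pd->d", w, prompt_embs)

rng = np.random.default_rng(0)
embs = rng.normal(size=(3, 512))             # e.g. "cubism", "impressionism", "cartoon"
mixed = blend_styles(embs, [2.0, 0.5, 1.0])  # lean the blend toward the first style
print(mixed.shape)                           # one style vector, usable wherever one was
</code></pre>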
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16129">arXiv:2503.16129</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16129">pdf</a>, <a href="https://arxiv.org/ps/2503.16129">ps</a>, <a href="https://arxiv.org/format/2503.16129">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Controllable Segmentation-Based Text-Guided Style Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingwen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chandrasekar%2C+A">Aravind Chandrasekar</a>, <a href="/search/cs?searchtype=author&amp;query=Rocha%2C+M">Mariana Rocha</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuqing Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16129v1-abstract-short" style="display: inline;"> We present a novel approach for controllable, region-specific style editing driven by textual prompts. Building upon the state-space style alignment framework introduced by \emph{StyleMamba}, our method integrates a semantic segmentation model into the style transfer pipeline. This allows users to selectively apply text-driven style changes to specific segments (e.g., ``turn the building into a cy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16129v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16129v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16129v1-abstract-full" style="display: none;"> We present a novel approach for controllable, region-specific style editing driven by textual prompts. Building upon the state-space style alignment framework introduced by \emph{StyleMamba}, our method integrates a semantic segmentation model into the style transfer pipeline. This allows users to selectively apply text-driven style changes to specific segments (e.g., ``turn the building into a cyberpunk tower&#39;&#39;) while leaving other regions (e.g., ``people&#39;&#39; or ``trees&#39;&#39;) unchanged. By incorporating region-wise condition vectors and a region-specific directional loss, our method achieves high-fidelity transformations that respect both semantic boundaries and user-driven style descriptions. Extensive experiments demonstrate that our approach can flexibly handle complex scene stylizations in real-world scenarios, improving control and quality over purely global style transfer methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16129v1-abstract-full').style.display = 'none'; document.getElementById('2503.16129v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous</a> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-next">Next</a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=0" class="pagination-link is-current" aria-current="page" aria-label="Page 1">1</a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-link" aria-label="Page 2">2</a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=100" class="pagination-link" aria-label="Page 3">3</a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=150" class="pagination-link" aria-label="Page 4">4</a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=200" class="pagination-link" aria-label="Page 5">5</a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> <li><a href="https://info.arxiv.org/help/contact.html">Contact</a></li> <li><a href="https://info.arxiv.org/help/subscribe">Subscribe</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li><a href="https://status.arxiv.org" target="_blank">arXiv Operational Status</a></li> </ul> </div> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>