
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 7,043 results for author: <span class="mathjax">Li, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Li%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24381">arXiv:2503.24381</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.24381">pdf</a>, <a href="https://arxiv.org/format/2503.24381">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> UniOcc: A Unified Benchmark for Occupancy Forecasting and Prediction in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiangyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xiaokang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Mingxuan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+S">Shuo Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+Z">Zhengzhong Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiachen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2503.24381v1-abstract-short" style="display: inline;"> We introduce UniOcc, a comprehensive, unified benchmark for occupancy forecasting (i.e., predicting future occupancies based on historical information) and current-frame occupancy prediction from camera images. UniOcc unifies data from multiple real-world datasets (i.e., nuScenes, Waymo) and high-fidelity driving simulators (i.e., CARLA, OpenCOOD), which provides 2D/3D occupancy labels with per-vo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24381v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24381v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24381v1-abstract-full" style="display: none;"> We introduce UniOcc, a comprehensive, unified benchmark for occupancy forecasting (i.e., predicting future occupancies based on historical information) and current-frame occupancy prediction from camera images. UniOcc unifies data from multiple real-world datasets (i.e., nuScenes, Waymo) and high-fidelity driving simulators (i.e., CARLA, OpenCOOD), which provides 2D/3D occupancy labels with per-voxel flow annotations and support for cooperative autonomous driving. In terms of evaluation, unlike existing studies that rely on suboptimal pseudo labels for evaluation, UniOcc incorporates novel metrics that do not depend on ground-truth occupancy, enabling robust assessment of additional aspects of occupancy quality. Through extensive experiments on state-of-the-art models, we demonstrate that large-scale, diverse training data and explicit flow information significantly enhance occupancy prediction and forecasting performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24381v1-abstract-full').style.display = 'none'; document.getElementById('2503.24381v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages; Dataset: https://huggingface.co/datasets/tasl-lab/uniocc; Code: https://github.com/tasl-lab/UniOcc</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24345">arXiv:2503.24345</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.24345">pdf</a>, <a href="https://arxiv.org/format/2503.24345">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PathOrchestra: A Comprehensive Foundation Model for Computational Pathology with Over 100 Diverse Clinical-Grade Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+F">Fang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jianfeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiawen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiaxuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zizhao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Minda Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yang Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+X">Xitong Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuenian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Q">Qiang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+S">Shengyi Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Mianxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+T">Tian Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaofan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yonghong He</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24345v1-abstract-short" style="display: inline;"> The complexity and variability inherent in high-resolution pathological images present significant challenges in computational pathology. While pathology foundation models leveraging AI have catalyzed transformative advancements, their development demands large-scale datasets, considerable storage capacity, and substantial computational resources. 
Furthermore, ensuring their clinical applicability&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24345v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24345v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24345v1-abstract-full" style="display: none;"> The complexity and variability inherent in high-resolution pathological images present significant challenges in computational pathology. While pathology foundation models leveraging AI have catalyzed transformative advancements, their development demands large-scale datasets, considerable storage capacity, and substantial computational resources. Furthermore, ensuring their clinical applicability and generalizability requires rigorous validation across a broad spectrum of clinical tasks. Here, we present PathOrchestra, a versatile pathology foundation model trained via self-supervised learning on a dataset comprising 300K pathological slides from 20 tissue and organ types across multiple centers. The model was rigorously evaluated on 112 clinical tasks using a combination of 61 private and 51 public datasets. These tasks encompass digital slide preprocessing, pan-cancer classification, lesion identification, multi-cancer subtype classification, biomarker assessment, gene expression prediction, and the generation of structured reports. PathOrchestra demonstrated exceptional performance across 27,755 WSIs and 9,415,729 ROIs, achieving over 0.950 accuracy in 47 tasks, including pan-cancer classification across various organs, lymphoma subtype diagnosis, and bladder cancer screening. Notably, it is the first model to generate structured reports for high-incidence colorectal cancer and diagnostically complex lymphoma-areas that are infrequently addressed by foundational models but hold immense clinical potential. Overall, PathOrchestra exemplifies the feasibility and efficacy of a large-scale, self-supervised pathology foundation model, validated across a broad range of clinical-grade tasks. Its high accuracy and reduced reliance on extensive data annotation underline its potential for clinical integration, offering a pathway toward more efficient and high-quality medical services. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24345v1-abstract-full').style.display = 'none'; document.getElementById('2503.24345v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24157">arXiv:2503.24157</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.24157">pdf</a>, <a href="https://arxiv.org/format/2503.24157">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LLM4FS: Leveraging Large Language Models for Feature Selection and How to Improve It </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiu%2C+X">Xianchao Xiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24157v1-abstract-short" style="display: inline;"> Recent advances in large language models (LLMs) have provided new opportunities for decision-making, particularly in the task of automated feature selection. In this paper, we first comprehensively evaluate LLM-based feature selection methods, covering the state-of-the-art DeepSeek-R1, GPT-o3-mini, and GPT-4.5. Then, we propose a novel hybrid strategy called LLM4FS that integrates LLMs with tradit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24157v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24157v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24157v1-abstract-full" style="display: none;"> Recent advances in large language models (LLMs) have provided new opportunities for decision-making, particularly in the task of automated feature selection. In this paper, we first comprehensively evaluate LLM-based feature selection methods, covering the state-of-the-art DeepSeek-R1, GPT-o3-mini, and GPT-4.5. Then, we propose a novel hybrid strategy called LLM4FS that integrates LLMs with traditional data-driven methods. Specifically, input data samples into LLMs, and directly call traditional data-driven techniques such as random forest and forward sequential selection. Notably, our analysis reveals that the hybrid strategy leverages the contextual understanding of LLMs and the high statistical reliability of traditional data-driven methods to achieve excellent feature selection performance, even surpassing LLMs and traditional data-driven methods. Finally, we point out the limitations of its application in decision-making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24157v1-abstract-full').style.display = 'none'; document.getElementById('2503.24157v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23829">arXiv:2503.23829</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23829">pdf</a>, <a href="https://arxiv.org/format/2503.23829">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Expanding RL with Verifiable Rewards Across Diverse Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+D">Dian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Linfeng Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Juntao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Mi%2C+H">Haitao Mi</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+Z">Zhaopeng Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23829v1-abstract-short" style="display: inline;"> Reinforcement learning (RL) with verifiable rewards (RLVR) has shown promising results in mathematical reasoning and coding tasks where well-structured reference answers are available. However, its applicability to broader domains remains underexplored. In this work, we study the extension of RLVR to more diverse domains such as medicine, chemistry, psychology, and economics. We observe high agree&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23829v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23829v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23829v1-abstract-full" style="display: none;"> Reinforcement learning (RL) with verifiable rewards (RLVR) has shown promising results in mathematical reasoning and coding tasks where well-structured reference answers are available. However, its applicability to broader domains remains underexplored. In this work, we study the extension of RLVR to more diverse domains such as medicine, chemistry, psychology, and economics. We observe high agreement in binary judgments across different large language models (LLMs) when objective reference answers exist, which challenges the necessity of large-scale annotation for training domain-specific reward models. To address the limitations of binary rewards when handling unstructured reference answers, we further incorporate model-based soft scoring into RLVR to improve its flexibility. Our experiments show that a distilled generative reward model can serve as an effective cross-domain verifier, providing reliable reward signals for RL without requiring domain-specific annotations. By fine-tuning a base 7B model using various RL algorithms against our reward model, we obtain policies that outperform state-of-the-art open-source aligned LLMs such as Qwen2.5-72B-Instruct and DeepSeek-R1-Distill-Qwen-32B by a large margin, across domains in free-form answer settings. 
This also strengthens RLVR&#39;s robustness and scalability, highlighting its potential for real-world applications with noisy or weak labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23829v1-abstract-full').style.display = 'none'; document.getElementById('2503.23829v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23786">arXiv:2503.23786</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23786">pdf</a>, <a href="https://arxiv.org/format/2503.23786">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MGD-SAM2: Multi-view Guided Detail-enhanced Segment Anything Model 2 for High-Resolution Class-agnostic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Haoran Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+P">Peixian Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+J">Jiahao Kou</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yuxin Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haoying Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiangyun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23786v1-abstract-short" style="display: inline;"> Segment Anything Models (SAMs), as vision foundation models, have demonstrated remarkable performance across various image analysis tasks. Despite their strong generalization capabilities, SAMs encounter challenges in fine-grained detail segmentation for high-resolution class-independent segmentation (HRCS), due to the limitations in the direct processing of high-resolution inputs and low-resoluti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23786v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23786v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23786v1-abstract-full" style="display: none;"> Segment Anything Models (SAMs), as vision foundation models, have demonstrated remarkable performance across various image analysis tasks. Despite their strong generalization capabilities, SAMs encounter challenges in fine-grained detail segmentation for high-resolution class-independent segmentation (HRCS), due to the limitations in the direct processing of high-resolution inputs and low-resolution mask predictions, and the reliance on accurate manual prompts. To address these limitations, we propose MGD-SAM2 which integrates SAM2 with multi-view feature interaction between a global image and local patches to achieve precise segmentation. 
MGD-SAM2 incorporates the pre-trained SAM2 with four novel modules: the Multi-view Perception Adapter (MPAdapter), the Multi-view Complementary Enhancement Module (MCEM), the Hierarchical Multi-view Interaction Module (HMIM), and the Detail Refinement Module (DRM). Specifically, we first introduce MPAdapter to adapt the SAM2 encoder for enhanced extraction of local details and global semantics in HRCS images. Then, MCEM and HMIM are proposed to further exploit local texture and global context by aggregating multi-view features within and across multi-scales. Finally, DRM is designed to generate gradually restored high-resolution mask predictions, compensating for the loss of fine-grained details resulting from directly upsampling the low-resolution prediction maps. Experimental results demonstrate the superior performance and strong generalization of our model on multiple high-resolution and normal-resolution datasets. Code will be available at https://github.com/sevenshr/MGD-SAM2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23786v1-abstract-full').style.display = 'none'; document.getElementById('2503.23786v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23777">arXiv:2503.23777</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23777">pdf</a>, <a href="https://arxiv.org/format/2503.23777">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CONGRAD:Conflicting Gradient Filtering for Multilingual Preference Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiangnan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Vu%2C+T">Thuy-Trang Vu</a>, <a href="/search/cs?searchtype=author&amp;query=Herold%2C+C">Christian Herold</a>, <a href="/search/cs?searchtype=author&amp;query=Tebbifakhr%2C+A">Amirhossein Tebbifakhr</a>, <a href="/search/cs?searchtype=author&amp;query=Khadivi%2C+S">Shahram Khadivi</a>, <a href="/search/cs?searchtype=author&amp;query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23777v1-abstract-short" style="display: inline;"> Naive joint training of large language models (LLMs) for multilingual preference alignment can suffer from negative interference. This is a known issue in multilingual training, where conflicting objectives degrade overall performance. However, the impact of this phenomenon in the context of multilingual preference alignment remains largely underexplored. 
   Abstract: Naive joint training of large language models (LLMs) for multilingual preference alignment can suffer from negative interference. This is a known issue in multilingual training, where conflicting objectives degrade overall performance. However, the impact of this phenomenon in the context of multilingual preference alignment remains largely underexplored. To address this issue, we propose CONGRAD, a scalable and effective filtering method that selects high-quality preference samples with minimal gradient conflicts across languages. Our method leverages gradient surgery to retain samples aligned with an aggregated multilingual update direction. Additionally, we incorporate a sublinear gradient compression strategy that reduces memory overhead during gradient accumulation. We integrate CONGRAD into self-rewarding framework and evaluate on LLaMA3-8B and Gemma2-2B across 10 languages. Results show that CONGRAD consistently outperforms strong baselines in both seen and unseen languages, with minimal alignment tax.
   Submitted 31 March, 2025; originally announced March 2025.

7. arXiv:2503.23721 [pdf, other]  cs.LG cs.AI
   Unimodal-driven Distillation in Multimodal Emotion Recognition with Dynamic Fusion
   Authors: Jiagen Li, Rui Yu, Huihao Huang, Huaicheng Yan
   Abstract: Multimodal Emotion Recognition in Conversations (MERC) identifies emotional states across text, audio and video, which is essential for intelligent dialogue systems and opinion analysis. Existing methods emphasize heterogeneous modal fusion directly for cross-modal integration, but often suffer from disorientation in multimodal learning due to modal heterogeneity and lack of instructive guidance. In this work, we propose SUMMER, a novel heterogeneous multimodal integration framework leveraging Mixture of Experts with Hierarchical Cross-modal Fusion and Interactive Knowledge Distillation. Key components include a Sparse Dynamic Mixture of Experts (SDMoE) for capturing dynamic token-wise interactions, a Hierarchical Cross-Modal Fusion (HCMF) for effective fusion of heterogeneous modalities, and Interactive Knowledge Distillation (IKD), which uses a pre-trained unimodal teacher to guide multimodal fusion in latent and logit spaces. Experiments on IEMOCAP and MELD show SUMMER outperforms state-of-the-art methods, particularly in recognizing minority and semantically similar emotions.
   Submitted 31 March, 2025; originally announced March 2025.
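   For the logit-space half of the teacher-guided distillation this abstract mentions, a standard temperature-scaled knowledge-distillation objective looks like the following. This is a generic PyTorch sketch of the usual KD formulation, not SUMMER's actual IKD module; the tensor shapes and class count are invented for illustration.

```python
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    """Soft-target KL term (teacher guidance in logit space) plus hard-label CE."""
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1 - alpha) * ce


# Dummy usage: a batch of 8 utterances over 6 emotion classes.
student_logits = torch.randn(8, 6, requires_grad=True)  # multimodal student
teacher_logits = torch.randn(8, 6)  # produced by a frozen unimodal teacher
labels = torch.randint(0, 6, (8,))
distillation_loss(student_logits, teacher_logits, labels).backward()
```

   The paper's IKD additionally guides latent representations and pairs this with the SDMoE and HCMF modules described above; none of that is reproduced here.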
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23718">arXiv:2503.23718</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23718">pdf</a>, <a href="https://arxiv.org/format/2503.23718">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Detecting Functional Bugs in Smart Contracts through LLM-Powered and Bug-Oriented Composite Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Binbin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xingshuang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Zonouz%2C+S">Saman Zonouz</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+N">Na Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Beyah%2C+R">Raheem Beyah</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+S">Shouling Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23718v1-abstract-short" style="display: inline;"> Smart contracts are fundamental pillars of the blockchain, playing a crucial role in facilitating various business transactions. However, these smart contracts are vulnerable to exploitable bugs that can lead to substantial monetary losses. A recent study reveals that over 80% of these exploitable bugs, which are primarily functional bugs, can evade the detection of current tools. The primary issu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23718v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23718v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23718v1-abstract-full" style="display: none;"> Smart contracts are fundamental pillars of the blockchain, playing a crucial role in facilitating various business transactions. However, these smart contracts are vulnerable to exploitable bugs that can lead to substantial monetary losses. A recent study reveals that over 80% of these exploitable bugs, which are primarily functional bugs, can evade the detection of current tools. The primary issue is the significant gap between understanding the high-level logic of the business model and checking the low-level implementations in smart contracts. Furthermore, identifying deeply rooted functional bugs in smart contracts requires the automated generation of effective detection oracles based on various bug features. To address these challenges, we design and implement PROMFUZZ, an automated and scalable system to detect functional bugs, in smart contracts. In PROMFUZZ, we first propose a novel Large Language Model (LLM)-driven analysis framework, which leverages a dual-agent prompt engineering strategy to pinpoint potentially vulnerable functions for further scrutiny. 
We then implement a dual-stage coupling approach, which focuses on generating invariant checkers that leverage logic information extracted from potentially vulnerable functions. Finally, we design a bug-oriented fuzzing engine, which maps the logical information from the high-level business model to the low-level smart contract implementations, and performs the bug-oriented fuzzing on targeted functions. We compare PROMFUZZ with multiple state-of-the-art methods. The results show that PROMFUZZ achieves 86.96% recall and 93.02% F1-score in detecting functional bugs, marking at least a 50% improvement in both metrics over state-of-the-art methods. Moreover, we perform an in-depth analysis on real-world DeFi projects and detect 30 zero-day bugs. Up to now, 24 zero-day bugs have been assigned CVE IDs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23718v1-abstract-full').style.display = 'none'; document.getElementById('2503.23718v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23715">arXiv:2503.23715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23715">pdf</a>, <a href="https://arxiv.org/format/2503.23715">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HOIGen-1M: A Large-scale Dataset for Human-Object Interaction Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinchen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongdong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xiaodong He</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23715v1-abstract-short" style="display: inline;"> Text-to-video (T2V) generation has made tremendous progress in generating complicated scenes based on texts. However, human-object interaction (HOI) often cannot be precisely generated by current T2V models due to the lack of large-scale videos with accurate captions for HOI. 
To address this issue, we introduce HOIGen-1M, the first largescale dataset for HOI Generation, consisting of over one mill&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23715v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23715v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23715v1-abstract-full" style="display: none;"> Text-to-video (T2V) generation has made tremendous progress in generating complicated scenes based on texts. However, human-object interaction (HOI) often cannot be precisely generated by current T2V models due to the lack of large-scale videos with accurate captions for HOI. To address this issue, we introduce HOIGen-1M, the first largescale dataset for HOI Generation, consisting of over one million high-quality videos collected from diverse sources. In particular, to guarantee the high quality of videos, we first design an efficient framework to automatically curate HOI videos using the powerful multimodal large language models (MLLMs), and then the videos are further cleaned by human annotators. Moreover, to obtain accurate textual captions for HOI videos, we design a novel video description method based on a Mixture-of-Multimodal-Experts (MoME) strategy that not only generates expressive captions but also eliminates the hallucination by individual MLLM. Furthermore, due to the lack of an evaluation framework for generated HOI videos, we propose two new metrics to assess the quality of generated videos in a coarse-to-fine manner. Extensive experiments reveal that current T2V models struggle to generate high-quality HOI videos and confirm that our HOIGen-1M dataset is instrumental for improving HOI video generation. Project webpage is available at https://liuqi-creat.github.io/HOIGen.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23715v1-abstract-full').style.display = 'none'; document.getElementById('2503.23715v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23708">arXiv:2503.23708</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23708">pdf</a>, <a href="https://arxiv.org/format/2503.23708">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Benchmarking and Assessing the Safety and Robustness of Autonomous Driving on Safety-critical Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingzheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xianglong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+S">Shikui Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhijun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xianqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+Y">Yanjun Pu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiakai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23708v1-abstract-short" style="display: inline;"> Autonomous driving has made significant progress in both academia and industry, including performance improvements in perception task and the development of end-to-end autonomous driving systems. However, the safety and robustness assessment of autonomous driving has not received sufficient attention. Current evaluations of autonomous driving are typically conducted in natural driving scenarios. H&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23708v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23708v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23708v1-abstract-full" style="display: none;"> Autonomous driving has made significant progress in both academia and industry, including performance improvements in perception task and the development of end-to-end autonomous driving systems. However, the safety and robustness assessment of autonomous driving has not received sufficient attention. Current evaluations of autonomous driving are typically conducted in natural driving scenarios. However, many accidents often occur in edge cases, also known as safety-critical scenarios. These safety-critical scenarios are difficult to collect, and there is currently no clear definition of what constitutes a safety-critical scenario. In this work, we explore the safety and robustness of autonomous driving in safety-critical scenarios. 
First, we provide a definition of safety-critical scenarios, including static traffic scenarios such as adversarial attack scenarios and natural distribution shifts, as well as dynamic traffic scenarios such as accident scenarios. Then, we develop an autonomous driving safety testing platform to comprehensively evaluate autonomous driving systems, encompassing not only the assessment of perception modules but also system-level evaluations. Our work systematically constructs a safety verification process for autonomous driving, providing technical support for the industry to establish standardized test framework and reduce risks in real-world road deployment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23708v1-abstract-full').style.display = 'none'; document.getElementById('2503.23708v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23684">arXiv:2503.23684</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23684">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Detail-aware multi-view stereo network for depth estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tian%2C+H">Haitao Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenxing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Helong Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23684v1-abstract-short" style="display: inline;"> Multi-view stereo methods have achieved great success for depth estimation based on the coarse-to-fine depth learning frameworks, however, the existing methods perform poorly in recovering the depth of object boundaries and detail regions. To address these issues, we propose a detail-aware multi-view stereo network (DA-MVSNet) with a coarse-to-fine framework. The geometric depth clues hidden in th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23684v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23684v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23684v1-abstract-full" style="display: none;"> Multi-view stereo methods have achieved great success for depth estimation based on the coarse-to-fine depth learning frameworks, however, the existing methods perform poorly in recovering the depth of object boundaries and detail regions. To address these issues, we propose a detail-aware multi-view stereo network (DA-MVSNet) with a coarse-to-fine framework. 
The geometric depth clues hidden in the coarse stage are utilized to maintain the geometric structural relationships between object surfaces and enhance the expressive capability of image features. In addition, an image synthesis loss is employed to constrain the gradient flow for detailed regions and further strengthen the supervision of object boundaries and texture-rich areas. Finally, we propose an adaptive depth interval adjustment strategy to improve the accuracy of object reconstruction. Extensive experiments on the DTU and Tanks &amp; Temples datasets demonstrate that our method achieves competitive results. The code is available at https://github.com/wsmtht520-/DAMVSNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23684v1-abstract-full').style.display = 'none'; document.getElementById('2503.23684v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23536">arXiv:2503.23536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.23536">pdf</a>, <a href="https://arxiv.org/format/2503.23536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Unlearnable Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiqiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Y">Yunbing Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23536v1-abstract-short" style="display: inline;"> Unlearnable data (ULD) has emerged as an innovative defense technique to prevent machine learning models from learning meaningful patterns from specific data, thus protecting data privacy and security. By introducing perturbations to the training data, ULD degrades model performance, making it difficult for unauthorized models to extract useful representations. Despite the growing significance of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23536v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23536v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23536v1-abstract-full" style="display: none;"> Unlearnable data (ULD) has emerged as an innovative defense technique to prevent machine learning models from learning meaningful patterns from specific data, thus protecting data privacy and security. 

arXiv:2503.23536 [pdf, other] cs.LG cs.AI
A Survey on Unlearnable Data
Authors: Jiahao Li, Yiqiang Chen, Yunbing Xing, Yang Gu, Xiangyuan Lan
Abstract: Unlearnable data (ULD) has emerged as an innovative defense technique to prevent machine learning models from learning meaningful patterns from specific data, thus protecting data privacy and security. By introducing perturbations to the training data, ULD degrades model performance, making it difficult for unauthorized models to extract useful representations. Despite the growing significance of ULD, existing surveys predominantly focus on related fields, such as adversarial attacks and machine unlearning, with little attention given to ULD as an independent area of study. This survey fills that gap by offering a comprehensive review of ULD, examining unlearnable data generation methods, public benchmarks, evaluation metrics, theoretical foundations and practical applications. We compare and contrast different ULD approaches, analyzing their strengths, limitations, and trade-offs related to unlearnability, imperceptibility, efficiency and robustness. Moreover, we discuss key challenges, such as balancing perturbation imperceptibility with model degradation and the computational complexity of ULD generation. Finally, we highlight promising future research directions to advance the effectiveness and applicability of ULD, underscoring its potential to become a crucial tool in the evolving landscape of data protection in machine learning.
Submitted 30 March, 2025; originally announced March 2025.
Comments: 31 pages, 3 figures
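
As background for the generation methods such a survey covers, one common family of ULD approaches crafts small, bounded "error-minimizing" perturbations so that a surrogate model already fits the perturbed data, leaving little signal for models trained on it later. The sketch below is a generic illustration on a logistic-regression surrogate; the step size, budget, and function names are assumptions, not a specific method from the survey.

```python
import numpy as np

def unlearnable_perturbation(x, y, w, b, eps=0.03, steps=20, lr=0.01):
    """Generic error-minimizing noise sketch: nudge x (within an L-infinity ball
    of radius eps) so a fixed surrogate classifier fits it almost perfectly,
    which is the shortcut that makes the perturbed data hard to learn from."""
    delta = np.zeros_like(x)
    for _ in range(steps):
        z = (x + delta) @ w + b                 # surrogate logits
        p = 1.0 / (1.0 + np.exp(-z))            # sigmoid probabilities
        grad_x = np.outer(p - y, w)             # d(cross-entropy)/d(input)
        delta -= lr * grad_x                    # descend: make the loss smaller
        delta = np.clip(delta, -eps, eps)       # stay inside the perturbation budget
    return x + delta

# toy usage: 5 two-feature samples, binary labels, random surrogate weights
rng = np.random.default_rng(0)
x, y = rng.normal(size=(5, 2)), rng.integers(0, 2, size=5).astype(float)
x_uld = unlearnable_perturbation(x, y, w=rng.normal(size=2), b=0.0)
print(np.abs(x_uld - x).max() <= 0.03)          # perturbation respects the budget
```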

arXiv:2503.23526 [pdf, ps, other] cs.DS
Network Unreliability in Almost-Linear Time
Authors: Ruoxu Cen, Jason Li, Debmalya Panigrahi
Abstract: The network unreliability problem asks for the probability that a given undirected graph gets disconnected when every edge independently fails with a given probability $p$. Valiant (1979) showed that this problem is #P-hard; therefore, the best we can hope for are approximation algorithms. In a classic result, Karger (1995) obtained the first FPTAS for this problem by leveraging the fact that when a graph disconnects, it almost always does so at a near-minimum cut, and there are only a small (polynomial) number of near-minimum cuts. Since then, a series of results have obtained progressively faster algorithms, culminating in the current bound of $m^{1+o(1)} + \tilde{O}(n^{3/2})$ (Cen, He, Li, and Panigrahi, 2024). In this paper, we obtain an $m^{1+o(1)}$-time algorithm for the network unreliability problem. This is essentially optimal, since we need $O(m)$ time to read the input graph. Our main new ingredient is relating network unreliability to an ideal tree packing of spanning trees (Thorup, 2001).
Submitted 30 March, 2025; originally announced March 2025.
Comments: To appear in STOC 2025
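
For intuition about the quantity being approximated, the sketch below estimates unreliability by naive Monte Carlo; this only illustrates the definition and is exponentially slower than the paper's $m^{1+o(1)}$ algorithm when the target probability is tiny. The use of networkx for the connectivity check is an assumption of the sketch.

```python
import random
import networkx as nx   # assumed here only for connectivity checks

def unreliability_mc(graph, p, trials=10_000, seed=0):
    """Naive Monte Carlo estimate of the probability that `graph` disconnects
    when each edge fails independently with probability p."""
    rng = random.Random(seed)
    disconnected = 0
    for _ in range(trials):
        surviving = [e for e in graph.edges if rng.random() > p]
        h = nx.Graph()
        h.add_nodes_from(graph.nodes)
        h.add_edges_from(surviving)
        if not nx.is_connected(h):
            disconnected += 1
    return disconnected / trials

print(unreliability_mc(nx.cycle_graph(6), p=0.2))   # small demo graph
```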

arXiv:2503.23523 [pdf, other] cs.CL cs.LG
Question-Aware Knowledge Graph Prompting for Enhancing Large Language Models
Authors: Haochen Liu, Song Wang, Chen Chen, Jundong Li
Abstract: Large Language Models (LLMs) often struggle with tasks requiring external knowledge, such as knowledge-intensive Multiple Choice Question Answering (MCQA). Integrating Knowledge Graphs (KGs) can enhance reasoning; however, existing methods typically demand costly fine-tuning or retrieve noisy KG information. Recent approaches leverage Graph Neural Networks (GNNs) to generate KG-based input embedding prefixes as soft prompts for LLMs but fail to account for question relevance, resulting in noisy prompts. Moreover, in MCQA tasks, the absence of relevant KG knowledge for certain answer options remains a significant challenge. To address these issues, we propose Question-Aware Knowledge Graph Prompting (QAP), which incorporates question embeddings into GNN aggregation to dynamically assess KG relevance. QAP employs global attention to capture inter-option relationships, enriching soft prompts with inferred knowledge. Experimental results demonstrate that QAP outperforms state-of-the-art methods across multiple datasets, highlighting its effectiveness.
Submitted 30 March, 2025; originally announced March 2025.
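
A rough way to picture question-aware aggregation is to weight each node's KG neighbours by their similarity to the question embedding before they are pooled into the soft prompt. The sketch below is a generic illustration of that idea, not QAP's actual architecture or update rule.

```python
import numpy as np

def question_aware_aggregate(node_feats, neighbors, question_emb):
    """Generic sketch of question-conditioned GNN aggregation: neighbours that
    look more relevant to the question receive larger softmax weights, so
    question-irrelevant KG facts contribute less to the pooled representation."""
    out = np.zeros_like(node_feats)
    for v, nbrs in neighbors.items():
        if not nbrs:
            out[v] = node_feats[v]
            continue
        scores = np.array([node_feats[u] @ question_emb for u in nbrs])
        weights = np.exp(scores - scores.max())
        weights /= weights.sum()                      # softmax over neighbours
        agg = sum(w * node_feats[u] for w, u in zip(weights, nbrs))
        out[v] = node_feats[v] + agg                  # residual-style update
    return out

# toy KG: 3 nodes, node 0 connected to nodes 1 and 2
feats = np.eye(3)
print(question_aware_aggregate(feats, {0: [1, 2], 1: [0], 2: [0]},
                               question_emb=np.array([0.0, 1.0, 0.2])))
```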

arXiv:2503.23460 [pdf] cs.HC
Workshop on Aesthetics of Connectivity for Empowerment at ACM Designing Interactive Systems 2024
Authors: Jun Hu, Mengru Xue, Cheng Yao, Yuan Feng, Jiabao Li, Preben Hansen
Abstract: Connectivity enabled by technologies such as the Internet of Things, Artificial Intelligence, Big Data, and Cloud Computing is rapidly transforming our interactions with the world and with each other. It reshapes social interactions, fostering collaboration, creativity, and unprecedented access to information and resources. However, this connected world and era demand innovative design approaches that harmonize technical functionality with human-centered values. We have run a series of workshops at different conferences, trying to engage the participants in discussions about the related challenges and opportunities, from digital art [1] and aesthetics [2] to AI-driven creativity [3], and their functional aspects in healthcare [1] and empowerment [2, 3]. We want to focus further on the intersection of these challenges where we see opportunities: leveraging aesthetics and connectivity as catalysts for empowerment.
Submitted 30 March, 2025; originally announced March 2025.

arXiv:2503.23365 [pdf, other] cs.CV cs.RO
OnSiteVRU: A High-Resolution Trajectory Dataset for High-Density Vulnerable Road Users
Authors: Zhangcun Yan, Jianqing Li, Peng Hang, Jian Sun
Abstract: With the acceleration of urbanization and the growth of transportation demands, the safety of vulnerable road users (VRUs, such as pedestrians and cyclists) in mixed traffic flows has become increasingly prominent, necessitating high-precision and diverse trajectory data to support the development and optimization of autonomous driving systems. However, existing datasets fall short in capturing the diversity and dynamics of VRU behaviors, making it difficult to meet the research demands of complex traffic environments. To address this gap, this study developed the OnSiteVRU datasets, which cover a variety of scenarios, including intersections, road segments, and urban villages. These datasets provide trajectory data for motor vehicles, electric bicycles, and human-powered bicycles, totaling approximately 17,429 trajectories with a precision of 0.04 seconds. The datasets integrate both aerial-view natural driving data and onboard real-time dynamic detection data, along with environmental information such as traffic signals, obstacles, and real-time maps, enabling a comprehensive reconstruction of interaction events. The results demonstrate that VRU_Data outperforms traditional datasets in terms of VRU density and scene coverage, offering a more comprehensive representation of VRU behavioral characteristics. This provides critical support for traffic flow modeling, trajectory prediction, and autonomous driving virtual testing. The dataset is publicly available for download at: https://www.kaggle.com/datasets/zcyan2/mixed-traffic-trajectory-dataset-in-from-shanghai.
Submitted 30 March, 2025; originally announced March 2025.

arXiv:2503.23137 [pdf, other] cs.CV cs.CL
When 'YES' Meets 'BUT': Can Large Models Comprehend Contradictory Humor Through Comparative Reasoning?
Authors: Tuo Liang, Zhe Hu, Jing Li, Hao Zhang, Yiren Lu, Yunlai Zhou, Yiran Qiao, Disheng Liu, Jeirui Peng, Jing Ma, Yu Yin
Abstract: Understanding humor, particularly when it involves complex, contradictory narratives that require comparative reasoning, remains a significant challenge for large vision-language models (VLMs). This limitation hinders AI's ability to engage in human-like reasoning and cultural expression. In this paper, we investigate this challenge through an in-depth analysis of comics that juxtapose panels to create humor through contradictions. We introduce YesBut (V2), a novel benchmark with 1,262 comic images from diverse multilingual and multicultural contexts, featuring comprehensive annotations that capture various aspects of narrative understanding. Using this benchmark, we systematically evaluate a wide range of VLMs through four complementary tasks spanning from surface content comprehension to deep narrative reasoning, with particular emphasis on comparative reasoning between contradictory elements. Our extensive experiments reveal that even the most advanced models significantly underperform compared to humans, with common failures in visual perception, key element identification, comparative analysis and hallucinations. We further investigate text-based training strategies and social knowledge augmentation methods to enhance model performance. Our findings not only highlight critical weaknesses in VLMs' understanding of cultural and creative expressions but also provide pathways toward developing context-aware models capable of deeper narrative understanding through comparative reasoning.
Submitted 29 March, 2025; originally announced March 2025.

arXiv:2503.23053 [pdf, other] cs.CL
A Training-free LLM Framework with Interaction between Contextually Related Subtasks in Solving Complex Tasks
Authors: Hongjia Liu, Jinlong Li
Abstract: Large language models (LLMs) have shown remarkable capabilities in solving complex tasks. Recent work has explored decomposing such tasks into subtasks with independent contexts. However, some contextually related subtasks may encounter information loss during execution, leading to redundant operations or execution failures. To address this issue, we propose a training-free framework with an interaction mechanism, which enables a subtask to query specific information or trigger certain actions in completed subtasks by sending requests. To implement interaction, we introduce a subtask trajectory memory to enable resumption of completed subtasks upon receiving interaction requests. Additionally, we propose a new action during execution, which generates a concise and precise description of the execution process and outcomes of a subtask, to assist subsequent subtasks in determining interaction targets and requests. We evaluate our framework on the interactive decision-making task WebShop and the multi-hop question answering task HotpotQA, with GPT-3.5 and GPT-4, and the comparison results show that our framework outperforms state-of-the-art training-free baselines.
Submitted 29 March, 2025; originally announced March 2025.
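
The bookkeeping such an interaction mechanism needs can be sketched as a small trajectory memory keyed by subtask id, plus a request hook that a later subtask can call. The class and method names below are hypothetical, and a real implementation would resume the stored trajectory with an LLM rather than echo it back.

```python
from dataclasses import dataclass

@dataclass
class SubtaskRecord:
    """Hypothetical record of one completed subtask."""
    subtask_id: str
    trajectory: list            # steps taken while executing the subtask
    summary: str                # concise description of process and outcome

class TrajectoryMemory:
    """Minimal sketch of a subtask trajectory memory: completed subtasks are
    stored so a later, contextually related subtask can send a request to one
    of them instead of losing that context."""
    def __init__(self):
        self._records = {}

    def store(self, record: SubtaskRecord):
        self._records[record.subtask_id] = record

    def summaries(self):
        # what a later subtask sees when choosing an interaction target
        return {sid: r.summary for sid, r in self._records.items()}

    def handle_request(self, subtask_id: str, request: str) -> str:
        # a real framework would resume the stored trajectory with an LLM;
        # here we simply echo the relevant stored context
        r = self._records[subtask_id]
        return f"[{subtask_id}] request={request!r}; last step={r.trajectory[-1]}"

memory = TrajectoryMemory()
memory.store(SubtaskRecord("find_price", ["search('red mug')", "open item 3"],
                           "Found a red mug for $12.99"))
print(memory.summaries())
print(memory.handle_request("find_price", "which item id did you open?"))
```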

arXiv:2503.23014 [pdf] cs.LG cs.AI
MSNGO: multi-species protein function annotation based on 3D protein structure and network propagation
Authors: Beibei Wang, Boyue Cui, Shiqu Chen, Xuan Wang, Yadong Wang, Junyi Li
Abstract: Motivation: In recent years, protein function prediction has broken through the bottleneck of sequence features, significantly improving prediction accuracy using high-precision protein structures predicted by AlphaFold2. While single-species protein function prediction methods have achieved remarkable success, multi-species protein function prediction methods are still in the stage of using PPI networks and sequence features. Providing effective cross-species label propagation for species with sparse protein annotations remains a challenging issue. To address this problem, we propose the MSNGO model, which integrates structural features and network propagation methods. Our validation shows that using structural features can significantly improve the accuracy of multi-species protein function prediction. Results: We employ graph representation learning techniques to extract amino acid representations from protein structure contact maps and train a structural model using a graph convolution pooling module to derive protein-level structural features. After incorporating the sequence features from ESM-2, we apply a network propagation algorithm to aggregate information and update node representations within a heterogeneous network. The results demonstrate that MSNGO outperforms previous multi-species protein function prediction methods that rely on sequence features and PPI networks. Availability: https://github.com/blingbell/MSNGO.
Submitted 29 March, 2025; originally announced March 2025.
Comments: 8 pages, 2 figures
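
Cross-species annotation spreading of this general kind is often implemented as iterative label propagation over a normalised network, $F \leftarrow \alpha S F + (1-\alpha) Y$. The sketch below illustrates that generic update on a toy graph; it shows the flavour of network propagation, not MSNGO's exact algorithm.

```python
import numpy as np

def propagate_labels(adj, seed_labels, alpha=0.8, iters=50):
    """Generic label-propagation sketch: repeatedly mix each node's labels with
    its neighbours' via the symmetrically normalised adjacency S, while pulling
    annotated seed nodes back toward their known labels."""
    deg = adj.sum(axis=1)
    d_inv_sqrt = np.zeros_like(deg)
    nz = deg > 0
    d_inv_sqrt[nz] = 1.0 / np.sqrt(deg[nz])
    s = d_inv_sqrt[:, None] * adj * d_inv_sqrt[None, :]
    f = seed_labels.astype(float).copy()
    for _ in range(iters):
        f = alpha * s @ f + (1 - alpha) * seed_labels
    return f

# toy network: 4 proteins on a path; only protein 0 carries one annotation
adj = np.array([[0, 1, 0, 0], [1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0]], dtype=float)
y = np.array([[1.0], [0.0], [0.0], [0.0]])
print(propagate_labels(adj, y).round(3))   # annotation mass decays along the path
```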

arXiv:2503.22998 [pdf, other] cs.LG cs.AI cs.CR
AuditVotes: A Framework Towards More Deployable Certified Robustness for Graph Neural Networks
Authors: Yuni Lai, Yulin Zhu, Yixuan Sun, Yulun Wu, Bin Xiao, Gaolei Li, Jianhua Li, Kai Zhou
Abstract: Despite advancements in Graph Neural Networks (GNNs), adaptive attacks continue to challenge their robustness. Certified robustness based on randomized smoothing has emerged as a promising solution, offering provable guarantees that a model's predictions remain stable under adversarial perturbations within a specified range. However, existing methods face a critical trade-off between accuracy and robustness, as achieving stronger robustness requires introducing greater noise into the input graph. This excessive randomization degrades data quality and disrupts prediction consistency, limiting the practical deployment of certifiably robust GNNs in real-world scenarios where both accuracy and robustness are essential. To address this challenge, we propose AuditVotes, the first framework to achieve both high clean accuracy and certifiably robust accuracy for GNNs. It integrates randomized smoothing with two key components, augmentation and conditional smoothing, aiming to improve data quality and prediction consistency. The augmentation, acting as a pre-processing step, de-noises the randomized graph, significantly improving data quality and clean accuracy. The conditional smoothing, serving as a post-processing step, employs a filtering function to selectively count votes, thereby filtering low-quality predictions and improving voting consistency. Extensive experimental results demonstrate that AuditVotes significantly enhances clean accuracy, certified robustness, and empirical robustness while maintaining high computational efficiency. Notably, compared to baseline randomized smoothing, AuditVotes improves clean accuracy by 437.1% and certified accuracy by 409.3% when the attacker can arbitrarily insert 20 edges on the Cora-ML dataset, representing a substantial step toward deploying certifiably robust GNNs in real-world applications.
Submitted 29 March, 2025; originally announced March 2025.
Comments: 20 pages
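
The post-processing step described above can be pictured as randomized smoothing in which only confident base-classifier votes are counted. The sketch below is a schematic rendering of that idea; `predict_fn`, the edge-flip noise model, and the confidence threshold are placeholders, not AuditVotes' actual components.

```python
import numpy as np

def perturb_edges(graph, rng, flip_prob=0.05):
    """Toy edge-flip randomisation on a dense adjacency matrix."""
    flips = rng.random(graph.shape) < flip_prob
    return np.abs(graph - flips.astype(float))

def filtered_smoothing_vote(predict_fn, graph, num_classes, samples=100,
                            confidence_threshold=0.7, seed=0):
    """Sketch of smoothing with conditional vote counting: predictions on
    randomised copies of the graph are counted only when the base classifier
    is confident, filtering out low-quality votes."""
    rng = np.random.default_rng(seed)
    votes = np.zeros(num_classes)
    for _ in range(samples):
        noisy_graph = perturb_edges(graph, rng)        # placeholder noise step
        label, confidence = predict_fn(noisy_graph)    # base classifier prediction
        if confidence >= confidence_threshold:         # conditional vote counting
            votes[label] += 1
    return int(votes.argmax()), votes

# toy usage with a dummy base classifier that votes on edge-count parity
adj = (np.random.default_rng(1).random((8, 8)) < 0.3).astype(float)
dummy = lambda g: (int(g.sum()) % 2, 0.9)
print(filtered_smoothing_vote(dummy, adj, num_classes=2))
```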

arXiv:2503.22875 [pdf, other] cs.DC cs.PF
A Pilot Study on Tunable Precision Emulation via Automatic BLAS Offloading
Authors: Hang Liu, Junjie Li, Yinzhi Wang
Abstract: This study explores the use of automatic BLAS offloading and INT8-based emulation for accelerating traditional HPC workloads on modern GPU architectures. Through the use of low-bitwidth integer units and a cache-coherent Unified Memory Architecture, we emulate double-precision matrix multiplications in the MuST application without code changes. We find that accuracy depends on both arithmetic precision and the properties of the operator, which can be dealt with through tunable precision emulation. Unlike traditional mixed-precision approaches, this method preserves the original algorithms while optimizing hardware utilization. We showcase the potential of improving accuracy and performance at the same time. This work highlights the potential of AI-driven hardware to transform HPC, advocating for adaptive precision strategies in future scientific computing.
Submitted 28 March, 2025; originally announced March 2025.
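
The general trick behind INT8 emulation of higher-precision GEMM is to split each operand into low-bitwidth "limbs" with per-limb scales and accumulate the integer partial products. The sketch below is a crude two-slice NumPy illustration of that idea with an arbitrary splitting scheme; it is not the scheme used in the study, and real deployments intercept BLAS calls rather than reimplement the matmul.

```python
import numpy as np

def int8_slices(a, num_slices=2):
    """Split a float matrix into int8 'limbs' plus per-slice scales so that
    a is approximately sum_k scale_k * slice_k."""
    slices, scales, residual = [], [], a.copy()
    for _ in range(num_slices):
        scale = np.abs(residual).max() / 127.0 or 1.0
        q = np.clip(np.round(residual / scale), -127, 127).astype(np.int8)
        slices.append(q)
        scales.append(scale)
        residual = residual - q.astype(np.float64) * scale
    return slices, scales

def emulated_matmul(a, b, num_slices=2):
    """Accumulate int8 x int8 partial products (exact in int32) to approximate
    a double-precision matmul; more slices means higher accuracy and more GEMMs."""
    sa, ca = int8_slices(a, num_slices)
    sb, cb = int8_slices(b, num_slices)
    out = np.zeros((a.shape[0], b.shape[1]))
    for qa, da in zip(sa, ca):
        for qb, db in zip(sb, cb):
            out += (qa.astype(np.int32) @ qb.astype(np.int32)) * (da * db)
    return out

rng = np.random.default_rng(0)
a, b = rng.normal(size=(64, 64)), rng.normal(size=(64, 64))
err = np.abs(emulated_matmul(a, b) - a @ b).max()
print(f"max abs error with 2 slices per operand: {err:.2e}")
```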

arXiv:2503.22679 [pdf, other] cs.CV
Q-Insight: Understanding Image Quality via Visual Reinforcement Learning
Authors: Weiqi Li, Xuanyu Zhang, Shijie Zhao, Yabin Zhang, Junlin Li, Li Zhang, Jian Zhang
Abstract: Image quality assessment (IQA) focuses on the perceptual visual quality of images, playing a crucial role in downstream tasks such as image reconstruction, compression, and generation. The rapid advancement of multi-modal large language models (MLLMs) has significantly broadened the scope of IQA, moving toward comprehensive image quality understanding that incorporates content analysis, degradation perception, and comparison reasoning beyond mere numerical scoring. Previous MLLM-based methods typically either generate numerical scores lacking interpretability or heavily rely on supervised fine-tuning (SFT) using large-scale annotated datasets to provide descriptive assessments, limiting their flexibility and applicability. In this paper, we propose Q-Insight, a reinforcement learning-based model built upon group relative policy optimization (GRPO), which demonstrates strong visual reasoning capability for image quality understanding while requiring only a limited amount of rating scores and degradation labels. By jointly optimizing score regression and degradation perception tasks with carefully designed reward functions, our approach effectively exploits their mutual benefits for enhanced performance. Extensive experiments demonstrate that Q-Insight substantially outperforms existing state-of-the-art methods in both score regression and degradation perception tasks, while exhibiting impressive zero-shot generalization to comparison reasoning tasks. Code will be available at https://github.com/lwq20020127/Q-Insight.
Submitted 28 March, 2025; originally announced March 2025.
Comments: Technical report
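
The group-relative part of GRPO standardises the rewards of a group of sampled responses to the same prompt, so no learned value critic is needed. The sketch below shows that computation together with placeholder rewards for the two tasks mentioned above; the actual reward design in the paper may differ.

```python
import numpy as np

def score_reward(pred_score, gt_score, tol=1.0):
    """Placeholder reward for the score-regression task: closer is better."""
    return max(0.0, 1.0 - abs(pred_score - gt_score) / tol)

def degradation_reward(pred_label, gt_label):
    """Placeholder reward for degradation perception: exact-match bonus."""
    return 1.0 if pred_label == gt_label else 0.0

def group_relative_advantages(rewards):
    """GRPO-style advantage: each sampled response is judged relative to the
    mean and spread of its own group rather than by an absolute critic."""
    r = np.asarray(rewards, dtype=float)
    return (r - r.mean()) / (r.std() + 1e-8)

# toy group of 4 sampled answers for one image: (predicted score, predicted degradation)
samples = [(3.2, "blur"), (4.9, "noise"), (3.0, "blur"), (1.5, "jpeg")]
gt = (3.0, "blur")
rewards = [score_reward(s, gt[0]) + degradation_reward(d, gt[1]) for s, d in samples]
print(group_relative_advantages(rewards).round(2))
```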

arXiv:2503.22498 [pdf, other] cs.LG hep-ph
Learnable cut flow
Authors: Jing Li, Hao Sun
Abstract: Neural networks have emerged as a powerful paradigm for tasks in high energy physics, yet their opaque training process renders them a black box. In contrast, the traditional cut flow method offers simplicity and interpretability but demands human effort to identify optimal boundaries. To merge the strengths of both approaches, we propose the Learnable Cut Flow (LCF), a neural network that transforms traditional cut selection into a fully differentiable, data-driven process. LCF implements two cut strategies to flexibly determine optimal boundaries: parallel, where observable distributions are treated independently, and sequential, where prior cuts shape subsequent ones. Building on this, we introduce the Learnable Importance, a metric that quantifies feature importance and adjusts their contributions to the loss accordingly, offering model-driven insights unlike ad-hoc metrics. To ensure differentiability, a modified loss function replaces hard cuts with mask operations, preserving data shape throughout the training process. LCF is tested on six varied mock datasets and a realistic diboson vs. QCD dataset. Results demonstrate that LCF (1) accurately learns cut boundaries across typical feature distributions in both parallel and sequential strategies, (2) assigns higher importance to discriminative features with minimal overlap, (3) handles redundant or correlated features robustly, and (4) performs effectively in real-world scenarios. In the diboson dataset, LCF initially underperforms boosted decision trees and multilayer perceptrons when using all observables. However, pruning less critical features, guided by the learned importance, boosts its performance to match or exceed these baselines. LCF bridges the gap between the traditional cut flow method and modern black-box neural networks, delivering actionable insights into the training process and feature importance.
Submitted 28 March, 2025; originally announced March 2025.
Comments: 26 pages, 33 figures
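
The "mask instead of hard cut" idea is easy to picture: replace the indicator function of a cut with a sigmoid, so each event gets a differentiable weight and the threshold can be learned by gradient descent. The sketch below is a generic illustration of that substitution, not LCF's exact loss or parameterisation.

```python
import numpy as np

def soft_cut_mask(x, threshold, sharpness=10.0, keep_above=True):
    """Differentiable surrogate for a hard cut: a sigmoid assigns each event a
    weight in (0, 1) instead of a binary keep/drop decision."""
    sign = 1.0 if keep_above else -1.0
    return 1.0 / (1.0 + np.exp(-sharpness * sign * (x - threshold)))

# toy "cut flow" on two observables: masks multiply instead of events being removed,
# which preserves the data shape during training, as the abstract describes
rng = np.random.default_rng(0)
pt, mass = rng.normal(50, 10, 1000), rng.normal(90, 5, 1000)
weights = (soft_cut_mask(pt, threshold=45.0)
           * soft_cut_mask(mass, 85.0)
           * soft_cut_mask(mass, 95.0, keep_above=False))
print(f"effective selected fraction: {weights.mean():.2f}")
```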

arXiv:2503.22166 [pdf, other] cs.LG
Reasoning of Large Language Models over Knowledge Graphs with Super-Relations
Authors: Song Wang, Junhong Lin, Xiaojie Guo, Julian Shun, Jundong Li, Yada Zhu
Abstract: While large language models (LLMs) have made significant progress in processing and reasoning over knowledge graphs, current methods suffer from a high non-retrieval rate. This limitation reduces the accuracy of answering questions based on these graphs. Our analysis reveals that the combination of greedy search and forward reasoning is a major contributor to this issue. To overcome these challenges, we introduce the concept of super-relations, which enables both forward and backward reasoning by summarizing and connecting various relational paths within the graph. This holistic approach not only expands the search space, but also significantly improves retrieval efficiency. In this paper, we propose the ReKnoS framework, which aims to Reason over Knowledge Graphs with Super-Relations. Our framework's key advantages include the inclusion of multiple relation paths through super-relations, enhanced forward and backward reasoning capabilities, and increased efficiency in querying LLMs. These enhancements collectively lead to a substantial improvement in the successful retrieval rate and overall reasoning performance. We conduct extensive experiments on nine real-world datasets to evaluate ReKnoS, and the results demonstrate the superior performance of ReKnoS over existing state-of-the-art baselines, with an average accuracy gain of 2.92%.
Submitted 28 March, 2025; originally announced March 2025.

arXiv:2503.21841 [pdf] cs.CV
HyperFree: A Channel-adaptive and Tuning-free Foundation Model for Hyperspectral Remote Sensing Imagery
Authors: Jingtao Li, Yingyi Liu, Xinyu Wang, Yunning Peng, Chen Sun, Shaoyu Wang, Zhendong Sun, Tian Ke, Xiao Jiang, Tangwei Lu, Anran Zhao, Yanfei Zhong
Abstract: Advanced interpretation of hyperspectral remote sensing images benefits many precise Earth observation tasks. Recently, visual foundation models have advanced remote sensing interpretation, but they concentrate on RGB and multispectral images. Because hyperspectral channel layouts vary, existing foundation models would face an image-by-image tuning situation, imposing great pressure on hardware and time resources. In this paper, we propose a tuning-free hyperspectral foundation model called HyperFree, by adapting existing visual prompt engineering. To process varied channel numbers, we design a learned weight dictionary covering the full spectrum from $0.4$ to $2.5\,\mu\text{m}$, which supports building the embedding layer dynamically. To make the prompt design more tractable, HyperFree can generate multiple semantic-aware masks for one prompt by treating feature distance as semantic similarity. After pre-training HyperFree on constructed large-scale high-resolution hyperspectral images, HyperFree (1 prompt) has shown comparable results with specialized models (5 shots) on 5 tasks and 11 datasets. Code and dataset are accessible at https://rsidea.whu.edu.cn/hyperfree.htm.
Submitted 27 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR2025
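
The channel-adaptive embedding idea can be sketched as a lookup: each input band selects the dictionary filter whose reference wavelength is closest, so an embedding layer of the right width is assembled per image without retuning. The nearest-wavelength lookup and the shapes below are assumptions for illustration, not HyperFree's actual mechanism.

```python
import numpy as np

def build_embedding_weights(band_wavelengths_um, dictionary, dict_wavelengths_um):
    """Sketch of channel-adaptive embedding: for each input band, pick the
    dictionary entry with the closest reference wavelength and stack the
    selected filters into an embedding layer sized to this image's band count."""
    idx = [np.argmin(np.abs(dict_wavelengths_um - w)) for w in band_wavelengths_um]
    return dictionary[idx]          # shape: (num_bands, embed_dim)

# toy dictionary: 211 entries spanning 0.4-2.5 um, embedding dimension 16
dict_wl = np.linspace(0.4, 2.5, 211)
dictionary = np.random.default_rng(0).normal(size=(211, 16))
# a sensor with 5 arbitrary bands gets its own embedding weights, no re-tuning
weights = build_embedding_weights(np.array([0.45, 0.56, 0.66, 0.86, 1.61]),
                                  dictionary, dict_wl)
print(weights.shape)    # (5, 16)
```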

arXiv:2503.21824 [pdf, other] cs.CV cs.CR
Protecting Your Video Content: Disrupting Automated Video-based LLM Annotations
Authors: Haitong Liu, Kuofeng Gao, Yang Bai, Jinmin Li, Jinxiao Shan, Tao Dai, Shu-Tao Xia
Abstract: Recently, video-based large language models (video-based LLMs) have achieved impressive performance across various video comprehension tasks. However, this rapid advancement raises significant privacy and security concerns, particularly regarding the unauthorized use of personal video data in automated annotation by video-based LLMs. These unauthorized annotated video-text pairs can then be used to improve the performance of downstream tasks, such as text-to-video generation. To safeguard personal videos from unauthorized use, we propose two series of protective video watermarks with imperceptible adversarial perturbations, named Ramblings and Mutes. Concretely, Ramblings aim to mislead video-based LLMs into generating inaccurate captions for the videos, thereby degrading the quality of video annotations through inconsistencies between video content and captions. Mutes, on the other hand, are designed to prompt video-based LLMs to produce exceptionally brief captions, lacking descriptive detail. Extensive experiments demonstrate that our video watermarking methods effectively protect video data by significantly reducing video annotation performance across various video-based LLMs, showcasing both stealthiness and robustness in protecting personal video content. Our code is available at https://github.com/ttthhl/Protecting_Your_Video_Content.
Submitted 26 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025
To achieve our goal, we propose a view augmentation framework to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21816v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21816v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21816v1-abstract-full" style="display: none;"> Gaussian Splatting (GS)-based methods rely on sufficient training view coverage and perform synthesis on interpolated views. In this work, we tackle the more challenging and underexplored Extrapolated View Synthesis (EVS) task. Here we enable GS-based models trained with limited view coverage to generalize well to extrapolated views. To achieve our goal, we propose a view augmentation framework to guide training through a coarse-to-fine process. At the coarse stage, we reduce rendering artifacts due to insufficient view coverage by introducing a regularization strategy at both appearance and geometry levels. At the fine stage, we generate reliable view priors to provide further training guidance. To this end, we incorporate an occlusion awareness into the view prior generation process, and refine the view priors with the aid of coarse stage output. We call our framework Enhanced View Prior Guidance for Splatting (EVPGS). To comprehensively evaluate EVPGS on the EVS task, we collect a real-world dataset called Merchandise3D dedicated to the EVS scenario. Experiments on three datasets including both real and synthetic demonstrate EVPGS achieves state-of-the-art performance, while improving synthesis quality at extrapolated views for GS-based methods both qualitatively and quantitatively. We will make our code, dataset, and models public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21816v1-abstract-full').style.display = 'none'; document.getElementById('2503.21816v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21809">arXiv:2503.21809</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21809">pdf</a>, <a href="https://arxiv.org/format/2503.21809">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Predictive Accuracy in Tennis: Integrating Fuzzy Logic and CV-GRNN for Dynamic Match Outcome and Player Momentum Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kechen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinpeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21809v1-abstract-short" style="display: inline;"> The predictive analysis of match outcomes and player momentum in professional tennis has long been a subject of scholarly debate. In this paper, we introduce a novel approach to game prediction by combining a multi-level fuzzy evaluation model with a CV-GRNN model. We first identify critical statistical indicators via Principal Component Analysis and then develop a two-tier fuzzy model based on th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21809v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21809v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21809v1-abstract-full" style="display: none;"> The predictive analysis of match outcomes and player momentum in professional tennis has long been a subject of scholarly debate. In this paper, we introduce a novel approach to game prediction by combining a multi-level fuzzy evaluation model with a CV-GRNN model. We first identify critical statistical indicators via Principal Component Analysis and then develop a two-tier fuzzy model based on the Wimbledon data. In addition, the results of Pearson Correlation Coefficient indicate that the momentum indicators, such as Player Win Streak and Score Difference, have a strong correlation among them, revealing insightful trends among players transitioning between losing and winning streaks. Subsequently, we refine the CV-GRNN model by incorporating 15 statistically significant indicators, resulting in an increase in accuracy to 86.64% and a decrease in MSE by 49.21%. This consequently strengthens the methodological framework for predicting tennis match outcomes, emphasizing its practical utility and potential for adaptation in various athletic contexts. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21809v1-abstract-full').style.display = 'none'; document.getElementById('2503.21809v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages,10 figures,9 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21807">arXiv:2503.21807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21807">pdf</a>, <a href="https://arxiv.org/format/2503.21807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> LERO: LLM-driven Evolutionary framework with Hybrid Rewards and Enhanced Observation for Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yuan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+X">Xiaohan Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianmin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21807v1-abstract-short" style="display: inline;"> Multi-agent reinforcement learning (MARL) faces two critical bottlenecks distinct from single-agent RL: credit assignment in cooperative tasks and partial observability of environmental states. We propose LERO, a framework integrating Large language models (LLMs) with evolutionary optimization to address these MARL-specific challenges. The solution centers on two LLM-generated components: a hybrid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21807v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21807v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21807v1-abstract-full" style="display: none;"> Multi-agent reinforcement learning (MARL) faces two critical bottlenecks distinct from single-agent RL: credit assignment in cooperative tasks and partial observability of environmental states. We propose LERO, a framework integrating Large language models (LLMs) with evolutionary optimization to address these MARL-specific challenges. The solution centers on two LLM-generated components: a hybrid reward function that dynamically allocates individual credit through reward decomposition, and an observation enhancement function that augments partial observations with inferred environmental context. 
An evolutionary algorithm optimizes these components through iterative MARL training cycles, where top-performing candidates guide subsequent LLM generations. Evaluations in Multi-Agent Particle Environments (MPE) demonstrate LERO&#39;s superiority over baseline methods, with improved task performance and training efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21807v1-abstract-full').style.display = 'none'; document.getElementById('2503.21807v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21729">arXiv:2503.21729</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21729">pdf</a>, <a href="https://arxiv.org/format/2503.21729">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ReaRAG: Knowledge-guided Reasoning Enhances Factuality of Large Reasoning Models with Iterative Retrieval Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Z">Zhicheng Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shulin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jinxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiajie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weichuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+X">Xiaoyin Che</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+L">Lei Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Juanzi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21729v1-abstract-short" style="display: inline;"> Large Reasoning Models (LRMs) exhibit remarkable reasoning abilities but rely primarily on parametric knowledge, limiting factual accuracy. While recent works equip reinforcement learning (RL)-based LRMs with retrieval capabilities, they suffer from overthinking and lack robustness in reasoning, reducing their effectiveness in question answering (QA) tasks. To address this, we propose ReaRAG, a fa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21729v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21729v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21729v1-abstract-full" style="display: none;"> Large Reasoning Models (LRMs) exhibit remarkable reasoning abilities but rely primarily on parametric knowledge, limiting factual accuracy. While recent works equip reinforcement learning (RL)-based LRMs with retrieval capabilities, they suffer from overthinking and lack robustness in reasoning, reducing their effectiveness in question answering (QA) tasks. 
To address this, we propose ReaRAG, a factuality-enhanced reasoning model that explores diverse queries without excessive iterations. Our solution includes a novel data construction framework with an upper bound on the reasoning chain length. Specifically, we first leverage an LRM to generate deliberate thinking, then select an action from a predefined action space (Search and Finish). For Search action, a query is executed against the RAG engine, where the result is returned as observation to guide reasoning steps later. This process iterates until a Finish action is chosen. Benefiting from ReaRAG&#39;s strong reasoning capabilities, our approach outperforms existing baselines on multi-hop QA. Further analysis highlights its strong reflective ability to recognize errors and refine its reasoning trajectory. Our study enhances LRMs&#39; factuality while effectively integrating robust reasoning for Retrieval-Augmented Generation (RAG). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21729v1-abstract-full').style.display = 'none'; document.getElementById('2503.21729v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21504">arXiv:2503.21504</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21504">pdf</a>, <a href="https://arxiv.org/format/2503.21504">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Keyword-Oriented Multimodal Modeling for Euphemism Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yuxue Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junsong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Meixuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+D">Dongyu Su</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tongguan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sha%2C+Y">Ying Sha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21504v1-abstract-short" style="display: inline;"> Euphemism identification deciphers the true meaning of euphemisms, such as linking &#34;weed&#34; (euphemism) to &#34;marijuana&#34; (target keyword) in illicit texts, aiding content moderation and combating underground markets. While existing methods are primarily text-based, the rise of social media highlights the need for multimodal analysis, incorporating text, images, and audio. 
However, the lack of multimod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21504v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21504v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21504v1-abstract-full" style="display: none;"> Euphemism identification deciphers the true meaning of euphemisms, such as linking &#34;weed&#34; (euphemism) to &#34;marijuana&#34; (target keyword) in illicit texts, aiding content moderation and combating underground markets. While existing methods are primarily text-based, the rise of social media highlights the need for multimodal analysis, incorporating text, images, and audio. However, the lack of multimodal datasets for euphemisms limits further research. To address this, we regard euphemisms and their corresponding target keywords as keywords and first introduce a keyword-oriented multimodal corpus of euphemisms (KOM-Euph), involving three datasets (Drug, Weapon, and Sexuality), including text, images, and speech. We further propose a keyword-oriented multimodal euphemism identification method (KOM-EI), which uses cross-modal feature alignment and dynamic fusion modules to explicitly utilize the visual and audio features of the keywords for efficient euphemism identification. Extensive experiments demonstrate that KOM-EI outperforms state-of-the-art models and large language models, and show the importance of our multimodal datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21504v1-abstract-full').style.display = 'none'; document.getElementById('2503.21504v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21450">arXiv:2503.21450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21450">pdf</a>, <a href="https://arxiv.org/format/2503.21450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> CMADiff: Cross-Modal Aligned Diffusion for Controllable Protein Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Changjian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Y">Yuexi Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+T">Tongtong Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiafeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuanghe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiangjing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jia Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+W">Wensheng Xiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21450v1-abstract-short" style="display: inline;"> AI-assisted protein design has emerged as a critical tool for advancing biotechnology, as deep generative models have demonstrated their reliability in this domain. However, most existing models primarily utilize protein sequence or structural data for training, neglecting the physicochemical properties of proteins. Moreover, they are unable to control the generation of proteins in intuitive con&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21450v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21450v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21450v1-abstract-full" style="display: none;"> AI-assisted protein design has emerged as a critical tool for advancing biotechnology, as deep generative models have demonstrated their reliability in this domain. However, most existing models primarily utilize protein sequence or structural data for training, neglecting the physicochemical properties of proteins. Moreover, they are unable to control the generation of proteins in intuitive conditions. To address these limitations, we propose CMADiff, a novel framework that enables controllable protein generation by aligning the physicochemical properties of protein sequences with text-based descriptions through a latent diffusion process. Specifically, CMADiff employs a Conditional Variational Autoencoder (CVAE) to integrate physicochemical features as conditional input, forming a robust latent space that captures biological traits. In this latent space, we apply a conditional diffusion process, which is guided by BioAligner, a contrastive learning-based module that aligns text descriptions with protein features, enabling text-driven control over protein sequence generation. 
Validated by a series of evaluations including AlphaFold3, CMADiff outperforms protein sequence generation benchmarks and holds strong potential for future applications. The implementation and code are available at https://github.com/HPC-NEAU/PhysChemDiff. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21450v1-abstract-full').style.display = 'none'; document.getElementById('2503.21450v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21323">arXiv:2503.21323</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21323">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DuckSegmentation: A segmentation model based on the AnYue Hemp Duck Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+L">Ling Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tianyu Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+R">Ruijie Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingxiao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bei Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21323v1-abstract-short" style="display: inline;"> The modernization of smart farming is a way to improve agricultural production efficiency and improve the agricultural production environment. Although many large models have achieved high accuracy in object recognition and segmentation tasks, they cannot really be put into use in the farming industry due to their poor interpretability and heavy computational cost. In this pa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21323v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21323v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21323v1-abstract-full" style="display: none;"> The modernization of smart farming is a way to improve agricultural production efficiency and improve the agricultural production environment. Although many large models have achieved high accuracy in object recognition and segmentation tasks, they cannot really be put into use in the farming industry due to their poor interpretability and heavy computational cost. In this paper, we built the AnYue Shelduck Dataset, which contains a total of 1951 shelduck images, and performed target detection and segmentation annotation with the help of professional annotators. 
Based on the AnYue Shelduck Dataset, this paper describes DuckProcessing, an efficient and powerful module for duck identification based on real shelduck farms. First, using the YOLOv8 module designed to detect the individual hemp ducks, Precision reached 98.10%, Recall reached 96.53% and the F1 score reached 0.95 on the test set. Next, the DuckSegmentation segmentation model reached 96.43% mIoU. Finally, DuckSegmentation was used as the teacher model and, through knowledge distillation, Deeplabv3 r50 was used as the student model; the final student model achieved 94.49% mIoU on the test set. The method provides a new way of thinking for practical hemp duck smart farming. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21323v1-abstract-full').style.display = 'none'; document.getElementById('2503.21323v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20840">arXiv:2503.20840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20840">pdf</a>, <a href="https://arxiv.org/format/2503.20840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> CodeTool: Enhancing Programmatic Tool Invocation of LLMs via Process Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yifei Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+F">Fanghua Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Q">Qiang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Cheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Haibo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaolong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+F">Feiliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20840v1-abstract-short" style="display: inline;"> Tool invocation significantly enhances the capabilities of Large Language Models (LLMs), yet challenges persist, particularly in complex task scenarios. Current methods, such as instruction-enhanced reasoning and supervised fine-tuning, often result in unnecessarily long reasoning paths and face difficulties in verifying the correctness of intermediate steps. 
In this paper, we propose CodeTool, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20840v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20840v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20840v1-abstract-full" style="display: none;"> Tool invocation significantly enhances the capabilities of Large Language Models (LLMs), yet challenges persist, particularly in complex task scenarios. Current methods, such as instruction-enhanced reasoning and supervised fine-tuning, often result in unnecessarily long reasoning paths and face difficulties in verifying the correctness of intermediate steps. In this paper, we propose CodeTool, a novel framework for stepwise code generation that improves LLM tool invocation by leveraging the concise and easily verifiable nature of code. CodeTool incorporates two distinct process rewards: the On-the-spot Reward, which provides immediate feedback on the accuracy of each tool invocation, and the Latent Reward, which assesses the contribution of each step toward overall task completion. By maximizing the cumulative reward of the On-the-spot and Latent Rewards at each step, LLMs are guided to follow efficient and accurate reasoning paths. Extensive experiments on StableToolBench and RestBench-TMDB demonstrate the superiority of CodeTool over existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20840v1-abstract-full').style.display = 'none'; document.getElementById('2503.20840v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20784">arXiv:2503.20784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20784">pdf</a>, <a href="https://arxiv.org/format/2503.20784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FB-4D: Spatial-Temporal Coherent Dynamic 3D Content Generation with Feature Banks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Huan-ang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+H">Haohan Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chenyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+C">Chenxi Du</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yiqian Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+M">Mingju Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Guiyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zongzheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+L">Li Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jingwei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20784v1-abstract-short" style="display: inline;"> With the rapid advancements in diffusion models and 3D generation techniques, dynamic 3D content generation has become a crucial research area. However, achieving high-fidelity 4D (dynamic 3D) generation with strong spatial-temporal consistency remains a challenging task. Inspired by recent findings that pretrained diffusion features capture rich correspondences, we propose FB-4D, a novel 4D gener&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20784v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20784v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20784v1-abstract-full" style="display: none;"> With the rapid advancements in diffusion models and 3D generation techniques, dynamic 3D content generation has become a crucial research area. However, achieving high-fidelity 4D (dynamic 3D) generation with strong spatial-temporal consistency remains a challenging task. Inspired by recent findings that pretrained diffusion features capture rich correspondences, we propose FB-4D, a novel 4D generation framework that integrates a Feature Bank mechanism to enhance both spatial and temporal consistency in generated frames. In FB-4D, we store features extracted from previous frames and fuse them into the process of generating subsequent frames, ensuring consistent characteristics across both time and multiple views. 
To ensure a compact representation, the Feature Bank is updated by a proposed dynamic merging mechanism. Leveraging this Feature Bank, we demonstrate for the first time that generating additional reference sequences through multiple autoregressive iterations can continuously improve generation performance. Experimental results show that FB-4D significantly outperforms existing methods in terms of rendering quality, spatial-temporal consistency, and robustness. It surpasses all multi-view generation tuning-free approaches by a large margin and achieves performance on par with training-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20784v1-abstract-full').style.display = 'none'; document.getElementById('2503.20784v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page:https://fb-4d.c7w.tech/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20754">arXiv:2503.20754</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20754">pdf</a>, <a href="https://arxiv.org/format/2503.20754">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Flying Vines: Design, Modeling, and Control of a Soft Aerial Robotic Arm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jitosho%2C+R">Rianna Jitosho</a>, <a href="/search/cs?searchtype=author&amp;query=Winston%2C+C+E">Crystal E. Winston</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shengan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ahlquist%2C+M">Maxwell Ahlquist</a>, <a href="/search/cs?searchtype=author&amp;query=Woehrle%2C+N+J">Nicholas John Woehrle</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C+K">C. Karen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Okamura%2C+A+M">Allison M. Okamura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20754v1-abstract-short" style="display: inline;"> Aerial robotic arms aim to enable inspection and environment interaction in otherwise hard-to-reach areas from the air. However, many aerial manipulators feature bulky or heavy robot manipulators mounted to large, high-payload aerial vehicles. Instead, we propose an aerial robotic arm with low mass and a small stowed configuration called a &#34;flying vine&#34;. 
The flying vine consists of a small, maneuv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20754v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20754v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20754v1-abstract-full" style="display: none;"> Aerial robotic arms aim to enable inspection and environment interaction in otherwise hard-to-reach areas from the air. However, many aerial manipulators feature bulky or heavy robot manipulators mounted to large, high-payload aerial vehicles. Instead, we propose an aerial robotic arm with low mass and a small stowed configuration called a &#34;flying vine&#34;. The flying vine consists of a small, maneuverable quadrotor equipped with a soft, growing, inflated beam as the arm. This soft robot arm is underactuated, and positioning of the end effector is achieved by controlling the coupled quadrotor-vine dynamics. In this work, we present the flying vine design and a modeling and control framework for tracking desired end effector trajectories. The dynamic model leverages data-driven modeling methods and introduces bilinear interpolation to account for time-varying dynamic parameters. We use trajectory optimization to plan quadrotor controls that produce desired end effector motions. Experimental results on a physical prototype demonstrate that our framework enables the flying vine to perform high-speed end effector tracking, laying a foundation for performing dynamic maneuvers with soft aerial manipulators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20754v1-abstract-full').style.display = 'none'; document.getElementById('2503.20754v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to RA-L</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20672">arXiv:2503.20672</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20672">pdf</a>, <a href="https://arxiv.org/format/2503.20672">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BizGen: Advancing Article-level Visual Text Rendering for Infographics Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yuyang Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shishi Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Keming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Q">Qisheng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bohan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+D">Danqing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Ji Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yuhui Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20672v1-abstract-short" style="display: inline;"> Recently, state-of-the-art text-to-image generation models, such as Flux and Ideogram 2.0, have made significant progress in sentence-level visual text rendering. In this paper, we focus on the more challenging scenarios of article-level visual text rendering and address a novel task of generating high-quality business content, including infographics and slides, based on user provided article-leve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20672v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20672v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20672v1-abstract-full" style="display: none;"> Recently, state-of-the-art text-to-image generation models, such as Flux and Ideogram 2.0, have made significant progress in sentence-level visual text rendering. In this paper, we focus on the more challenging scenarios of article-level visual text rendering and address a novel task of generating high-quality business content, including infographics and slides, based on user provided article-level descriptive prompts and ultra-dense layouts. The fundamental challenges are twofold: significantly longer context lengths and the scarcity of high-quality business content data. In contrast to most previous works that focus on a limited number of sub-regions and sentence-level prompts, ensuring precise adherence to ultra-dense layouts with tens or even hundreds of sub-regions in business content is far more challenging. 
We make two key technical contributions: (i) the construction of scalable, high-quality business content dataset, i.e., Infographics-650K, equipped with ultra-dense layouts and prompts by implementing a layer-wise retrieval-augmented infographic generation scheme; and (ii) a layout-guided cross attention scheme, which injects tens of region-wise prompts into a set of cropped region latent space according to the ultra-dense layouts, and refine each sub-regions flexibly during inference using a layout conditional CFG. We demonstrate the strong results of our system compared to previous SOTA systems such as Flux and SD3 on our BizEval prompt set. Additionally, we conduct thorough ablation experiments to verify the effectiveness of each component. We hope our constructed Infographics-650K and BizEval can encourage the broader community to advance the progress of business content generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20672v1-abstract-full').style.display = 'none'; document.getElementById('2503.20672v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025. Project Page: https://bizgen-msra.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20212">arXiv:2503.20212</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20212">pdf</a>, <a href="https://arxiv.org/format/2503.20212">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dolphin: A Large-Scale Automatic Speech Recognition Model for Eastern Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+Y">Yangyang Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinpeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+G">Guodong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+Y">Yu Pu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guanbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+H">Hu Du</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zhiming Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yukai Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei-Qiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20212v1-abstract-short" style="display: inline;"> This report introduces Dolphin, a large-scale multilingual automatic speech recognition (ASR) model that extends the Whisper architecture to support a wider range of languages. 
Our approach integrates in-house proprietary and open-source datasets to refine and optimize Dolphin&#39;s performance. The model is specifically designed to achieve notable recognition accuracy for 40 Eastern languages across&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20212v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20212v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20212v1-abstract-full" style="display: none;"> This report introduces Dolphin, a large-scale multilingual automatic speech recognition (ASR) model that extends the Whisper architecture to support a wider range of languages. Our approach integrates in-house proprietary and open-source datasets to refine and optimize Dolphin&#39;s performance. The model is specifically designed to achieve notable recognition accuracy for 40 Eastern languages across East Asia, South Asia, Southeast Asia, and the Middle East, while also supporting 22 Chinese dialects. Experimental evaluations show that Dolphin significantly outperforms current state-of-the-art open-source models across various languages. To promote reproducibility and community-driven innovation, we are making our trained models and inference source code publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20212v1-abstract-full').style.display = 'none'; document.getElementById('2503.20212v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20202">arXiv:2503.20202</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20202">pdf</a>, <a href="https://arxiv.org/format/2503.20202">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SARGes: Semantically Aligned Reliable Gesture Generation via Intent Chain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+N">Nan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yihua Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Weng%2C+D">Dongdong Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jiayi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20202v1-abstract-short" style="display: inline;"> Co-speech gesture generation enhances human-computer interaction realism through speech-synchronized gesture synthesis. However, generating semantically meaningful gestures remains a challenging problem. We propose SARGes, a novel framework that leverages large language models (LLMs) to parse speech content and generate reliable semantic gesture labels, which subsequently guide the synthesis of me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20202v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20202v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20202v1-abstract-full" style="display: none;"> Co-speech gesture generation enhances human-computer interaction realism through speech-synchronized gesture synthesis. However, generating semantically meaningful gestures remains a challenging problem. We propose SARGes, a novel framework that leverages large language models (LLMs) to parse speech content and generate reliable semantic gesture labels, which subsequently guide the synthesis of meaningful co-speech gestures.First, we constructed a comprehensive co-speech gesture ethogram and developed an LLM-based intent chain reasoning mechanism that systematically parses and decomposes gesture semantics into structured inference steps following ethogram criteria, effectively guiding LLMs to generate context-aware gesture labels. Subsequently, we constructed an intent chain-annotated text-to-gesture label dataset and trained a lightweight gesture label generation model, which then guides the generation of credible and semantically coherent co-speech gestures. 
Experimental results demonstrate that SARGes achieves highly semantically-aligned gesture labeling (50.2% accuracy) with efficient single-pass inference (0.4 seconds). The proposed method provides an interpretable intent reasoning pathway for semantic gesture synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20202v1-abstract-full').style.display = 'none'; document.getElementById('2503.20202v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20190">arXiv:2503.20190</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20190">pdf</a>, <a href="https://arxiv.org/format/2503.20190">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Modal Prototype Allocation: Unsupervised Slide Representation Learning via Patch-Text Contrast in Computational Pathology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiawen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jiali Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+X">Xitong Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+T">Tian Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+A">Anjia Han</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yonghong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20190v1-abstract-short" style="display: inline;"> With the rapid advancement of pathology foundation models (FMs), the representation learning of whole slide images (WSIs) attracts increasing attention. Existing studies develop high-quality patch feature extractors and employ carefully designed aggregation schemes to derive slide-level representations. However, mainstream weakly supervised slide representation learning methods, primarily based on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20190v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20190v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20190v1-abstract-full" style="display: none;"> With the rapid advancement of pathology foundation models (FMs), the representation learning of whole slide images (WSIs) attracts increasing attention. Existing studies develop high-quality patch feature extractors and employ carefully designed aggregation schemes to derive slide-level representations. However, mainstream weakly supervised slide representation learning methods, primarily based on multiple instance learning (MIL), are tailored to specific downstream tasks, which limits their generalizability. To address this issue, some studies explore unsupervised slide representation learning. 
However, these approaches focus solely on the visual modality of patches, neglecting the rich semantic information embedded in textual data. In this work, we propose ProAlign, a cross-modal unsupervised slide representation learning framework. Specifically, we leverage a large language model (LLM) to generate descriptive text for the prototype types present in a WSI, introducing patch-text contrast to construct initial prototype embeddings. Furthermore, we propose a parameter-free attention aggregation strategy that utilizes the similarity between patches and these prototypes to form unsupervised slide embeddings applicable to a wide range of downstream tasks. Extensive experiments on four public datasets show that ProAlign outperforms existing unsupervised frameworks and achieves performance comparable to some weakly supervised models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20190v1-abstract-full').style.display = 'none'; document.getElementById('2503.20190v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11pages,3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19839">arXiv:2503.19839</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19839">pdf</a>, <a href="https://arxiv.org/format/2503.19839">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FireEdit: Fine-grained Instruction-based Image Editing via Region-aware Vision Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zunnan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hanhui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yiji Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+F">Fa-Ting Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Q">Qinglin Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19839v2-abstract-short" style="display: inline;"> Currently, instruction-based image editing methods have made significant progress by leveraging the powerful cross-modal understanding capabilities of vision language models (VLMs). However, they still face challenges in three key areas: 1) complex scenarios; 2) semantic consistency; and 3) fine-grained editing. 

arXiv:2503.19839 (https://arxiv.org/abs/2503.19839) [cs.CV]
FireEdit: Fine-grained Instruction-based Image Editing via Region-aware Vision Language Model
Authors: Jun Zhou, Jiahao Li, Zunnan Xu, Hanhui Li, Yiji Cheng, Fa-Ting Hong, Qin Lin, Qinglin Lu, Xiaodan Liang
Abstract: Currently, instruction-based image editing methods have made significant progress by leveraging the powerful cross-modal understanding capabilities of vision language models (VLMs). However, they still face challenges in three key areas: 1) complex scenarios; 2) semantic consistency; and 3) fine-grained editing. To address these issues, we propose FireEdit, an innovative Fine-grained Instruction-based image editing framework that exploits a REgion-aware VLM. FireEdit is designed to accurately comprehend user instructions and ensure effective control over the editing process. Specifically, we enhance the fine-grained visual perception capabilities of the VLM by introducing additional region tokens. Relying solely on the output of the LLM to guide the diffusion model may lead to suboptimal editing results. Therefore, we propose a Time-Aware Target Injection module and a Hybrid Visual Cross Attention module. The former dynamically adjusts the guidance strength at various denoising stages by integrating timestep embeddings with the text embeddings. The latter enhances visual details for image editing, thereby preserving semantic consistency between the edited result and the source image. By combining the VLM enhanced with fine-grained region tokens and the time-dependent diffusion model, FireEdit demonstrates significant advantages in comprehending editing instructions and maintaining high semantic consistency. Extensive experiments indicate that our approach surpasses the state-of-the-art instruction-based image editing methods. Our project is available at https://zjgans.github.io/fireedit.github.io.
Submitted: 29 March, 2025; v1 submitted 25 March, 2025; originally announced March 2025.
Comments: Accepted to CVPR 2025
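
The Time-Aware Target Injection idea, modulating how strongly the text guidance is injected at each denoising step, can be approximated by a simple timestep-conditioned gate. The sinusoidal timestep features, the sigmoid gate, and all shapes here are assumptions made for illustration; the paper's module is a learned component inside a diffusion backbone.

```python
import numpy as np

def timestep_embedding(t, dim=128, max_period=10000.0):
    """Sinusoidal embedding of a diffusion timestep (assumed form)."""
    half = dim // 2
    freqs = np.exp(-np.log(max_period) * np.arange(half) / half)
    angles = t * freqs
    return np.concatenate([np.cos(angles), np.sin(angles)])

def time_aware_injection(hidden, text_emb, t, w_gate):
    """Scale the injected text guidance by a timestep-dependent gate (sketch).

    hidden:   (L, D) hidden states of the diffusion backbone at this step.
    text_emb: (D,)   pooled instruction embedding.
    t:        int    current denoising timestep.
    w_gate:   (D + 128,) gating weights (random here, learned in practice).
    """
    gate_in = np.concatenate([text_emb, timestep_embedding(t)])
    gate = 1.0 / (1.0 + np.exp(-(gate_in @ w_gate)))   # scalar strength in (0, 1)
    return hidden + gate * text_emb                    # broadcast over the sequence

rng = np.random.default_rng(1)
D = 64
hidden = rng.normal(size=(16, D))
text = rng.normal(size=(D,))
w = rng.normal(size=(D + 128,)) * 0.01
print(time_aware_injection(hidden, text, t=500, w_gate=w).shape)   # (16, 64)
```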

arXiv:2503.19717 (https://arxiv.org/abs/2503.19717) [cs.LG, cs.AI]
Invertible Koopman neural operator for data-driven modeling of partial differential equations
Authors: Yuhong Jin, Andong Cong, Lei Hou, Qiang Gao, Xiangdong Ge, Chonglong Zhu, Yongzhi Feng, Jun Li
Abstract: Koopman operator theory is a popular candidate for data-driven modeling because it provides a global linearization representation for nonlinear dynamical systems. However, existing Koopman operator-based methods suffer from shortcomings in constructing the well-behaved observable function and its inverse, and are not efficient enough when dealing with partial differential equations (PDEs). To address these issues, this paper proposes the Invertible Koopman Neural Operator (IKNO), a novel data-driven modeling approach inspired by Koopman operator theory and neural operators. IKNO leverages an Invertible Neural Network to parameterize the observable function and its inverse simultaneously under the same learnable parameters, explicitly guaranteeing the reconstruction relation and thus eliminating the dependency on a reconstruction loss, which is an essential improvement over the original Koopman Neural Operator (KNO). The structured linear matrix inspired by Koopman operator theory is parameterized to learn the evolution of the observables' low-frequency modes in the frequency space rather than directly in the observable space, ensuring that IKNO is resolution-invariant like other neural operators. Moreover, with preprocessing such as interpolation and dimension expansion, IKNO can be extended to operator learning tasks defined on non-Cartesian domains. We support the above claims with rich numerical and real-world examples, demonstrating the effectiveness of IKNO and its superiority over other neural operators.
Submitted: 25 March, 2025; originally announced March 2025.
Comments: 25 pages, 10 figures
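
The property the abstract emphasizes, an observable map and its inverse sharing one set of parameters so that reconstruction holds exactly, is precisely what additive coupling layers from invertible neural networks provide. The toy block below illustrates that property only; it is not the paper's architecture, and the identity Koopman matrix is a placeholder.

```python
import numpy as np

class AdditiveCoupling:
    """One invertible block: y1 = x1, y2 = x2 + m(x1); the inverse is exact."""

    def __init__(self, dim, hidden=64, seed=0):
        rng = np.random.default_rng(seed)
        self.w1 = rng.normal(scale=0.1, size=(dim // 2, hidden))
        self.w2 = rng.normal(scale=0.1, size=(hidden, dim - dim // 2))

    def _m(self, x1):
        return np.tanh(x1 @ self.w1) @ self.w2

    def forward(self, x):
        d = x.shape[-1] // 2
        x1, x2 = x[..., :d], x[..., d:]
        return np.concatenate([x1, x2 + self._m(x1)], axis=-1)

    def inverse(self, y):
        d = y.shape[-1] // 2
        y1, y2 = y[..., :d], y[..., d:]
        return np.concatenate([y1, y2 - self._m(y1)], axis=-1)

# The observable map g and its inverse share the same weights, so
# g_inv(K @ g(u)) needs no separate reconstruction loss.
dim = 32
layer = AdditiveCoupling(dim)
u = np.random.default_rng(2).normal(size=(dim,))
assert np.allclose(layer.inverse(layer.forward(u)), u)   # exact reconstruction
K = np.eye(dim)                                          # placeholder Koopman advance
u_next = layer.inverse(K @ layer.forward(u))
print(np.allclose(u_next, u))                            # True for the identity K
```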

arXiv:2503.19584 (https://arxiv.org/abs/2503.19584) [cs.AI, cs.CL, cs.SE]
Multi-agent Application System in Office Collaboration Scenarios
Authors: Songtao Sun, Jingyi Li, Yuanfei Dong, Haoguang Liu, Chenxin Xu, Fuyang Li, Qiang Liu
Abstract: This paper introduces a multi-agent application system designed to enhance office collaboration efficiency and work quality. The system integrates artificial intelligence, machine learning, and natural language processing technologies, achieving functionalities such as task allocation, progress monitoring, and information sharing. The agents within the system are capable of providing personalized collaboration support based on team members' needs and incorporate data analysis tools to improve decision-making quality. The paper also proposes an intelligent agent architecture that separates Plan and Solver, and through techniques such as multi-turn query rewriting and business tool retrieval, it enhances the agent's multi-intent and multi-turn dialogue capabilities. Furthermore, the paper details the design of tools and multi-turn dialogue in the context of office collaboration scenarios, and validates the system's effectiveness through experiments and evaluations. Ultimately, the system has demonstrated outstanding performance in real business applications, particularly in query understanding, task planning, and tool calling. Looking forward, the system is expected to play a more significant role in addressing complex interaction issues within dynamic environments and large-scale multi-agent systems.
Submitted: 25 March, 2025; v1 submitted 25 March, 2025; originally announced March 2025.
Comments: Technical report
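
The Plan/Solver separation, combined with multi-turn query rewriting and business tool retrieval, amounts to a small control loop: rewrite the query with dialogue context, plan tool-tagged steps, then execute them. The tool registry, rewriting stub, and rule-based planner below are hypothetical stand-ins used only to make that control flow concrete.

```python
from typing import Callable, Dict, List

# Hypothetical tool registry standing in for "business tool retrieval".
TOOLS: Dict[str, Callable[[str], str]] = {
    "calendar": lambda arg: f"meeting scheduled: {arg}",
    "progress": lambda arg: f"progress report for: {arg}",
}

def rewrite_query(query: str, history: List[str]) -> str:
    """Multi-turn query rewriting stub: fold the previous turn into the new query."""
    return query if not history else f"{query} (context: {history[-1]})"

def plan(query: str) -> List[dict]:
    """Planner: decompose the request into tool-tagged steps (toy keyword rules)."""
    steps = []
    if "schedule" in query:
        steps.append({"tool": "calendar", "arg": query})
    if "status" in query or "progress" in query:
        steps.append({"tool": "progress", "arg": query})
    return steps or [{"tool": "progress", "arg": query}]

def solve(steps: List[dict]) -> List[str]:
    """Solver: execute each planned step with the retrieved tool."""
    return [TOOLS[s["tool"]](s["arg"]) for s in steps]

history: List[str] = []
for turn in ["schedule a design review", "what is the status?"]:
    q = rewrite_query(turn, history)
    print(q, "->", solve(plan(q)))
    history.append(q)
```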

arXiv:2503.19391 (https://arxiv.org/abs/2503.19391) [cs.CV, cs.MA]
TraF-Align: Trajectory-aware Feature Alignment for Asynchronous Multi-agent Perception
Authors: Zhiying Song, Lei Yang, Fuxi Wen, Jun Li
Abstract: Cooperative perception presents significant potential for enhancing the sensing capabilities of individual vehicles; however, inter-agent latency remains a critical challenge. Latencies cause misalignments in both spatial and semantic features, complicating the fusion of real-time observations from the ego vehicle with delayed data from others. To address these issues, we propose TraF-Align, a novel framework that learns the flow path of features by predicting the feature-level trajectory of objects from past observations up to the ego vehicle's current time. By generating temporally ordered sampling points along these paths, TraF-Align directs attention from the current-time query to relevant historical features along each trajectory, supporting the reconstruction of current-time features and promoting semantic interaction across multiple frames. This approach corrects spatial misalignment and ensures semantic consistency across agents, effectively compensating for motion and achieving coherent feature fusion. Experiments on two real-world datasets, V2V4Real and DAIR-V2X-Seq, show that TraF-Align sets a new benchmark for asynchronous cooperative perception.
Submitted: 25 March, 2025; originally announced March 2025.
Comments: Accepted to CVPR 2025
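
The central operation, attending from a current-time query to historical features gathered along a predicted object trajectory, can be sketched in a few lines. The constant-velocity roll-out, nearest-cell BEV sampling, and feature sizes are assumptions for illustration, not the paper's learned trajectory predictor.

```python
import numpy as np

def predict_trajectory(pos, vel, delays):
    """Constant-velocity roll-out of an object's BEV position for delayed frames."""
    return np.stack([pos - vel * d for d in delays])            # (T, 2)

def sample_features(bev_maps, points, cell_size=1.0):
    """Nearest-cell sampling of per-frame BEV feature maps along the trajectory."""
    feats = []
    for fmap, (x, y) in zip(bev_maps, points):
        i = int(np.clip(round(y / cell_size), 0, fmap.shape[0] - 1))
        j = int(np.clip(round(x / cell_size), 0, fmap.shape[1] - 1))
        feats.append(fmap[i, j])
    return np.stack(feats)                                       # (T, C)

def traf_attend(query, hist_feats, temperature=1.0):
    """Attend from the current-time query to trajectory-ordered history."""
    scores = hist_feats @ query / (np.sqrt(query.size) * temperature)
    w = np.exp(scores - scores.max()); w /= w.sum()
    return w @ hist_feats                                        # fused (C,) feature

rng = np.random.default_rng(3)
C, H, W, T = 16, 64, 64, 3
bev_maps = rng.normal(size=(T, H, W, C))          # delayed frames from another agent
traj = predict_trajectory(pos=np.array([30.0, 20.0]),
                          vel=np.array([2.0, 0.5]),
                          delays=[0.1, 0.2, 0.3])
hist = sample_features(bev_maps, traj)
query = rng.normal(size=(C,))                     # ego current-time query feature
print(traf_attend(query, hist).shape)             # (16,)
```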

arXiv:2503.19207 (https://arxiv.org/abs/2503.19207) [cs.CV]
FRESA: Feedforward Reconstruction of Personalized Skinned Avatars from Few Images
Authors: Rong Wang, Fabian Prada, Ziyan Wang, Zhongshi Jiang, Chengxiang Yin, Junxuan Li, Shunsuke Saito, Igor Santesteban, Javier Romero, Rohan Joshi, Hongdong Li, Jason Saragih, Yaser Sheikh
Abstract: We present a novel method for reconstructing personalized 3D human avatars with realistic animation from only a few images. Due to the large variations in body shapes, poses, and cloth types, existing methods mostly require hours of per-subject optimization during inference, which limits their practical applications. In contrast, we learn a universal prior from over a thousand clothed humans to achieve instant feedforward generation and zero-shot generalization. Specifically, instead of rigging the avatar with shared skinning weights, we jointly infer personalized avatar shape, skinning weights, and pose-dependent deformations, which effectively improves overall geometric fidelity and reduces deformation artifacts. Moreover, to normalize pose variations and resolve coupled ambiguity between canonical shapes and skinning weights, we design a 3D canonicalization process to produce pixel-aligned initial conditions, which helps to reconstruct fine-grained geometric details. We then propose a multi-frame feature aggregation to robustly reduce artifacts introduced in canonicalization and fuse a plausible avatar preserving person-specific identities. Finally, we train the model in an end-to-end framework on a large-scale capture dataset, which contains diverse human subjects paired with high-quality 3D scans. Extensive experiments show that our method generates more authentic reconstruction and animation than state-of-the-art methods, and can be directly generalized to inputs from casually taken phone photos. Project page and code are available at https://github.com/rongakowang/FRESA.
Submitted: 24 March, 2025; originally announced March 2025.
Comments: Published in CVPR 2025
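
What "personalized skinning weights with pose-dependent deformations" changes relative to standard linear blend skinning is compact enough to write down: the per-vertex weights and a corrective offset are predicted per subject instead of being shared. The LBS routine below is the textbook formula; the random weights, transforms, and offsets merely stand in for network outputs.

```python
import numpy as np

def linear_blend_skinning(verts, weights, joint_transforms, pose_offsets=None):
    """Standard LBS: v' = sum_j w_j * (T_j @ [v + dv, 1]).

    verts:            (V, 3) canonical vertices.
    weights:          (V, J) per-vertex skinning weights (rows sum to 1);
                      in a FRESA-style pipeline these are predicted per subject.
    joint_transforms: (J, 4, 4) rigid transforms for the target pose.
    pose_offsets:     (V, 3) optional pose-dependent corrective deformation.
    """
    if pose_offsets is not None:
        verts = verts + pose_offsets
    homo = np.concatenate([verts, np.ones((verts.shape[0], 1))], axis=1)   # (V, 4)
    per_joint = np.einsum('jab,vb->vja', joint_transforms, homo)           # (V, J, 4)
    blended = np.einsum('vj,vja->va', weights, per_joint)                  # (V, 4)
    return blended[:, :3]

rng = np.random.default_rng(4)
V, J = 100, 4
verts = rng.normal(size=(V, 3))
weights = rng.random(size=(V, J)); weights /= weights.sum(axis=1, keepdims=True)
transforms = np.tile(np.eye(4), (J, 1, 1))
transforms[:, :3, 3] = rng.normal(scale=0.05, size=(J, 3))   # small joint translations
offsets = rng.normal(scale=0.01, size=(V, 3))
print(linear_blend_skinning(verts, weights, transforms, offsets).shape)   # (100, 3)
```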

arXiv:2503.18665 (https://arxiv.org/abs/2503.18665) [cs.CV]
Boosting Virtual Agent Learning and Reasoning: A Step-wise, Multi-dimensional, and Generalist Reward Model with Benchmark
Authors: Bingchen Miao, Yang Wu, Minghe Gao, Qifan Yu, Wendong Bu, Wenqiao Zhang, Yunfei Li, Siliang Tang, Tat-Seng Chua, Juncheng Li
Abstract: The development of Generalist Virtual Agents (GVAs) powered by Multimodal Large Language Models (MLLMs) has shown significant promise in autonomous task execution. However, current training paradigms face critical limitations, including reliance on outcome supervision and labor-intensive human annotations. To address these challenges, we propose Similar, a Step-wise Multi-dimensional Generalist Reward Model, which offers fine-grained signals for agent training and can choose better actions for inference-time scaling. Specifically, we begin by systematically defining five dimensions for evaluating agent actions. Building on this framework, we design an MCTS-P algorithm to automatically collect and annotate step-wise, five-dimensional agent execution data. Using this data, we train Similar with the Triple-M strategy. Furthermore, we introduce the first benchmark in the virtual agent domain for step-wise, multi-dimensional reward model training and evaluation, named SRM. This benchmark consists of two components: SRMTrain, which serves as the training set for Similar, and SRMEval, a manually selected test set for evaluating the reward model. Experimental results demonstrate that Similar, through its step-wise, multi-dimensional assessment and synergistic gain, provides GVAs with effective intermediate signals during both training and inference-time scaling. The code is available at https://github.com/Galery23/Similar-v1.
Submitted: 24 March, 2025; originally announced March 2025.
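
One way such a step-wise, multi-dimensional reward model is used at inference time is to score each candidate action along several dimensions and keep the best aggregate. The dimension names, the linear scorer, and the uniform weighting below are placeholders; the paper defines its own five dimensions and its own combination strategy.

```python
import numpy as np

# Placeholder dimension names; the paper defines its own five dimensions.
DIMENSIONS = ["helpfulness", "relevance", "efficiency", "progress", "coherence"]

def reward_model(state_feat, action_feat, w):
    """Toy stand-in for a step-wise reward model: one score per dimension."""
    x = np.concatenate([state_feat, action_feat])
    return np.tanh(w @ x)                              # (5,) scores in [-1, 1]

def best_action(state_feat, candidate_feats, w, dim_weights):
    """Inference-time scaling: pick the candidate with the best aggregate reward."""
    scores = np.array([dim_weights @ reward_model(state_feat, a, w)
                       for a in candidate_feats])
    return int(np.argmax(scores)), scores

rng = np.random.default_rng(5)
D = 32
w = rng.normal(scale=0.1, size=(len(DIMENSIONS), 2 * D))
state = rng.normal(size=(D,))
candidates = rng.normal(size=(4, D))                   # 4 sampled candidate actions
dim_weights = np.full(len(DIMENSIONS), 1 / len(DIMENSIONS))
idx, scores = best_action(state, candidates, w, dim_weights)
print(idx, scores.round(3))
```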

arXiv:2503.18578 (https://arxiv.org/abs/2503.18578) [cs.LG, cs.AI, cs.CV]
Galaxy Walker: Geometry-aware VLMs For Galaxy-scale Understanding
Authors: Tianyu Chen, Xingcheng Fu, Yisen Gao, Haodong Qian, Yuecen Wei, Kun Yan, Haoyi Zhou, Jianxin Li
Abstract: Modern vision-language models (VLMs) are built from the outset on patch embeddings and convolutional backbones defined in vector spaces, especially Euclidean ones. When expanding VLMs to a galaxy scale for understanding astronomical phenomena, the integration of spherical space for planetary orbits and hyperbolic spaces for black holes raises two formidable challenges: a) the current pre-training model is confined to Euclidean space rather than a comprehensive geometric embedding; b) the predominant architecture lacks suitable backbones for anisotropic physical geometries. In this paper, we introduce Galaxy-Walker, a geometry-aware VLM for universe-level vision understanding tasks. We propose a geometry prompt that generates geometry tokens by random walks across diverse spaces on a multi-scale physical graph, along with a geometry adapter that compresses and reshapes the space anisotropy in a mixture-of-experts manner. Extensive experiments demonstrate the effectiveness of our approach, with Galaxy-Walker achieving state-of-the-art performance in both galaxy property estimation ($R^2$ scores up to $0.91$) and morphology classification tasks (up to $+0.17$ F1 improvement in challenging features), significantly outperforming both domain-specific models and general-purpose VLMs.
Submitted: 24 March, 2025; originally announced March 2025.
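
The geometry-prompt idea, producing tokens by random walks over a multi-scale physical graph and embedding the visited nodes in a non-Euclidean space, can be sketched as a walk followed by a Poincaré-ball exponential map. The graph, walk length, and curvature below are generic choices, not the paper's.

```python
import numpy as np

def random_walk(adj, start, length, rng):
    """Uniform random walk on a graph given a dense adjacency matrix."""
    walk = [start]
    for _ in range(length - 1):
        nbrs = np.flatnonzero(adj[walk[-1]])
        walk.append(int(rng.choice(nbrs)) if nbrs.size else walk[-1])
    return walk

def poincare_exp0(v, c=1.0, eps=1e-9):
    """Exponential map at the origin of the Poincare ball (curvature -c)."""
    norm = np.linalg.norm(v) + eps
    return np.tanh(np.sqrt(c) * norm) * v / (np.sqrt(c) * norm)

rng = np.random.default_rng(6)
n_nodes, dim, walk_len = 12, 8, 5
adj = (rng.random((n_nodes, n_nodes)) < 0.3).astype(int)
adj = np.maximum(adj, adj.T); np.fill_diagonal(adj, 0)    # undirected, no self-loops
node_feats = rng.normal(scale=0.3, size=(n_nodes, dim))   # Euclidean node features

walk = random_walk(adj, start=0, length=walk_len, rng=rng)
geometry_tokens = np.stack([poincare_exp0(node_feats[i]) for i in walk])
print(walk, geometry_tokens.shape)                         # e.g. [0, 3, ...] (5, 8)
```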

arXiv:2503.18503 (https://arxiv.org/abs/2503.18503) [cs.LG, cs.CR]
Deterministic Certification of Graph Neural Networks against Graph Poisoning Attacks with Arbitrary Perturbations
Authors: Jiate Li, Meng Pang, Yun Dong, Binghui Wang
Abstract: Graph neural networks (GNNs) are becoming the de facto method to learn on graph data and have achieved state-of-the-art results on node and graph classification tasks. However, recent works show GNNs are vulnerable to training-time poisoning attacks: marginally perturbing edges, nodes, and/or node features of training graph(s) can largely degrade GNNs' testing performance. Most previous defenses against graph poisoning attacks are empirical and are soon broken by adaptive or stronger attacks. A few provable defenses provide robustness guarantees, but they have large gaps when applied in practice: 1) they restrict the attacker to only one type of perturbation; 2) they are designed for a particular GNN architecture or task; and 3) their robustness guarantees are not 100% accurate. In this work, we bridge all these gaps by developing PGNNCert, the first certified defense of GNNs against poisoning attacks under arbitrary (edge, node, and node feature) perturbations with deterministic robustness guarantees. Extensive evaluations on multiple node and graph classification datasets and GNNs demonstrate the effectiveness of PGNNCert in provably defending against arbitrary poisoning perturbations. PGNNCert is also shown to significantly outperform the state-of-the-art certified defenses against edge perturbation or node perturbation during GNN training.
Submitted: 24 March, 2025; originally announced March 2025.
Comments: Accepted at CVPR 2025

arXiv:2503.18455 (https://arxiv.org/abs/2503.18455) [cs.SE]
SEAlign: Alignment Training for Software Engineering Agent
Authors: Kechi Zhang, Huangzhao Zhang, Ge Li, Jinliang You, Jia Li, Yunfei Zhao, Zhi Jin
Abstract: Recent advances in code generation models have demonstrated impressive capabilities in automating software development tasks, yet these models still struggle in real-world software engineering scenarios. Although current training methods, particularly post-training, excel at solving competitive programming problems, they fail to adequately prepare models for the complexities of practical software development. This misalignment raises the critical question: Are existing alignment training methods well suited for real-world software engineering tasks? In this study, we identify this issue and propose SEAlign, a novel alignment framework designed to bridge the gap between code generation models and real-world software development tasks. SEAlign leverages the unique characteristics of software engineering processes, including high-quality workflow steps, to enhance model capabilities. Our framework further employs Monte Carlo Tree Search for fine-grained alignment in multi-step decision processes, followed by preference optimization on critical actions to ensure models meet real-world requirements. We evaluate SEAlign on three standard agentic benchmarks for real-world software engineering, including HumanEvalFix, SWE-Bench-Lite, and SWE-Bench-Verified. Experimental results demonstrate state-of-the-art performance with minimal training overhead. In addition, we develop an agent-based software development platform using SEAlign, which successfully automates the creation of several small applications. Human evaluations of these applications highlight significant improvements in both task performance and user experience. Our findings underscore the potential of SEAlign to accelerate the adoption of large code models in real-world software development. We believe that this research makes a meaningful step towards fully automated software engineering.
Submitted: 24 March, 2025; originally announced March 2025.
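
The abstract does not spell out the preference-optimization objective, so the snippet below shows a generic DPO-style pairwise loss of the kind commonly applied to (chosen, rejected) pairs over critical actions; treat every detail as an assumption rather than the paper's method.

```python
import numpy as np

def pairwise_preference_loss(logp_chosen, logp_rejected,
                             ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Generic DPO-style loss on one (chosen, rejected) action pair.

    Inputs are summed token log-probabilities of the critical action under
    the policy being trained and under a frozen reference policy.
    """
    margin = beta * ((logp_chosen - ref_logp_chosen)
                     - (logp_rejected - ref_logp_rejected))
    return -np.log(1.0 / (1.0 + np.exp(-margin)))   # -log sigmoid(margin)

# Toy numbers: the policy already prefers the chosen action slightly.
print(round(pairwise_preference_loss(-12.0, -15.0, -13.0, -14.5), 4))
```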

arXiv:2503.18432 (https://arxiv.org/abs/2503.18432) [cs.CL, cs.AI, cs.LG]
Teaching LLMs for Step-Level Automatic Math Correction via Reinforcement Learning
Authors: Junsong Li, Jie Zhou, Yutao Yang, Bihao Zhan, Qianjun Pan, Yuyang Ding, Qin Chen, Jiang Bo, Xin Lin, Liang He
Abstract: Automatic math correction aims to check students' solutions to mathematical problems via artificial intelligence technologies. Most existing studies focus on judging the final answer at the problem level, while ignoring detailed feedback on each step of the problem-solving process, which requires semantic understanding and reasoning abilities. In this paper, we propose a reinforcement learning (RL)-based method, named StepAMC, to boost large language models (LLMs) for step-level automatic math correction. In particular, we recast step-level automatic math correction from a text classification task into an RL problem to enhance the reasoning capabilities of LLMs. We then design a space-constrained policy network to improve the stability of RL and introduce a fine-grained reward network to convert the binary human feedback into a continuous value. We conduct extensive experiments on two benchmark datasets, and the results show that our model outperforms eleven strong baselines.
Submitted: 24 March, 2025; originally announced March 2025.
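
The fine-grained reward network is described as converting binary human feedback into a continuous value; a minimal stand-in is a logistic scorer fit on 0/1 step labels whose sigmoid output then serves as the continuous reward. Everything below (features, optimizer, sizes) is a toy assumption, not the paper's network.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def train_reward_net(feats, labels, lr=0.5, epochs=200):
    """Fit a logistic scorer on binary step-correctness labels.

    Its sigmoid output is then used as a continuous reward in [0, 1]
    instead of the raw 0/1 feedback.
    """
    w = np.zeros(feats.shape[1])
    for _ in range(epochs):
        p = sigmoid(feats @ w)
        w -= lr * feats.T @ (p - labels) / len(labels)   # gradient of the BCE loss
    return w

rng = np.random.default_rng(7)
X = rng.normal(size=(200, 16))                 # toy step embeddings
true_w = rng.normal(size=16)
y = (X @ true_w + 0.3 * rng.normal(size=200) > 0).astype(float)   # binary feedback
w = train_reward_net(X, y)
print(sigmoid(X[:5] @ w).round(3))             # continuous rewards for 5 steps
```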