Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 3,287 results for author: <span class="mathjax">Yang, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yang, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23461">arXiv:2503.23461</a> <span> [<a href="https://arxiv.org/pdf/2503.23461">pdf</a>, <a href="https://arxiv.org/format/2503.23461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TextCrafter: Accurately Rendering Multiple Texts in Complex Visual Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+N">Nikai Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhennan Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhizhou Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shan Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhengkai Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Tai%2C+Y">Ying Tai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23461v1-abstract-short" style="display: inline;"> This paper explores the task of Complex Visual Text Generation (CVTG), which centers on generating intricate textual content distributed across diverse regions within visual images. In CVTG, image generation models often rendering distorted and blurred visual text or missing some visual text. To tackle these challenges, we propose TextCrafter, a novel multi-visual text rendering method. TextCrafte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23461v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23461v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23461v1-abstract-full" style="display: none;"> This paper explores the task of Complex Visual Text Generation (CVTG), which centers on generating intricate textual content distributed across diverse regions within visual images. In CVTG, image generation models often rendering distorted and blurred visual text or missing some visual text. To tackle these challenges, we propose TextCrafter, a novel multi-visual text rendering method. TextCrafter employs a progressive strategy to decompose complex visual text into distinct components while ensuring robust alignment between textual content and its visual carrier. Additionally, it incorporates a token focus enhancement mechanism to amplify the prominence of visual text during the generation process. TextCrafter effectively addresses key challenges in CVTG tasks, such as text confusion, omissions, and blurriness. Moreover, we present a new benchmark dataset, CVTG-2K, tailored to rigorously evaluate the performance of generative models on CVTG tasks. Extensive experiments demonstrate that our method surpasses state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23461v1-abstract-full').style.display = 'none'; document.getElementById('2503.23461v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23337">arXiv:2503.23337</a> <span> [<a href="https://arxiv.org/pdf/2503.23337">pdf</a>, <a href="https://arxiv.org/format/2503.23337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Enhancing 3D Gaussian Splatting Compression via Spatial Condition-based Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jingui Ma</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yang Hu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Luyang Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiayu Yang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yongqi Zhai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ronggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23337v1-abstract-short" style="display: inline;"> Recently, 3D Gaussian Spatting (3DGS) has gained widespread attention in Novel View Synthesis (NVS) due to the remarkable real-time rendering performance. However, the substantial cost of storage and transmission of vanilla 3DGS hinders its further application (hundreds of megabytes or even gigabytes for a single scene). Motivated by the achievements of prediction in video compression, we introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23337v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23337v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23337v1-abstract-full" style="display: none;"> Recently, 3D Gaussian Spatting (3DGS) has gained widespread attention in Novel View Synthesis (NVS) due to the remarkable real-time rendering performance. However, the substantial cost of storage and transmission of vanilla 3DGS hinders its further application (hundreds of megabytes or even gigabytes for a single scene). Motivated by the achievements of prediction in video compression, we introduce the prediction technique into the anchor-based Gaussian representation to effectively reduce the bit rate. Specifically, we propose a spatial condition-based prediction module to utilize the grid-captured scene information for prediction, with a residual compensation strategy designed to learn the missing fine-grained information. Besides, to further compress the residual, we propose an instance-aware hyper prior, developing a structure-aware and instance-aware entropy model. Extensive experiments demonstrate the effectiveness of our prediction-based compression framework and each technical component. Even compared with SOTA compression method, our framework still achieves a bit rate savings of 24.42 percent. Code is to be released! <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23337v1-abstract-full').style.display = 'none'; document.getElementById('2503.23337v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper has been accepted by ICME2025 in March,2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23108">arXiv:2503.23108</a> <span> [<a href="https://arxiv.org/pdf/2503.23108">pdf</a>, <a href="https://arxiv.org/format/2503.23108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SupertonicTTS: Towards Highly Scalable and Efficient Text-to-Speech System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hyeongju Kim</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinhyeok Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yechan Yu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Seunghun Ji</a>, <a href="/search/cs?searchtype=author&query=Morton%2C+J">Jacob Morton</a>, <a href="/search/cs?searchtype=author&query=Bous%2C+F">Frederik Bous</a>, <a href="/search/cs?searchtype=author&query=Byun%2C+J">Joon Byun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Juheon Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23108v1-abstract-short" style="display: inline;"> We present a novel text-to-speech (TTS) system, namely SupertonicTTS, for improved scalability and efficiency in speech synthesis. SupertonicTTS is comprised of three components: a speech autoencoder for continuous latent representation, a text-to-latent module leveraging flow-matching for text-to-latent mapping, and an utterance-level duration predictor. To enable a lightweight architecture, we e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23108v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23108v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23108v1-abstract-full" style="display: none;"> We present a novel text-to-speech (TTS) system, namely SupertonicTTS, for improved scalability and efficiency in speech synthesis. SupertonicTTS is comprised of three components: a speech autoencoder for continuous latent representation, a text-to-latent module leveraging flow-matching for text-to-latent mapping, and an utterance-level duration predictor. To enable a lightweight architecture, we employ a low-dimensional latent space, temporal compression of latents, and ConvNeXt blocks. We further simplify the TTS pipeline by operating directly on raw character-level text and employing cross-attention for text-speech alignment, thus eliminating the need for grapheme-to-phoneme (G2P) modules and external aligners. In addition, we introduce context-sharing batch expansion that accelerates loss convergence and stabilizes text-speech alignment. Experimental results demonstrate that SupertonicTTS achieves competitive performance while significantly reducing architectural complexity and computational overhead compared to contemporary TTS models. Audio samples demonstrating the capabilities of SupertonicTTS are available at: https://supertonictts.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23108v1-abstract-full').style.display = 'none'; document.getElementById('2503.23108v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23030">arXiv:2503.23030</a> <span> [<a href="https://arxiv.org/pdf/2503.23030">pdf</a>, <a href="https://arxiv.org/format/2503.23030">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Visual and Semantic Prompt Collaboration for Generalized Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Huajie Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengxian Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaohan Yu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongli Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Baocai Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuankai Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23030v1-abstract-short" style="display: inline;"> Generalized zero-shot learning aims to recognize both seen and unseen classes with the help of semantic information that is shared among different classes. It inevitably requires consistent visual-semantic alignment. Existing approaches fine-tune the visual backbone by seen-class data to obtain semantic-related visual features, which may cause overfitting on seen classes with a limited number of t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23030v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23030v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23030v1-abstract-full" style="display: none;"> Generalized zero-shot learning aims to recognize both seen and unseen classes with the help of semantic information that is shared among different classes. It inevitably requires consistent visual-semantic alignment. Existing approaches fine-tune the visual backbone by seen-class data to obtain semantic-related visual features, which may cause overfitting on seen classes with a limited number of training images. This paper proposes a novel visual and semantic prompt collaboration framework, which utilizes prompt tuning techniques for efficient feature adaptation. Specifically, we design a visual prompt to integrate the visual information for discriminative feature learning and a semantic prompt to integrate the semantic formation for visualsemantic alignment. To achieve effective prompt information integration, we further design a weak prompt fusion mechanism for the shallow layers and a strong prompt fusion mechanism for the deep layers in the network. Through the collaboration of visual and semantic prompts, we can obtain discriminative semantic-related features for generalized zero-shot image recognition. Extensive experiments demonstrate that our framework consistently achieves favorable performance in both conventional zero-shot learning and generalized zero-shot learning benchmarks compared to other state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23030v1-abstract-full').style.display = 'none'; document.getElementById('2503.23030v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22851">arXiv:2503.22851</a> <span> [<a href="https://arxiv.org/pdf/2503.22851">pdf</a>, <a href="https://arxiv.org/format/2503.22851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RobuNFR: Evaluating the Robustness of Large Language Models on Non-Functional Requirements Aware Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+F">Feng Lin</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D+J">Dong Jae Kim</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenhao Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinqiu Yang</a>, <a href="/search/cs?searchtype=author&query=Tse-Husn"> Tse-Husn</a>, <a href="/search/cs?searchtype=author&query=Chen"> Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22851v1-abstract-short" style="display: inline;"> When using LLMs to address Non-Functional Requirements (NFRs), developers may behave differently (e.g., expressing the same NFR in different words). Robust LLMs should output consistent results across these variations; however, this aspect remains underexplored. We propose RobuNFR for evaluating the robustness of LLMs in NFR-aware code generation across four NFR dimensions: design, readability, re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22851v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22851v1-abstract-full" style="display: none;"> When using LLMs to address Non-Functional Requirements (NFRs), developers may behave differently (e.g., expressing the same NFR in different words). Robust LLMs should output consistent results across these variations; however, this aspect remains underexplored. We propose RobuNFR for evaluating the robustness of LLMs in NFR-aware code generation across four NFR dimensions: design, readability, reliability, and performance, using three methodologies: prompt variation, regression testing, and diverse workflows. Our experiments show that RobuNFR reveals robustness issues in the tested LLMs when considering NFRs in code generation. Specifically, under prompt variation, including NFRs leads to a decrease in Pass@1 by up to 39 percent and an increase in the standard deviation from 0.48 to 2.48 compared to the baseline without NFRs (i.e., Function-Only). While incorporating NFRs generally improves overall NFR metrics, it also results in higher prompt sensitivity. In regression settings, some LLMs exhibit differences across versions, with improvements in one aspect (e.g., reduced code smells) often accompanied by regressions in another (e.g., decreased correctness), revealing inconsistencies that challenge their robustness. When varying workflows, the tested LLMs show significantly different NFR-aware code generation capabilities between two workflows: (1) integrating NFRs and functional requirements into the initial prompt and (2) enhancing Function-Only-generated code with the same NFR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22851v1-abstract-full').style.display = 'none'; document.getElementById('2503.22851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22718">arXiv:2503.22718</a> <span> [<a href="https://arxiv.org/pdf/2503.22718">pdf</a>, <a href="https://arxiv.org/format/2503.22718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> LLM-ABM for Transportation: Assessing the Potential of LLM Agents in System Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jirong Yang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yafeng Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22718v1-abstract-short" style="display: inline;"> Agent-based modeling approaches represent the state-of-art in modeling travel demand and transportation system dynamics and are valuable tools for transportation planning. However, established agent-based approaches in transportation rely on multi-hierarchical mathematical models to simulate travel behavior, which faces theoretical and practical limitations. The advent of large language models (LL… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22718v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22718v1-abstract-full" style="display: none;"> Agent-based modeling approaches represent the state-of-art in modeling travel demand and transportation system dynamics and are valuable tools for transportation planning. However, established agent-based approaches in transportation rely on multi-hierarchical mathematical models to simulate travel behavior, which faces theoretical and practical limitations. The advent of large language models (LLM) provides a new opportunity to refine agent-based modeling in transportation. LLM agents, which have impressive reasoning and planning abilities, can serve as a proxy of human travelers and be integrated into the modeling framework. However, despite evidence of their behavioral soundness, no existing studies have assessed the impact and validity of LLM-agent-based simulations from a system perspective in transportation. This paper aims to address this issue by designing and integrating LLM agents with human-traveler-like characteristics into a simulation of a transportation system and assessing its performance based on existing benchmarks. Using the classical transportation setting of the morning commute, we find that not only do the agents exhibit fine behavioral soundness, but also produce system dynamics that align well with standard benchmarks. Our analysis first verifies the effectiveness and potential of LLM-agent-based modeling for transportation planning on the system level. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22718v1-abstract-full').style.display = 'none'; document.getElementById('2503.22718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by The 1st Workshop on AI for Urban Planning at AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22225">arXiv:2503.22225</a> <span> [<a href="https://arxiv.org/pdf/2503.22225">pdf</a>, <a href="https://arxiv.org/format/2503.22225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Follow Your Motion: A Generic Temporal Consistency Portrait Editing Framework with Trajectory Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haijie Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jianjun Qian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22225v1-abstract-short" style="display: inline;"> Pre-trained conditional diffusion models have demonstrated remarkable potential in image editing. However, they often face challenges with temporal consistency, particularly in the talking head domain, where continuous changes in facial expressions intensify the level of difficulty. These issues stem from the independent editing of individual images and the inherent loss of temporal continuity dur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22225v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22225v1-abstract-full" style="display: none;"> Pre-trained conditional diffusion models have demonstrated remarkable potential in image editing. However, they often face challenges with temporal consistency, particularly in the talking head domain, where continuous changes in facial expressions intensify the level of difficulty. These issues stem from the independent editing of individual images and the inherent loss of temporal continuity during the editing process. In this paper, we introduce Follow Your Motion (FYM), a generic framework for maintaining temporal consistency in portrait editing. Specifically, given portrait images rendered by a pre-trained 3D Gaussian Splatting model, we first develop a diffusion model that intuitively and inherently learns motion trajectory changes at different scales and pixel coordinates, from the first frame to each subsequent frame. This approach ensures that temporally inconsistent edited avatars inherit the motion information from the rendered avatars. Secondly, to maintain fine-grained expression temporal consistency in talking head editing, we propose a dynamic re-weighted attention mechanism. This mechanism assigns higher weight coefficients to landmark points in space and dynamically updates these weights based on landmark loss, achieving more consistent and refined facial expressions. Extensive experiments demonstrate that our method outperforms existing approaches in terms of temporal consistency and can be used to optimize and compensate for temporally inconsistent outputs in a range of applications, such as text-driven editing, relighting, and various other applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22225v1-abstract-full').style.display = 'none'; document.getElementById('2503.22225v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://anonymous-hub1127.github.io/FYM.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21460">arXiv:2503.21460</a> <span> [<a href="https://arxiv.org/pdf/2503.21460">pdf</a>, <a href="https://arxiv.org/format/2503.21460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Agent: A Survey on Methodology, Applications and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yusheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junwei Yang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yiyang Gu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bohan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Binqi Chen</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Z">Ziyue Qiao</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Q">Qingqing Long</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+R">Rongcheng Tu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Meng Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenwu Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shichang Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yiqiao Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xian Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanqing Zhao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21460v1-abstract-short" style="display: inline;"> The era of intelligent agents is upon us, driven by revolutionary advancements in large language models. Large Language Model (LLM) agents, with goal-driven behaviors and dynamic adaptation capabilities, potentially represent a critical pathway toward artificial general intelligence. This survey systematically deconstructs LLM agent systems through a methodology-centered taxonomy, linking architec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21460v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21460v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21460v1-abstract-full" style="display: none;"> The era of intelligent agents is upon us, driven by revolutionary advancements in large language models. Large Language Model (LLM) agents, with goal-driven behaviors and dynamic adaptation capabilities, potentially represent a critical pathway toward artificial general intelligence. This survey systematically deconstructs LLM agent systems through a methodology-centered taxonomy, linking architectural foundations, collaboration mechanisms, and evolutionary pathways. We unify fragmented research threads by revealing fundamental connections between agent design principles and their emergent behaviors in complex environments. Our work provides a unified architectural perspective, examining how agents are constructed, how they collaborate, and how they evolve over time, while also addressing evaluation methodologies, tool applications, practical challenges, and diverse application domains. By surveying the latest developments in this rapidly evolving field, we offer researchers a structured taxonomy for understanding LLM agents and identify promising directions for future research. The collection is available at https://github.com/luo-junyu/Awesome-Agent-Papers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21460v1-abstract-full').style.display = 'none'; document.getElementById('2503.21460v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">329 papers surveyed, resources are at https://github.com/luo-junyu/Awesome-Agent-Papers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21099">arXiv:2503.21099</a> <span> [<a href="https://arxiv.org/pdf/2503.21099">pdf</a>, <a href="https://arxiv.org/format/2503.21099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Class Prototypes for Unified Sparse Supervised 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yun Zhu</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+L">Le Hui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hang Yang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jianjun Qian</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jin Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21099v1-abstract-short" style="display: inline;"> Both indoor and outdoor scene perceptions are essential for embodied intelligence. However, current sparse supervised 3D object detection methods focus solely on outdoor scenes without considering indoor settings. To this end, we propose a unified sparse supervised 3D object detection method for both indoor and outdoor scenes through learning class prototypes to effectively utilize unlabeled objec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21099v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21099v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21099v1-abstract-full" style="display: none;"> Both indoor and outdoor scene perceptions are essential for embodied intelligence. However, current sparse supervised 3D object detection methods focus solely on outdoor scenes without considering indoor settings. To this end, we propose a unified sparse supervised 3D object detection method for both indoor and outdoor scenes through learning class prototypes to effectively utilize unlabeled objects. Specifically, we first propose a prototype-based object mining module that converts the unlabeled object mining into a matching problem between class prototypes and unlabeled features. By using optimal transport matching results, we assign prototype labels to high-confidence features, thereby achieving the mining of unlabeled objects. We then present a multi-label cooperative refinement module to effectively recover missed detections through pseudo label quality control and prototype label cooperation. Experiments show that our method achieves state-of-the-art performance under the one object per scene sparse supervised setting across indoor and outdoor datasets. With only one labeled object per scene, our method achieves about 78%, 90%, and 96% performance compared to the fully supervised detector on ScanNet V2, SUN RGB-D, and KITTI, respectively, highlighting the scalability of our method. Code is available at https://github.com/zyrant/CPDet3D. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21099v1-abstract-full').style.display = 'none'; document.getElementById('2503.21099v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21072">arXiv:2503.21072</a> <span> [<a href="https://arxiv.org/pdf/2503.21072">pdf</a>, <a href="https://arxiv.org/format/2503.21072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HSLiNets: Evaluating Band Ordering Strategies in Hyperspectral and LiDAR Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J+X">Judy X Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&query=Zhuanfeng"> Zhuanfeng</a>, <a href="/search/cs?searchtype=author&query=Li"> Li</a>, <a href="/search/cs?searchtype=author&query=Long%2C+C+S+Z">Chenhong Sui Zekun Long</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21072v1-abstract-short" style="display: inline;"> The integration of hyperspectral imaging (HSI) and Light Detection and Ranging (LiDAR) data provides complementary spectral and spatial information for remote sensing applications. While previous studies have explored the role of band selection and grouping in HSI classification, little attention has been given to how the spectral sequence or band order affects classification outcomes when fused w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21072v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21072v1-abstract-full" style="display: none;"> The integration of hyperspectral imaging (HSI) and Light Detection and Ranging (LiDAR) data provides complementary spectral and spatial information for remote sensing applications. While previous studies have explored the role of band selection and grouping in HSI classification, little attention has been given to how the spectral sequence or band order affects classification outcomes when fused with LiDAR. In this work, we systematically investigate the influence of band order on HSI-LiDAR fusion performance. Through extensive experiments, we demonstrate that band order significantly impacts classification accuracy, revealing a previously overlooked factor in fusion-based models. Motivated by this observation, we propose a novel fusion architecture that not only integrates HSI and LiDAR data but also learns from multiple band order configurations. The proposed method enhances feature representation by adaptively fusing different spectral sequences, leading to improved classification accuracy. Experimental results on the Houston 2013 and Trento datasets show that our approach outperforms state-of-the-art fusion models. Data and code are available at https://github.com/Judyxyang/HSLiNets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21072v1-abstract-full').style.display = 'none'; document.getElementById('2503.21072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 figures, 5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20824">arXiv:2503.20824</a> <span> [<a href="https://arxiv.org/pdf/2503.20824">pdf</a>, <a href="https://arxiv.org/format/2503.20824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Temporal State Space Sharing for Video Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hesham%2C+S+A+S">Syed Ariff Syed Hesham</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yun Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guolei Sun</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Henghui Ding</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Konukoglu%2C+E">Ender Konukoglu</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xue Geng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xudong Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20824v1-abstract-short" style="display: inline;"> Video semantic segmentation (VSS) plays a vital role in understanding the temporal evolution of scenes. Traditional methods often segment videos frame-by-frame or in a short temporal window, leading to limited temporal context, redundant computations, and heavy memory requirements. To this end, we introduce a Temporal Video State Space Sharing (TV3S) architecture to leverage Mamba state space mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20824v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20824v1-abstract-full" style="display: none;"> Video semantic segmentation (VSS) plays a vital role in understanding the temporal evolution of scenes. Traditional methods often segment videos frame-by-frame or in a short temporal window, leading to limited temporal context, redundant computations, and heavy memory requirements. To this end, we introduce a Temporal Video State Space Sharing (TV3S) architecture to leverage Mamba state space models for temporal feature sharing. Our model features a selective gating mechanism that efficiently propagates relevant information across video frames, eliminating the need for a memory-heavy feature pool. By processing spatial patches independently and incorporating shifted operation, TV3S supports highly parallel computation in both training and inference stages, which reduces the delay in sequential state space processing and improves the scalability for long video sequences. Moreover, TV3S incorporates information from prior frames during inference, achieving long-range temporal coherence and superior adaptability to extended sequences. Evaluations on the VSPW and Cityscapes datasets reveal that our approach outperforms current state-of-the-art methods, establishing a new standard for VSS with consistent results across long video sequences. By achieving a good balance between accuracy and efficiency, TV3S shows a significant advancement in spatiotemporal modeling, paving the way for efficient video analysis. The code is publicly available at https://github.com/Ashesham/TV3S.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20824v1-abstract-full').style.display = 'none'; document.getElementById('2503.20824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/CVF Conference on Computer Vision and Pattern Recognition 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20314">arXiv:2503.20314</a> <span> [<a href="https://arxiv.org/pdf/2503.20314">pdf</a>, <a href="https://arxiv.org/format/2503.20314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Wan: Open and Advanced Large-Scale Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=WanTeam"> WanTeam</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Baole Ai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bin Wen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chen-Wei Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Di Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feiwu Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiming Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jianyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kang Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Keyu Yan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mengyang Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pandeng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pingyu Wu</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20314v1-abstract-short" style="display: inline;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20314v1-abstract-full" style="display: none;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at https://github.com/Wan-Video/Wan2.1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'none'; document.getElementById('2503.20314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, 33 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20174">arXiv:2503.20174</a> <span> [<a href="https://arxiv.org/pdf/2503.20174">pdf</a>, <a href="https://arxiv.org/format/2503.20174">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Devil is in the Uniformity: Exploring Diverse Learners within Transformer for Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shihao Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dayu Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jinshan Pan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Juncheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jinglei Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jufeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20174v1-abstract-short" style="display: inline;"> Transformer-based approaches have gained significant attention in image restoration, where the core component, i.e, Multi-Head Attention (MHA), plays a crucial role in capturing diverse features and recovering high-quality results. In MHA, heads perform attention calculation independently from uniform split subspaces, and a redundancy issue is triggered to hinder the model from achieving satisfact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20174v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20174v1-abstract-full" style="display: none;"> Transformer-based approaches have gained significant attention in image restoration, where the core component, i.e, Multi-Head Attention (MHA), plays a crucial role in capturing diverse features and recovering high-quality results. In MHA, heads perform attention calculation independently from uniform split subspaces, and a redundancy issue is triggered to hinder the model from achieving satisfactory outputs. In this paper, we propose to improve MHA by exploring diverse learners and introducing various interactions between heads, which results in a Hierarchical multI-head atteNtion driven Transformer model, termed HINT, for image restoration. HINT contains two modules, i.e., the Hierarchical Multi-Head Attention (HMHA) and the Query-Key Cache Updating (QKCU) module, to address the redundancy problem that is rooted in vanilla MHA. Specifically, HMHA extracts diverse contextual features by employing heads to learn from subspaces of varying sizes and containing different information. Moreover, QKCU, comprising intra- and inter-layer schemes, further reduces the redundancy problem by facilitating enhanced interactions between attention heads within and across layers. Extensive experiments are conducted on 12 benchmarks across 5 image restoration tasks, including low-light enhancement, dehazing, desnowing, denoising, and deraining, to demonstrate the superiority of HINT. The source code is available in the supplementary materials. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20174v1-abstract-full').style.display = 'none'; document.getElementById('2503.20174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19910">arXiv:2503.19910</a> <span> [<a href="https://arxiv.org/pdf/2503.19910">pdf</a>, <a href="https://arxiv.org/format/2503.19910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> CoLLM: A Large Language Model for Composed Image Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huynh%2C+C">Chuong Huynh</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinyu Yang</a>, <a href="/search/cs?searchtype=author&query=Tawari%2C+A">Ashish Tawari</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+M">Mubarak Shah</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+S">Son Tran</a>, <a href="/search/cs?searchtype=author&query=Hamid%2C+R">Raffay Hamid</a>, <a href="/search/cs?searchtype=author&query=Chilimbi%2C+T">Trishul Chilimbi</a>, <a href="/search/cs?searchtype=author&query=Shrivastava%2C+A">Abhinav Shrivastava</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19910v1-abstract-short" style="display: inline;"> Composed Image Retrieval (CIR) is a complex task that aims to retrieve images based on a multimodal query. Typical training data consists of triplets containing a reference image, a textual description of desired modifications, and the target image, which are expensive and time-consuming to acquire. The scarcity of CIR datasets has led to zero-shot approaches utilizing synthetic triplets or levera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19910v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19910v1-abstract-full" style="display: none;"> Composed Image Retrieval (CIR) is a complex task that aims to retrieve images based on a multimodal query. Typical training data consists of triplets containing a reference image, a textual description of desired modifications, and the target image, which are expensive and time-consuming to acquire. The scarcity of CIR datasets has led to zero-shot approaches utilizing synthetic triplets or leveraging vision-language models (VLMs) with ubiquitous web-crawled image-caption pairs. However, these methods have significant limitations: synthetic triplets suffer from limited scale, lack of diversity, and unnatural modification text, while image-caption pairs hinder joint embedding learning of the multimodal query due to the absence of triplet data. Moreover, existing approaches struggle with complex and nuanced modification texts that demand sophisticated fusion and understanding of vision and language modalities. We present CoLLM, a one-stop framework that effectively addresses these limitations. Our approach generates triplets on-the-fly from image-caption pairs, enabling supervised training without manual annotation. We leverage Large Language Models (LLMs) to generate joint embeddings of reference images and modification texts, facilitating deeper multimodal fusion. Additionally, we introduce Multi-Text CIR (MTCIR), a large-scale dataset comprising 3.4M samples, and refine existing CIR benchmarks (CIRR and Fashion-IQ) to enhance evaluation reliability. Experimental results demonstrate that CoLLM achieves state-of-the-art performance across multiple CIR benchmarks and settings. MTCIR yields competitive results, with up to 15% performance improvement. Our refined benchmarks provide more reliable evaluation metrics for CIR models, contributing to the advancement of this important field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19910v1-abstract-full').style.display = 'none'; document.getElementById('2503.19910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025. Project page: https://collm-cvpr25.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19782">arXiv:2503.19782</a> <span> [<a href="https://arxiv.org/pdf/2503.19782">pdf</a>, <a href="https://arxiv.org/format/2503.19782">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> A comparative study of calibration techniques for finite strain elastoplasticity: Numerically-exact sensitivities for FEMU and VFM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumar%2C+S">Sanjeev Kumar</a>, <a href="/search/cs?searchtype=author&query=Seidl%2C+D+T">D. Thomas Seidl</a>, <a href="/search/cs?searchtype=author&query=Granzow%2C+B+N">Brian N. Granzow</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jin Yang</a>, <a href="/search/cs?searchtype=author&query=Fuhg%2C+J+N">Jan N. Fuhg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19782v1-abstract-short" style="display: inline;"> Accurate identification of material parameters is crucial for predictive modeling in computational mechanics. The two primary approaches in the experimental mechanics' community for calibration from full-field digital image correlation data are known as finite element model updating (FEMU) and the virtual fields method (VFM). In VFM, the objective function is a squared mismatch between internal an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19782v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19782v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19782v1-abstract-full" style="display: none;"> Accurate identification of material parameters is crucial for predictive modeling in computational mechanics. The two primary approaches in the experimental mechanics' community for calibration from full-field digital image correlation data are known as finite element model updating (FEMU) and the virtual fields method (VFM). In VFM, the objective function is a squared mismatch between internal and external virtual work or power. In FEMU, the objective function quantifies the weighted mismatch between model predictions and corresponding experimentally measured quantities of interest. It is minimized by iteratively updating the parameters of an FE model. While FEMU is seen as more flexible, VFM is commonly used instead of FEMU due to its considerably greater computational expense. However, comparisons between the two methods usually involve approximations of gradients or sensitivities with finite difference schemes, thereby making direct assessments difficult. Hence, in this study, we rigorously compare VFM and FEMU in the context of numerically-exact sensitivities obtained through local sensitivity analyses and the application of automatic differentiation software. To this end, both methods are tested on a finite strain elastoplasticity model. We conduct a series of test cases to assess both methods' robustness under practical challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19782v1-abstract-full').style.display = 'none'; document.getElementById('2503.19782v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44 pages, 15 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 74C15 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19656">arXiv:2503.19656</a> <span> [<a href="https://arxiv.org/pdf/2503.19656">pdf</a>, <a href="https://arxiv.org/format/2503.19656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Reliable Time Series Forecasting under Future Uncertainty: Ambiguity and Novelty Rejection Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+N">Ninghui Feng</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+S">Songning Lai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiayu Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+K">Kunlong Feng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenxiao Yin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fobao Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhangyi Hu</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yutao Yue</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19656v1-abstract-short" style="display: inline;"> In real-world time series forecasting, uncertainty and lack of reliable evaluation pose significant challenges. Notably, forecasting errors often arise from underfitting in-distribution data and failing to handle out-of-distribution inputs. To enhance model reliability, we introduce a dual rejection mechanism combining ambiguity and novelty rejection. Ambiguity rejection, using prediction error va… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19656v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19656v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19656v1-abstract-full" style="display: none;"> In real-world time series forecasting, uncertainty and lack of reliable evaluation pose significant challenges. Notably, forecasting errors often arise from underfitting in-distribution data and failing to handle out-of-distribution inputs. To enhance model reliability, we introduce a dual rejection mechanism combining ambiguity and novelty rejection. Ambiguity rejection, using prediction error variance, allows the model to abstain under low confidence, assessed through historical error variance analysis without future ground truth. Novelty rejection, employing Variational Autoencoders and Mahalanobis distance, detects deviations from training data. This dual approach improves forecasting reliability in dynamic environments by reducing errors and adapting to data changes, advancing reliability in complex scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19656v1-abstract-full').style.display = 'none'; document.getElementById('2503.19656v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19625">arXiv:2503.19625</a> <span> [<a href="https://arxiv.org/pdf/2503.19625">pdf</a>, <a href="https://arxiv.org/format/2503.19625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DynOPETs: A Versatile Benchmark for Dynamic Object Pose Estimation and Tracking in Moving Camera Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangting Meng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaqi Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingshu Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chenxin Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yujiao Shi</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Wenchao Ding</a>, <a href="/search/cs?searchtype=author&query=Kneip%2C+L">Laurent Kneip</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19625v1-abstract-short" style="display: inline;"> In the realm of object pose estimation, scenarios involving both dynamic objects and moving cameras are prevalent. However, the scarcity of corresponding real-world datasets significantly hinders the development and evaluation of robust pose estimation models. This is largely attributed to the inherent challenges in accurately annotating object poses in dynamic scenes captured by moving cameras. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19625v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19625v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19625v1-abstract-full" style="display: none;"> In the realm of object pose estimation, scenarios involving both dynamic objects and moving cameras are prevalent. However, the scarcity of corresponding real-world datasets significantly hinders the development and evaluation of robust pose estimation models. This is largely attributed to the inherent challenges in accurately annotating object poses in dynamic scenes captured by moving cameras. To bridge this gap, this paper presents a novel dataset DynOPETs and a dedicated data acquisition and annotation pipeline tailored for object pose estimation and tracking in such unconstrained environments. Our efficient annotation method innovatively integrates pose estimation and pose tracking techniques to generate pseudo-labels, which are subsequently refined through pose graph optimization. The resulting dataset offers accurate pose annotations for dynamic objects observed from moving cameras. To validate the effectiveness and value of our dataset, we perform comprehensive evaluations using 18 state-of-the-art methods, demonstrating its potential to accelerate research in this challenging domain. The dataset will be made publicly available to facilitate further exploration and advancement in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19625v1-abstract-full').style.display = 'none'; document.getElementById('2503.19625v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19498">arXiv:2503.19498</a> <span> [<a href="https://arxiv.org/pdf/2503.19498">pdf</a>, <a href="https://arxiv.org/format/2503.19498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DomainCQA: Crafting Expert-Level QA from Domain-Specific Charts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+L">Ling Zhong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yujing Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiming Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+P">Peng Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongheng Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+M">Manni Duan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19498v2-abstract-short" style="display: inline;"> Chart Question Answering (CQA) benchmarks are essential for evaluating the capability of Multimodal Large Language Models (MLLMs) to interpret visual data. However, current benchmarks focus primarily on the evaluation of general-purpose CQA but fail to adequately capture domain-specific challenges. We introduce DomainCQA, a systematic methodology for constructing domain-specific CQA benchmarks, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19498v2-abstract-full').style.display = 'inline'; document.getElementById('2503.19498v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19498v2-abstract-full" style="display: none;"> Chart Question Answering (CQA) benchmarks are essential for evaluating the capability of Multimodal Large Language Models (MLLMs) to interpret visual data. However, current benchmarks focus primarily on the evaluation of general-purpose CQA but fail to adequately capture domain-specific challenges. We introduce DomainCQA, a systematic methodology for constructing domain-specific CQA benchmarks, and demonstrate its effectiveness by developing AstroChart, a CQA benchmark in the field of astronomy. Our evaluation shows that chart reasoning and combining chart information with domain knowledge for deeper analysis and summarization, rather than domain-specific knowledge, pose the primary challenge for existing MLLMs, highlighting a critical gap in current benchmarks. By providing a scalable and rigorous framework, DomainCQA enables more precise assessment and improvement of MLLMs for domain-specific applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19498v2-abstract-full').style.display = 'none'; document.getElementById('2503.19498v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">87 pages, 65 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19271">arXiv:2503.19271</a> <span> [<a href="https://arxiv.org/pdf/2503.19271">pdf</a>, <a href="https://arxiv.org/format/2503.19271">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MARS: Memory-Enhanced Agents with Reflective Self-improvement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yijin Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingsong Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuantao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xueqian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19271v1-abstract-short" style="display: inline;"> Large language models (LLMs) have made significant advances in the field of natural language processing, but they still face challenges such as continuous decision-making, lack of long-term memory, and limited context windows in dynamic environments. To address these issues, this paper proposes an innovative framework Memory-Enhanced Agents with Reflective Self-improvement. The MARS framework comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19271v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19271v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19271v1-abstract-full" style="display: none;"> Large language models (LLMs) have made significant advances in the field of natural language processing, but they still face challenges such as continuous decision-making, lack of long-term memory, and limited context windows in dynamic environments. To address these issues, this paper proposes an innovative framework Memory-Enhanced Agents with Reflective Self-improvement. The MARS framework comprises three agents: the User, the Assistant, and the Checker. By integrating iterative feedback, reflective mechanisms, and a memory optimization mechanism based on the Ebbinghaus forgetting curve, it significantly enhances the agents capabilities in handling multi-tasking and long-span information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19271v1-abstract-full').style.display = 'none'; document.getElementById('2503.19271v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19201">arXiv:2503.19201</a> <span> [<a href="https://arxiv.org/pdf/2503.19201">pdf</a>, <a href="https://arxiv.org/format/2503.19201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Shared Low-Rank Adaptation Approach to Personalized RLHF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Renpu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Donghao Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19201v1-abstract-short" style="display: inline;"> Reinforcement Learning from Human Feedback (RLHF) has emerged as a pivotal technique for aligning artificial intelligence systems with human values, achieving remarkable success in fine-tuning large language models. However, existing RLHF frameworks often assume that human preferences are relatively homogeneous and can be captured by a single, unified reward model. This assumption overlooks the in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19201v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19201v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19201v1-abstract-full" style="display: none;"> Reinforcement Learning from Human Feedback (RLHF) has emerged as a pivotal technique for aligning artificial intelligence systems with human values, achieving remarkable success in fine-tuning large language models. However, existing RLHF frameworks often assume that human preferences are relatively homogeneous and can be captured by a single, unified reward model. This assumption overlooks the inherent diversity and heterogeneity across individuals, limiting the adaptability of RLHF to personalized scenarios and risking misalignments that can diminish user satisfaction and trust in AI systems. In this paper, we address these challenges by introducing Low-Rank Adaptation (LoRA) into the personalized RLHF framework. We apply LoRA in the the aggregated parameter space of all personalized reward functions, thereby enabling efficient learning of personalized reward models from potentially limited local datasets. Our approach exploits potential shared structures among the local ground-truth reward models while allowing for individual adaptation, without relying on restrictive assumptions about shared representations as in prior works. We further establish sample complexity guarantees for our method. Theoretical analysis demonstrates the effectiveness of the proposed approach in capturing both shared and individual-specific structures within heterogeneous human preferences, addressing the dual challenge of personalization requirements and practical data constraints. Experimental results on real-world datasets corroborate the efficiency of our algorithm in the personalized RLHF setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19201v1-abstract-full').style.display = 'none'; document.getElementById('2503.19201v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at AISTATS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18626">arXiv:2503.18626</a> <span> [<a href="https://arxiv.org/pdf/2503.18626">pdf</a>, <a href="https://arxiv.org/format/2503.18626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Dataset Distillation using Min-Max Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+J">Junqiao Fan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yunjiao Zhou</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M+C+J">Min Chang Jordan Ren</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianfei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18626v1-abstract-short" style="display: inline;"> In this paper, we address the problem of generative dataset distillation that utilizes generative models to synthesize images. The generator may produce any number of images under a preserved evaluation time. In this work, we leverage the popular diffusion model as the generator to compute a surrogate dataset, boosted by a min-max loss to control the dataset's diversity and representativeness duri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18626v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18626v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18626v1-abstract-full" style="display: none;"> In this paper, we address the problem of generative dataset distillation that utilizes generative models to synthesize images. The generator may produce any number of images under a preserved evaluation time. In this work, we leverage the popular diffusion model as the generator to compute a surrogate dataset, boosted by a min-max loss to control the dataset's diversity and representativeness during training. However, the diffusion model is time-consuming when generating images, as it requires an iterative generation process. We observe a critical trade-off between the number of image samples and the image quality controlled by the diffusion steps and propose Diffusion Step Reduction to achieve optimal performance. This paper details our comprehensive method and its performance. Our model achieved $2^{nd}$ place in the generative track of \href{https://www.dd-challenge.com/#/}{The First Dataset Distillation Challenge of ECCV2024}, demonstrating its superior performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18626v1-abstract-full').style.display = 'none'; document.getElementById('2503.18626v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is accepted as the ECCV2024 workshop paper and achieved second place in the generative track of The First Dataset Distillation Challenge of ECCV2024, https://www.dd-challenge.com/#/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18386">arXiv:2503.18386</a> <span> [<a href="https://arxiv.org/pdf/2503.18386">pdf</a>, <a href="https://arxiv.org/format/2503.18386">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Resource-Efficient Motion Control for Video Generation via Dynamic Mask Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+S">Sicong Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jielong Yang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Li Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18386v1-abstract-short" style="display: inline;"> Recent advances in diffusion models bring new vitality to visual content creation. However, current text-to-video generation models still face significant challenges such as high training costs, substantial data requirements, and difficulties in maintaining consistency between given text and motion of the foreground object. To address these challenges, we propose mask-guided video generation, whic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18386v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18386v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18386v1-abstract-full" style="display: none;"> Recent advances in diffusion models bring new vitality to visual content creation. However, current text-to-video generation models still face significant challenges such as high training costs, substantial data requirements, and difficulties in maintaining consistency between given text and motion of the foreground object. To address these challenges, we propose mask-guided video generation, which can control video generation through mask motion sequences, while requiring limited training data. Our model enhances existing architectures by incorporating foreground masks for precise text-position matching and motion trajectory control. Through mask motion sequences, we guide the video generation process to maintain consistent foreground objects throughout the sequence. Additionally, through a first-frame sharing strategy and autoregressive extension approach, we achieve more stable and longer video generation. Extensive qualitative and quantitative experiments demonstrate that this approach excels in various video generation tasks, such as video editing and generating artistic videos, outperforming previous methods in terms of consistency and quality. Our generated results can be viewed in the supplementary materials. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18386v1-abstract-full').style.display = 'none'; document.getElementById('2503.18386v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17793">arXiv:2503.17793</a> <span> [<a href="https://arxiv.org/pdf/2503.17793">pdf</a>, <a href="https://arxiv.org/format/2503.17793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Every Sample Matters: Leveraging Mixture-of-Experts and High-Quality Data for Efficient and Accurate Code LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Codefuse"> Codefuse</a>, <a href="/search/cs?searchtype=author&query=Team%2C+L">Ling Team</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+W">Wenting Cai</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuchen Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siba Chen</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Q">Qing Cui</a>, <a href="/search/cs?searchtype=author&query=Di%2C+P">Peng Di</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junpeng Fang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zi Gong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+T">Ting Guo</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhengyu He</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yang Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cong Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianguo Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+S">Shijie Lian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">BingChang Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Songshan Luo</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+S">Shuo Mao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+M">Min Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jian Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaolong Yang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17793v1-abstract-short" style="display: inline;"> Recent advancements in code large language models (LLMs) have demonstrated remarkable capabilities in code generation and understanding. It is still challenging to build a code LLM with comprehensive performance yet ultimate efficiency. Many attempts have been released in the open source community to break the trade-off between performance and efficiency, such as the Qwen Coder series and the Deep… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17793v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17793v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17793v1-abstract-full" style="display: none;"> Recent advancements in code large language models (LLMs) have demonstrated remarkable capabilities in code generation and understanding. It is still challenging to build a code LLM with comprehensive performance yet ultimate efficiency. Many attempts have been released in the open source community to break the trade-off between performance and efficiency, such as the Qwen Coder series and the DeepSeek Coder series. This paper introduces yet another attempt in this area, namely Ling-Coder-Lite. We leverage the efficient Mixture-of-Experts (MoE) architecture along with a set of high-quality data curation methods (especially those based on program analytics) to build an efficient yet powerful code LLM. Ling-Coder-Lite exhibits on-par performance on 12 representative coding benchmarks compared to state-of-the-art models of similar size, such as Qwen2.5-Coder-7B and DeepSeek-Coder-V2-Lite, while offering competitive latency and throughput. In practice, we achieve a 50\% reduction in deployment resources compared to the similar-sized dense model without performance loss. To facilitate further research and development in this area, we open-source our models as well as a substantial portion of high-quality data for the annealing and post-training stages. The models and data can be accessed at~\url{https://huggingface.co/inclusionAI/Ling-Coder-lite}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17793v1-abstract-full').style.display = 'none'; document.getElementById('2503.17793v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17407">arXiv:2503.17407</a> <span> [<a href="https://arxiv.org/pdf/2503.17407">pdf</a>, <a href="https://arxiv.org/format/2503.17407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey on Long Context Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dawei Zhu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zhiqi Bai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+H">Huanxuan Liao</a>, <a href="/search/cs?searchtype=author&query=Que%2C+H">Haoran Que</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiebin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Yong Shan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiayi Tian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenhao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhejian Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Ruijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Junlan Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shizhu He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhoujun Li</a> , et al. (12 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17407v1-abstract-short" style="display: inline;"> Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing. With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) that can process and analyze extensive inputs in an effective and efficient way. In this paper, we present a comprehensive survey on recent advances in long-c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17407v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17407v1-abstract-full" style="display: none;"> Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing. With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) that can process and analyze extensive inputs in an effective and efficient way. In this paper, we present a comprehensive survey on recent advances in long-context modeling for large language models. Our survey is structured around three key aspects: how to obtain effective and efficient LCLMs, how to train and deploy LCLMs efficiently, and how to evaluate and analyze LCLMs comprehensively. For the first aspect, we discuss data strategies, architectural designs, and workflow approaches oriented with long context processing. For the second aspect, we provide a detailed examination of the infrastructure required for LCLM training and inference. For the third aspect, we present evaluation paradigms for long-context comprehension and long-form generation, as well as behavioral analysis and mechanism interpretability of LCLMs. Beyond these three key aspects, we thoroughly explore the diverse application scenarios where existing LCLMs have been deployed and outline promising future development directions. This survey provides an up-to-date review of the literature on long-context LLMs, which we wish to serve as a valuable resource for both researchers and engineers. An associated GitHub repository collecting the latest papers and repos is available at: \href{https://github.com/LCLM-Horizon/A-Comprehensive-Survey-For-Long-Context-Language-Modeling}{\color[RGB]{175,36,67}{LCLM-Horizon}}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17407v1-abstract-full').style.display = 'none'; document.getElementById('2503.17407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17027">arXiv:2503.17027</a> <span> [<a href="https://arxiv.org/pdf/2503.17027">pdf</a>, <a href="https://arxiv.org/format/2503.17027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RAW-Adapter: Adapting Pre-trained Visual Model to Camera RAW Images and A Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Ziteng Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianfei Yang</a>, <a href="/search/cs?searchtype=author&query=Harada%2C+T">Tatsuya Harada</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17027v1-abstract-short" style="display: inline;"> In the computer vision community, the preference for pre-training visual models has largely shifted toward sRGB images due to their ease of acquisition and compact storage. However, camera RAW images preserve abundant physical details across diverse real-world scenarios. Despite this, most existing visual perception methods that utilize RAW data directly integrate image signal processing (ISP) sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17027v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17027v1-abstract-full" style="display: none;"> In the computer vision community, the preference for pre-training visual models has largely shifted toward sRGB images due to their ease of acquisition and compact storage. However, camera RAW images preserve abundant physical details across diverse real-world scenarios. Despite this, most existing visual perception methods that utilize RAW data directly integrate image signal processing (ISP) stages with subsequent network modules, often overlooking potential synergies at the model level. Building on recent advances in adapter-based methodologies in both NLP and computer vision, we propose RAW-Adapter, a novel framework that incorporates learnable ISP modules as input-level adapters to adjust RAW inputs. At the same time, it employs model-level adapters to seamlessly bridge ISP processing with high-level downstream architectures. Moreover, RAW-Adapter serves as a general framework applicable to various computer vision frameworks. Furthermore, we introduce RAW-Bench, which incorporates 17 types of RAW-based common corruptions, including lightness degradations, weather effects, blurriness, camera imaging degradations, and variations in camera color response. Using this benchmark, we systematically compare the performance of RAW-Adapter with state-of-the-art (SOTA) ISP methods and other RAW-based high-level vision algorithms. Additionally, we propose a RAW-based data augmentation strategy to further enhance RAW-Adapter's performance and improve its out-of-domain (OOD) generalization ability. Extensive experiments substantiate the effectiveness and efficiency of RAW-Adapter, highlighting its robust performance across diverse scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17027v1-abstract-full').style.display = 'none'; document.getElementById('2503.17027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 17 figures, extension of ECCV 2024 work: arXiv:2408.14802</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16979">arXiv:2503.16979</a> <span> [<a href="https://arxiv.org/pdf/2503.16979">pdf</a>, <a href="https://arxiv.org/format/2503.16979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Instant Gaussian Stream: Fast and Generalizable Streaming of Dynamic Scene Reconstruction via Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jinbo Yan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+R">Rui Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyan Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Luyang Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiayu Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jie Liang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiahao Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ronggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16979v1-abstract-short" style="display: inline;"> Building Free-Viewpoint Videos in a streaming manner offers the advantage of rapid responsiveness compared to offline training methods, greatly enhancing user experience. However, current streaming approaches face challenges of high per-frame reconstruction time (10s+) and error accumulation, limiting their broader application. In this paper, we propose Instant Gaussian Stream (IGS), a fast and ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16979v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16979v1-abstract-full" style="display: none;"> Building Free-Viewpoint Videos in a streaming manner offers the advantage of rapid responsiveness compared to offline training methods, greatly enhancing user experience. However, current streaming approaches face challenges of high per-frame reconstruction time (10s+) and error accumulation, limiting their broader application. In this paper, we propose Instant Gaussian Stream (IGS), a fast and generalizable streaming framework, to address these issues. First, we introduce a generalized Anchor-driven Gaussian Motion Network, which projects multi-view 2D motion features into 3D space, using anchor points to drive the motion of all Gaussians. This generalized Network generates the motion of Gaussians for each target frame in the time required for a single inference. Second, we propose a Key-frame-guided Streaming Strategy that refines each key frame, enabling accurate reconstruction of temporally complex scenes while mitigating error accumulation. We conducted extensive in-domain and cross-domain evaluations, demonstrating that our approach can achieve streaming with a average per-frame reconstruction time of 2s+, alongside a enhancement in view synthesis quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16979v1-abstract-full').style.display = 'none'; document.getElementById('2503.16979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16594">arXiv:2503.16594</a> <span> [<a href="https://arxiv.org/pdf/2503.16594">pdf</a>, <a href="https://arxiv.org/format/2503.16594">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Transformer-based Wireless Symbol Detection Over Fading Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+L">Li Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16594v1-abstract-short" style="display: inline;"> Pre-trained Transformers, through in-context learning (ICL), have demonstrated exceptional capabilities to adapt to new tasks using example prompts without model update. Transformer-based wireless receivers, where prompts consist of the pilot data in the form of transmitted and received signal pairs, have shown high detection accuracy when pilot data are abundant. However, pilot information is oft… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16594v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16594v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16594v1-abstract-full" style="display: none;"> Pre-trained Transformers, through in-context learning (ICL), have demonstrated exceptional capabilities to adapt to new tasks using example prompts without model update. Transformer-based wireless receivers, where prompts consist of the pilot data in the form of transmitted and received signal pairs, have shown high detection accuracy when pilot data are abundant. However, pilot information is often costly and limited in practice. In this work, we propose the DEcision Feedback INcontExt Detection (DEFINED) solution as a new wireless receiver design, which bypasses channel estimation and directly performs symbol detection using the (sometimes extremely) limited pilot data. The key innovation in DEFINED is the proposed decision feedback mechanism in ICL, where we sequentially incorporate the detected symbols into the prompts as pseudo-labels to improve the detection for subsequent symbols. Furthermore, we proposed another detection method where we combine ICL with Semi-Supervised Learning (SSL) to extract information from both labeled and unlabeled data during inference, thus avoiding the errors propagated during the decision feedback process of the original DEFINED. Extensive experiments across a broad range of wireless communication settings demonstrate that a small Transformer trained with DEFINED or IC-SSL achieves significant performance improvements over conventional methods, in some cases only needing a single pilot pair to achieve similar performance of the latter with more than 4 pilot pairs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16594v1-abstract-full').style.display = 'none'; document.getElementById('2503.16594v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2411.07600</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16444">arXiv:2503.16444</a> <span> [<a href="https://arxiv.org/pdf/2503.16444">pdf</a>, <a href="https://arxiv.org/format/2503.16444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Conversational Explanations: Discussing Explainable AI with Non-AI Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengao Zhang</a>, <a href="/search/cs?searchtype=author&query=Low%2C+W+Y">Wei Yan Low</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X+J">X. Jessie Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16444v1-abstract-short" style="display: inline;"> Explainable AI (XAI) aims to provide insights into the decisions made by AI models. To date, most XAI approaches provide only one-time, static explanations, which cannot cater to users' diverse knowledge levels and information needs. Conversational explanations have been proposed as an effective method to customize XAI explanations. However, building conversational explanation systems is hindered… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16444v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16444v1-abstract-full" style="display: none;"> Explainable AI (XAI) aims to provide insights into the decisions made by AI models. To date, most XAI approaches provide only one-time, static explanations, which cannot cater to users' diverse knowledge levels and information needs. Conversational explanations have been proposed as an effective method to customize XAI explanations. However, building conversational explanation systems is hindered by the scarcity of training data. Training with synthetic data faces two main challenges: lack of data diversity and hallucination in the generated data. To alleviate these issues, we introduce a repetition penalty to promote data diversity and exploit a hallucination detector to filter out untruthful synthetic conversation turns. We conducted both automatic and human evaluations on the proposed system, fEw-shot Multi-round ConvErsational Explanation (EMCEE). For automatic evaluation, EMCEE achieves relative improvements of 81.6% in BLEU and 80.5% in ROUGE compared to the baselines. EMCEE also mitigates the degeneration of data quality caused by training on synthetic data. In human evaluations (N=60), EMCEE outperforms baseline models and the control group in improving users' comprehension, acceptance, trust, and collaboration with static explanations by large margins. Through a fine-grained analysis of model responses, we further demonstrate that training on self-generated synthetic data improves the model's ability to generate more truthful and understandable answers, leading to better user interactions. To the best of our knowledge, this is the first conversational explanation method that can answer free-form user questions following static explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16444v1-abstract-full').style.display = 'none'; document.getElementById('2503.16444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IUI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15019">arXiv:2503.15019</a> <span> [<a href="https://arxiv.org/pdf/2503.15019">pdf</a>, <a href="https://arxiv.org/format/2503.15019">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning 4D Panoptic Scene Graph Generation from Rich 2D Visual Scene </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shengqiong Wu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingkang Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juncheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15019v1-abstract-short" style="display: inline;"> The latest emerged 4D Panoptic Scene Graph (4D-PSG) provides an advanced-ever representation for comprehensively modeling the dynamic 4D visual real world. Unfortunately, current pioneering 4D-PSG research can primarily suffer from data scarcity issues severely, as well as the resulting out-of-vocabulary problems; also, the pipeline nature of the benchmark generation method can lead to suboptimal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15019v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15019v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15019v1-abstract-full" style="display: none;"> The latest emerged 4D Panoptic Scene Graph (4D-PSG) provides an advanced-ever representation for comprehensively modeling the dynamic 4D visual real world. Unfortunately, current pioneering 4D-PSG research can primarily suffer from data scarcity issues severely, as well as the resulting out-of-vocabulary problems; also, the pipeline nature of the benchmark generation method can lead to suboptimal performance. To address these challenges, this paper investigates a novel framework for 4D-PSG generation that leverages rich 2D visual scene annotations to enhance 4D scene learning. First, we introduce a 4D Large Language Model (4D-LLM) integrated with a 3D mask decoder for end-to-end generation of 4D-PSG. A chained SG inference mechanism is further designed to exploit LLMs' open-vocabulary capabilities to infer accurate and comprehensive object and relation labels iteratively. Most importantly, we propose a 2D-to-4D visual scene transfer learning framework, where a spatial-temporal scene transcending strategy effectively transfers dimension-invariant features from abundant 2D SG annotations to 4D scenes, effectively compensating for data scarcity in 4D-PSG. Extensive experiments on the benchmark data demonstrate that we strikingly outperform baseline models by a large margin, highlighting the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15019v1-abstract-full').style.display = 'none'; document.getElementById('2503.15019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14831">arXiv:2503.14831</a> <span> [<a href="https://arxiv.org/pdf/2503.14831">pdf</a>, <a href="https://arxiv.org/format/2503.14831">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Robust Transmission of Punctured Text with Large Language Model-based Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+S">Sojeong Park</a>, <a href="/search/cs?searchtype=author&query=Noh%2C+H">Hyeonho Noh</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H+J">Hyun Jong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14831v1-abstract-short" style="display: inline;"> With the recent advancements in deep learning, semantic communication which transmits only task-oriented features, has rapidly emerged. However, since feature extraction relies on learning-based models, its performance fundamentally depends on the training dataset or tasks. For practical scenarios, it is essential to design a model that demonstrates robust performance regardless of dataset or task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14831v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14831v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14831v1-abstract-full" style="display: none;"> With the recent advancements in deep learning, semantic communication which transmits only task-oriented features, has rapidly emerged. However, since feature extraction relies on learning-based models, its performance fundamentally depends on the training dataset or tasks. For practical scenarios, it is essential to design a model that demonstrates robust performance regardless of dataset or tasks. In this correspondence, we propose a novel text transmission model that selects and transmits only a few characters and recovers the missing characters at the receiver using a large language model (LLM). Additionally, we propose a novel importance character extractor (ICE), which selects transmitted characters to enhance LLM recovery performance. Simulations demonstrate that the proposed filter selection by ICE outperforms random filter selection, which selects transmitted characters randomly. Moreover, the proposed model exhibits robust performance across different datasets and tasks and outperforms traditional bit-based communication in low signal-to-noise ratio conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14831v1-abstract-full').style.display = 'none'; document.getElementById('2503.14831v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14736">arXiv:2503.14736</a> <span> [<a href="https://arxiv.org/pdf/2503.14736">pdf</a>, <a href="https://arxiv.org/format/2503.14736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HandSplat: Embedding-Driven Gaussian Splatting for High-Fidelity Hand Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yilan Dong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiahao Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenqing Wang</a>, <a href="/search/cs?searchtype=author&query=Slabaugh%2C+G">Gregory Slabaugh</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shanxin Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14736v1-abstract-short" style="display: inline;"> Existing 3D Gaussian Splatting (3DGS) methods for hand rendering rely on rigid skeletal motion with an oversimplified non-rigid motion model, which fails to capture fine geometric and appearance details. Additionally, they perform densification based solely on per-point gradients and process poses independently, ignoring spatial and temporal correlations. These limitations lead to geometric detail… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14736v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14736v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14736v1-abstract-full" style="display: none;"> Existing 3D Gaussian Splatting (3DGS) methods for hand rendering rely on rigid skeletal motion with an oversimplified non-rigid motion model, which fails to capture fine geometric and appearance details. Additionally, they perform densification based solely on per-point gradients and process poses independently, ignoring spatial and temporal correlations. These limitations lead to geometric detail loss, temporal instability, and inefficient point distribution. To address these issues, we propose HandSplat, a novel Gaussian Splatting-based framework that enhances both fidelity and stability for hand rendering. To improve fidelity, we extend standard 3DGS attributes with implicit geometry and appearance embeddings for finer non-rigid motion modeling while preserving the static hand characteristic modeled by original 3DGS attributes. Additionally, we introduce a local gradient-aware densification strategy that dynamically refines Gaussian density in high-variation regions. To improve stability, we incorporate pose-conditioned attribute regularization to encourage attribute consistency across similar poses, mitigating temporal artifacts. Extensive experiments on InterHand2.6M demonstrate that HandSplat surpasses existing methods in fidelity and stability while achieving real-time performance. We will release the code and pre-trained models upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14736v1-abstract-full').style.display = 'none'; document.getElementById('2503.14736v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14340">arXiv:2503.14340</a> <span> [<a href="https://arxiv.org/pdf/2503.14340">pdf</a>, <a href="https://arxiv.org/format/2503.14340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> MANTRA: Enhancing Automated Method-Level Refactoring with Contextual RAG and Multi-Agent LLM Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yisen Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Feng Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinqiu Yang</a>, <a href="/search/cs?searchtype=author&query=Tse-Hsun"> Tse-Hsun</a>, <a href="/search/cs?searchtype=author&query=Chen"> Chen</a>, <a href="/search/cs?searchtype=author&query=Tsantalis%2C+N">Nikolaos Tsantalis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14340v2-abstract-short" style="display: inline;"> Maintaining and scaling software systems relies heavily on effective code refactoring, yet this process remains labor-intensive, requiring developers to carefully analyze existing codebases and prevent the introduction of new defects. Although recent advancements have leveraged Large Language Models (LLMs) to automate refactoring tasks, current solutions are constrained in scope and lack mechanism… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14340v2-abstract-full').style.display = 'inline'; document.getElementById('2503.14340v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14340v2-abstract-full" style="display: none;"> Maintaining and scaling software systems relies heavily on effective code refactoring, yet this process remains labor-intensive, requiring developers to carefully analyze existing codebases and prevent the introduction of new defects. Although recent advancements have leveraged Large Language Models (LLMs) to automate refactoring tasks, current solutions are constrained in scope and lack mechanisms to guarantee code compilability and successful test execution. In this work, we introduce MANTRA, a comprehensive LLM agent-based framework that automates method-level refactoring. MANTRA integrates Context-Aware Retrieval-Augmented Generation, coordinated Multi-Agent Collaboration, and Verbal Reinforcement Learning to emulate human decision-making during refactoring while preserving code correctness and readability. Our empirical study, conducted on 703 instances of "pure refactorings" (i.e., code changes exclusively involving structural improvements), drawn from 10 representative Java projects, covers the six most prevalent refactoring operations. Experimental results demonstrate that MANTRA substantially surpasses a baseline LLM model (RawGPT ), achieving an 82.8% success rate (582/703) in producing code that compiles and passes all tests, compared to just 8.7% (61/703) with RawGPT. Moreover, in comparison to IntelliJ's LLM-powered refactoring tool (EM-Assist), MANTRA exhibits a 50% improvement in generating Extract Method transformations. A usability study involving 37 professional developers further shows that refactorings performed by MANTRA are perceived to be as readable and reusable as human-written code, and in certain cases, even more favorable. These results highlight the practical advantages of MANTRA and emphasize the growing potential of LLM-based systems in advancing the automation of software refactoring tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14340v2-abstract-full').style.display = 'none'; document.getElementById('2503.14340v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13862">arXiv:2503.13862</a> <span> [<a href="https://arxiv.org/pdf/2503.13862">pdf</a>, <a href="https://arxiv.org/format/2503.13862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HySurvPred: Multimodal Hyperbolic Embedding with Angle-Aware Hierarchical Contrastive Learning and Uncertainty Constraints for Survival Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaqi Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenting Chen</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+X">Xiaohan Xing</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Sean He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiaoling Luo</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xinheng Lyu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Linlin Shen</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+G">Guoping Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13862v1-abstract-short" style="display: inline;"> Multimodal learning that integrates histopathology images and genomic data holds great promise for cancer survival prediction. However, existing methods face key limitations: 1) They rely on multimodal mapping and metrics in Euclidean space, which cannot fully capture the hierarchical structures in histopathology (among patches from different resolutions) and genomics data (from genes to pathways)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13862v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13862v1-abstract-full" style="display: none;"> Multimodal learning that integrates histopathology images and genomic data holds great promise for cancer survival prediction. However, existing methods face key limitations: 1) They rely on multimodal mapping and metrics in Euclidean space, which cannot fully capture the hierarchical structures in histopathology (among patches from different resolutions) and genomics data (from genes to pathways). 2) They discretize survival time into independent risk intervals, which ignores its continuous and ordinal nature and fails to achieve effective optimization. 3) They treat censorship as a binary indicator, excluding censored samples from model optimization and not making full use of them. To address these challenges, we propose HySurvPred, a novel framework for survival prediction that integrates three key modules: Multimodal Hyperbolic Mapping (MHM), Angle-aware Ranking-based Contrastive Loss (ARCL) and Censor-Conditioned Uncertainty Constraint (CUC). Instead of relying on Euclidean space, we design the MHM module to explore the inherent hierarchical structures within each modality in hyperbolic space. To better integrate multimodal features in hyperbolic space, we introduce the ARCL module, which uses ranking-based contrastive learning to preserve the ordinal nature of survival time, along with the CUC module to fully explore the censored data. Extensive experiments demonstrate that our method outperforms state-of-the-art methods on five benchmark datasets. The source code is to be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13862v1-abstract-full').style.display = 'none'; document.getElementById('2503.13862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to IJCAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13813">arXiv:2503.13813</a> <span> [<a href="https://arxiv.org/pdf/2503.13813">pdf</a>, <a href="https://arxiv.org/format/2503.13813">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Automatic MILP Model Construction for Multi-Robot Task Allocation and Scheduling Based on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+M">Mingming Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhendong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jie Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhengqi Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qihao Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Liang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13813v1-abstract-short" style="display: inline;"> With the accelerated development of Industry 4.0, intelligent manufacturing systems increasingly require efficient task allocation and scheduling in multi-robot systems. However, existing methods rely on domain expertise and face challenges in adapting to dynamic production constraints. Additionally, enterprises have high privacy requirements for production scheduling data, which prevents the use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13813v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13813v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13813v1-abstract-full" style="display: none;"> With the accelerated development of Industry 4.0, intelligent manufacturing systems increasingly require efficient task allocation and scheduling in multi-robot systems. However, existing methods rely on domain expertise and face challenges in adapting to dynamic production constraints. Additionally, enterprises have high privacy requirements for production scheduling data, which prevents the use of cloud-based large language models (LLMs) for solution development. To address these challenges, there is an urgent need for an automated modeling solution that meets data privacy requirements. This study proposes a knowledge-augmented mixed integer linear programming (MILP) automated formulation framework, integrating local LLMs with domain-specific knowledge bases to generate executable code from natural language descriptions automatically. The framework employs a knowledge-guided DeepSeek-R1-Distill-Qwen-32B model to extract complex spatiotemporal constraints (82% average accuracy) and leverages a supervised fine-tuned Qwen2.5-Coder-7B-Instruct model for efficient MILP code generation (90% average accuracy). Experimental results demonstrate that the framework successfully achieves automatic modeling in the aircraft skin manufacturing case while ensuring data privacy and computational efficiency. This research provides a low-barrier and highly reliable technical path for modeling in complex industrial scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13813v1-abstract-full').style.display = 'none'; document.getElementById('2503.13813v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13522">arXiv:2503.13522</a> <span> [<a href="https://arxiv.org/pdf/2503.13522">pdf</a>, <a href="https://arxiv.org/ps/2503.13522">ps</a>, <a href="https://arxiv.org/format/2503.13522">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Advanced Deep Learning Methods for Protein Structure Prediction and Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichao Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+N">Ningyuan Deng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xinyuan Song</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Z">Ziqian Bi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianyang Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zheyu Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Keyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Q">Qian Niu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junyu Liu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Benji Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sen Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xuanhe Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinlang Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pohsun Feng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yizhu Wen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L+K">Lawrence KQ Yan</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+H">Hongming Tseng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yan Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunze Wang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Ziyuan Qin</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+B">Bowen Jing</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junjie Yang</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13522v3-abstract-short" style="display: inline;"> After AlphaFold won the Nobel Prize, protein prediction with deep learning once again became a hot topic. We comprehensively explore advanced deep learning methods applied to protein structure prediction and design. It begins by examining recent innovations in prediction architectures, with detailed discussions on improvements such as diffusion based frameworks and novel pairwise attention modules… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13522v3-abstract-full').style.display = 'inline'; document.getElementById('2503.13522v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13522v3-abstract-full" style="display: none;"> After AlphaFold won the Nobel Prize, protein prediction with deep learning once again became a hot topic. We comprehensively explore advanced deep learning methods applied to protein structure prediction and design. It begins by examining recent innovations in prediction architectures, with detailed discussions on improvements such as diffusion based frameworks and novel pairwise attention modules. The text analyses key components including structure generation, evaluation metrics, multiple sequence alignment processing, and network architecture, thereby illustrating the current state of the art in computational protein modelling. Subsequent chapters focus on practical applications, presenting case studies that range from individual protein predictions to complex biomolecular interactions. Strategies for enhancing prediction accuracy and integrating deep learning techniques with experimental validation are thoroughly explored. The later sections review the industry landscape of protein design, highlighting the transformative role of artificial intelligence in biotechnology and discussing emerging market trends and future challenges. Supplementary appendices provide essential resources such as databases and open source tools, making this volume a valuable reference for researchers and students. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13522v3-abstract-full').style.display = 'none'; document.getElementById('2503.13522v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13165">arXiv:2503.13165</a> <span> [<a href="https://arxiv.org/pdf/2503.13165">pdf</a>, <a href="https://arxiv.org/format/2503.13165">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> From Zero to Detail: Deconstructing Ultra-High-Definition Image Restoration from Progressive Spectral Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhizhou Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yunzhe Xu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+E">Enxuan Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Z">Zili Yi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Tai%2C+Y">Ying Tai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13165v1-abstract-short" style="display: inline;"> Ultra-high-definition (UHD) image restoration faces significant challenges due to its high resolution, complex content, and intricate details. To cope with these challenges, we analyze the restoration process in depth through a progressive spectral perspective, and deconstruct the complex UHD restoration problem into three progressive stages: zero-frequency enhancement, low-frequency restoration,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13165v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13165v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13165v1-abstract-full" style="display: none;"> Ultra-high-definition (UHD) image restoration faces significant challenges due to its high resolution, complex content, and intricate details. To cope with these challenges, we analyze the restoration process in depth through a progressive spectral perspective, and deconstruct the complex UHD restoration problem into three progressive stages: zero-frequency enhancement, low-frequency restoration, and high-frequency refinement. Building on this insight, we propose a novel framework, ERR, which comprises three collaborative sub-networks: the zero-frequency enhancer (ZFE), the low-frequency restorer (LFR), and the high-frequency refiner (HFR). Specifically, the ZFE integrates global priors to learn global mapping, while the LFR restores low-frequency information, emphasizing reconstruction of coarse-grained content. Finally, the HFR employs our designed frequency-windowed kolmogorov-arnold networks (FW-KAN) to refine textures and details, producing high-quality image restoration. Our approach significantly outperforms previous UHD methods across various tasks, with extensive ablation studies validating the effectiveness of each component. The code is available at \href{https://github.com/NJU-PCALab/ERR}{here}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13165v1-abstract-full').style.display = 'none'; document.getElementById('2503.13165v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13012">arXiv:2503.13012</a> <span> [<a href="https://arxiv.org/pdf/2503.13012">pdf</a>, <a href="https://arxiv.org/format/2503.13012">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Test-Time Domain Generalization via Universe Learning: A Multi-Graph Matching Approach for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xingguo Lv</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xingbo Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liwen Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiewen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+B">Bin Pu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhe Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuejun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13012v1-abstract-short" style="display: inline;"> Despite domain generalization (DG) has significantly addressed the performance degradation of pre-trained models caused by domain shifts, it often falls short in real-world deployment. Test-time adaptation (TTA), which adjusts a learned model using unlabeled test data, presents a promising solution. However, most existing TTA methods struggle to deliver strong performance in medical image segmenta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13012v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13012v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13012v1-abstract-full" style="display: none;"> Despite domain generalization (DG) has significantly addressed the performance degradation of pre-trained models caused by domain shifts, it often falls short in real-world deployment. Test-time adaptation (TTA), which adjusts a learned model using unlabeled test data, presents a promising solution. However, most existing TTA methods struggle to deliver strong performance in medical image segmentation, primarily because they overlook the crucial prior knowledge inherent to medical images. To address this challenge, we incorporate morphological information and propose a framework based on multi-graph matching. Specifically, we introduce learnable universe embeddings that integrate morphological priors during multi-source training, along with novel unsupervised test-time paradigms for domain adaptation. This approach guarantees cycle-consistency in multi-matching while enabling the model to more effectively capture the invariant priors of unseen data, significantly mitigating the effects of domain shifts. Extensive experiments demonstrate that our method outperforms other state-of-the-art approaches on two medical image segmentation benchmarks for both multi-source and single-source domain generalization tasks. The source code is available at https://github.com/Yore0/TTDG-MGM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13012v1-abstract-full').style.display = 'none'; document.getElementById('2503.13012v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12944">arXiv:2503.12944</a> <span> [<a href="https://arxiv.org/pdf/2503.12944">pdf</a>, <a href="https://arxiv.org/format/2503.12944">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GIFT: Generated Indoor video frames for Texture-less point tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jianzheng Huang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+X">Xianyu Mo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziling Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinyu Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+F">Feng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12944v1-abstract-short" style="display: inline;"> Point tracking is becoming a powerful solver for motion estimation and video editing. Compared to classical feature matching, point tracking methods have the key advantage of robustly tracking points under complex camera motion trajectories and over extended periods. However, despite certain improvements in methodologies, current point tracking methods still struggle to track any position in video… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12944v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12944v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12944v1-abstract-full" style="display: none;"> Point tracking is becoming a powerful solver for motion estimation and video editing. Compared to classical feature matching, point tracking methods have the key advantage of robustly tracking points under complex camera motion trajectories and over extended periods. However, despite certain improvements in methodologies, current point tracking methods still struggle to track any position in video frames, especially in areas that are texture-less or weakly textured. In this work, we first introduce metrics for evaluating the texture intensity of a 3D object. Using these metrics, we classify the 3D models in ShapeNet into three levels of texture intensity and create GIFT, a challenging synthetic benchmark comprising 1800 indoor video sequences with rich annotations. Unlike existing datasets that assign ground truth points arbitrarily, GIFT precisely anchors ground truth on classified target objects, ensuring that each video corresponds to a specific texture intensity level. Furthermore, we comprehensively evaluate current methods on GIFT to assess their performance across different texture intensity levels and analyze the impact of texture on point tracking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12944v1-abstract-full').style.display = 'none'; document.getElementById('2503.12944v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12404">arXiv:2503.12404</a> <span> [<a href="https://arxiv.org/pdf/2503.12404">pdf</a>, <a href="https://arxiv.org/format/2503.12404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAM2-ELNet: Label Enhancement and Automatic Annotation for Remote Sensing Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianhao Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenshuo Yu</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yuanchao Lv</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiance Sun</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+B">Bokang Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingyang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12404v1-abstract-short" style="display: inline;"> Remote sensing image segmentation is crucial for environmental monitoring, disaster assessment, and resource management, directly affecting the accuracy and efficiency of surface information extraction. The performance of existing supervised models in remote sensing image segmentation tasks highly depends on the quality of label data. However, current label data mainly relies on manual annotation,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12404v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12404v1-abstract-full" style="display: none;"> Remote sensing image segmentation is crucial for environmental monitoring, disaster assessment, and resource management, directly affecting the accuracy and efficiency of surface information extraction. The performance of existing supervised models in remote sensing image segmentation tasks highly depends on the quality of label data. However, current label data mainly relies on manual annotation, which comes with high time costs and is subject to subjective interference, resulting in distortion of label boundaries and often a loss of detail. To solve the above problems, our work proposes an Edge-enhanced Labeling Network, called SAM2-ELNet, which incorporates a labeling module and an edge attention mechanism. This model effectively addresses issues such as label detail loss, fragmentation, and inaccurate boundaries. Due to the scarcity of manually annotated remote sensing data, the feature extraction capabilities of traditional neural networks are limited. Our method uses the Hiera backbone of the pre-trained self-supervised large model segment anything model 2 (SAM2) as the encoder, achieves high-quality and efficient feature extraction even with small samples by fine-tuning on downstream tasks. This study compared the training effects of original and enhanced labels on the manually annotated Deep-SAR Oil Spill (SOS) dataset. Results showed that the model trained with enhanced labels performed better and had a lower final loss, indicating closer alignment with the real data distribution. Our work also explores the potential of extending the model into an efficient automatic annotation framework through generalization experiments, facilitating large-scale remote sensing image interpretation and intelligent recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12404v1-abstract-full').style.display = 'none'; document.getElementById('2503.12404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12180">arXiv:2503.12180</a> <span> [<a href="https://arxiv.org/pdf/2503.12180">pdf</a>, <a href="https://arxiv.org/format/2503.12180">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bench2FreeAD: A Benchmark for Vision-based End-to-end Navigation in Unstructured Robotic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yuhang Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sidong Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jihaoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+J">Jiangtao Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12180v1-abstract-short" style="display: inline;"> Most current end-to-end (E2E) autonomous driving algorithms are built on standard vehicles in structured transportation scenarios, lacking exploration of robot navigation for unstructured scenarios such as auxiliary roads, campus roads, and indoor settings. This paper investigates E2E robot navigation in unstructured road environments. First, we introduce two data collection pipelines - one for re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12180v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12180v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12180v1-abstract-full" style="display: none;"> Most current end-to-end (E2E) autonomous driving algorithms are built on standard vehicles in structured transportation scenarios, lacking exploration of robot navigation for unstructured scenarios such as auxiliary roads, campus roads, and indoor settings. This paper investigates E2E robot navigation in unstructured road environments. First, we introduce two data collection pipelines - one for real-world robot data and another for synthetic data generated using the Isaac Sim simulator, which together produce an unstructured robotics navigation dataset -- FreeWorld Dataset. Second, we fine-tuned an efficient E2E autonomous driving model -- VAD -- using our datasets to validate the performance and adaptability of E2E autonomous driving models in these environments. Results demonstrate that fine-tuning through our datasets significantly enhances the navigation potential of E2E autonomous driving models in unstructured robotic environments. Thus, this paper presents the first dataset targeting E2E robot navigation tasks in unstructured scenarios, and provides a benchmark based on vision-based E2E autonomous driving algorithms to facilitate the development of E2E navigation technology for logistics and service robots. The project is available on Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12180v1-abstract-full').style.display = 'none'; document.getElementById('2503.12180v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 9 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11958">arXiv:2503.11958</a> <span> [<a href="https://arxiv.org/pdf/2503.11958">pdf</a>, <a href="https://arxiv.org/format/2503.11958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CHOrD: Generation of Collision-Free, House-Scale, and Organized Digital Twins for 3D Indoor Scenes with Controllable Floor Plans and Optimal Layouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+C">Chong Su</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yingbin Fu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zheyuan Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Hanji%2C+P">Param Hanji</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shaojun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xuan Zhao</a>, <a href="/search/cs?searchtype=author&query=%C3%96ztireli%2C+C">Cengiz 脰ztireli</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+F">Fangcheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11958v1-abstract-short" style="display: inline;"> We introduce CHOrD, a novel framework for scalable synthesis of 3D indoor scenes, designed to create house-scale, collision-free, and hierarchically structured indoor digital twins. In contrast to existing methods that directly synthesize the scene layout as a scene graph or object list, CHOrD incorporates a 2D image-based intermediate layout representation, enabling effective prevention of collis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11958v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11958v1-abstract-full" style="display: none;"> We introduce CHOrD, a novel framework for scalable synthesis of 3D indoor scenes, designed to create house-scale, collision-free, and hierarchically structured indoor digital twins. In contrast to existing methods that directly synthesize the scene layout as a scene graph or object list, CHOrD incorporates a 2D image-based intermediate layout representation, enabling effective prevention of collision artifacts by successfully capturing them as out-of-distribution (OOD) scenarios during generation. Furthermore, unlike existing methods, CHOrD is capable of generating scene layouts that adhere to complex floor plans with multi-modal controls, enabling the creation of coherent, house-wide layouts robust to both geometric and semantic variations in room structures. Additionally, we propose a novel dataset with expanded coverage of household items and room configurations, as well as significantly improved data quality. CHOrD demonstrates state-of-the-art performance on both the 3D-FRONT and our proposed datasets, delivering photorealistic, spatially coherent indoor scene synthesis adaptable to arbitrary floor plan variations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11958v1-abstract-full').style.display = 'none'; document.getElementById('2503.11958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Chong Su and Yingbin Fu contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11465">arXiv:2503.11465</a> <span> [<a href="https://arxiv.org/pdf/2503.11465">pdf</a>, <a href="https://arxiv.org/format/2503.11465">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Remote Photoplethysmography in Real-World and Extreme Lighting Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+H">Hang Shao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Lei Luo</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jianjun Qian</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Mengkai Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11465v1-abstract-short" style="display: inline;"> Physiological activities can be manifested by the sensitive changes in facial imaging. While they are barely observable to our eyes, computer vision manners can, and the derived remote photoplethysmography (rPPG) has shown considerable promise. However, existing studies mainly rely on spatial skin recognition and temporal rhythmic interactions, so they focus on identifying explicit features under… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11465v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11465v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11465v1-abstract-full" style="display: none;"> Physiological activities can be manifested by the sensitive changes in facial imaging. While they are barely observable to our eyes, computer vision manners can, and the derived remote photoplethysmography (rPPG) has shown considerable promise. However, existing studies mainly rely on spatial skin recognition and temporal rhythmic interactions, so they focus on identifying explicit features under ideal light conditions, but perform poorly in-the-wild with intricate obstacles and extreme illumination exposure. In this paper, we propose an end-to-end video transformer model for rPPG. It strives to eliminate complex and unknown external time-varying interferences, whether they are sufficient to occupy subtle biosignal amplitudes or exist as periodic perturbations that hinder network training. In the specific implementation, we utilize global interference sharing, subject background reference, and self-supervised disentanglement to eliminate interference, and further guide learning based on spatiotemporal filtering, reconstruction guidance, and frequency domain and biological prior constraints to achieve effective rPPG. To the best of our knowledge, this is the first robust rPPG model for real outdoor scenarios based on natural face videos, and is lightweight to deploy. Extensive experiments show the competitiveness and performance of our model in rPPG prediction across datasets and scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11465v1-abstract-full').style.display = 'none'; document.getElementById('2503.11465v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10907">arXiv:2503.10907</a> <span> [<a href="https://arxiv.org/pdf/2503.10907">pdf</a>, <a href="https://arxiv.org/format/2503.10907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> H2-MARL: Multi-Agent Reinforcement Learning for Pareto Optimality in Hospital Capacity Strain and Human Mobility during Epidemic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xueting Luo</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hao Deng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jihong Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yao Shen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Huanhuan Guo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhiyuan Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiming Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shengjie Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10907v1-abstract-short" style="display: inline;"> The necessity of achieving an effective balance between minimizing the losses associated with restricting human mobility and ensuring hospital capacity has gained significant attention in the aftermath of COVID-19. Reinforcement learning (RL)-based strategies for human mobility management have recently advanced in addressing the dynamic evolution of cities and epidemics; however, they still face c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10907v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10907v1-abstract-full" style="display: none;"> The necessity of achieving an effective balance between minimizing the losses associated with restricting human mobility and ensuring hospital capacity has gained significant attention in the aftermath of COVID-19. Reinforcement learning (RL)-based strategies for human mobility management have recently advanced in addressing the dynamic evolution of cities and epidemics; however, they still face challenges in achieving coordinated control at the township level and adapting to cities of varying scales. To address the above issues, we propose a multi-agent RL approach that achieves Pareto optimality in managing hospital capacity and human mobility (H2-MARL), applicable across cities of different scales. We first develop a township-level infection model with online-updatable parameters to simulate disease transmission and construct a city-wide dynamic spatiotemporal epidemic simulator. On this basis, H2-MARL is designed to treat each division as an agent, with a trade-off dual-objective reward function formulated and an experience replay buffer enriched with expert knowledge built. To evaluate the effectiveness of the model, we construct a township-level human mobility dataset containing over one billion records from four representative cities of varying scales. Extensive experiments demonstrate that H2-MARL has the optimal dual-objective trade-off capability, which can minimize hospital capacity strain while minimizing human mobility restriction loss. Meanwhile, the applicability of the proposed model to epidemic control in cities of varying scales is verified, which showcases its feasibility and versatility in practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10907v1-abstract-full').style.display = 'none'; document.getElementById('2503.10907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10857">arXiv:2503.10857</a> <span> [<a href="https://arxiv.org/pdf/2503.10857">pdf</a>, <a href="https://arxiv.org/format/2503.10857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Understanding Graphical Perception in Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianwei Yang</a>, <a href="/search/cs?searchtype=author&query=Inala%2C+J+P">Jeevana Priya Inala</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+C">Chandan Singh</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenglong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10857v1-abstract-short" style="display: inline;"> Despite the promising results of large multimodal models (LMMs) in complex vision-language tasks that require knowledge, reasoning, and perception abilities together, we surprisingly found that these models struggle with simple tasks on infographics that require perception only. As existing benchmarks primarily focus on end tasks that require various abilities, they provide limited, fine-grained i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10857v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10857v1-abstract-full" style="display: none;"> Despite the promising results of large multimodal models (LMMs) in complex vision-language tasks that require knowledge, reasoning, and perception abilities together, we surprisingly found that these models struggle with simple tasks on infographics that require perception only. As existing benchmarks primarily focus on end tasks that require various abilities, they provide limited, fine-grained insights into the limitations of the models' perception abilities. To address this gap, we leverage the theory of graphical perception, an approach used to study how humans decode visual information encoded on charts and graphs, to develop an evaluation framework for analyzing gaps in LMMs' perception abilities in charts. With automated task generation and response evaluation designs, our framework enables comprehensive and controlled testing of LMMs' graphical perception across diverse chart types, visual elements, and task types. We apply our framework to evaluate and diagnose the perception capabilities of state-of-the-art LMMs at three granularity levels (chart, visual element, and pixel). Our findings underscore several critical limitations of current state-of-the-art LMMs, including GPT-4o: their inability to (1) generalize across chart types, (2) understand fundamental visual elements, and (3) cross reference values within a chart. These insights provide guidance for future improvements in perception abilities of LMMs. The evaluation framework and labeled data are publicly available at https://github.com/microsoft/lmm-graphical-perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10857v1-abstract-full').style.display = 'none'; document.getElementById('2503.10857v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10586">arXiv:2503.10586</a> <span> [<a href="https://arxiv.org/pdf/2503.10586">pdf</a>, <a href="https://arxiv.org/format/2503.10586">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unlock the Power of Unlabeled Data in Language Driving Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chaoqun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jie Yang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+X">Xiaobin Hong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruimao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10586v2-abstract-short" style="display: inline;"> Recent Vision-based Large Language Models~(VisionLLMs) for autonomous driving have seen rapid advancements. However, such promotion is extremely dependent on large-scale high-quality annotated data, which is costly and labor-intensive. To address this issue, we propose unlocking the value of abundant yet unlabeled data to improve the language-driving model in a semi-supervised learning manner. Spe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10586v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10586v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10586v2-abstract-full" style="display: none;"> Recent Vision-based Large Language Models~(VisionLLMs) for autonomous driving have seen rapid advancements. However, such promotion is extremely dependent on large-scale high-quality annotated data, which is costly and labor-intensive. To address this issue, we propose unlocking the value of abundant yet unlabeled data to improve the language-driving model in a semi-supervised learning manner. Specifically, we first introduce a series of template-based prompts to extract scene information, generating questions that create pseudo-answers for the unlabeled data based on a model trained with limited labeled data. Next, we propose a Self-Consistency Refinement method to improve the quality of these pseudo-annotations, which are later used for further training. By utilizing a pre-trained VisionLLM (e.g., InternVL), we build a strong Language Driving Model (LDM) for driving scene question-answering, outperforming previous state-of-the-art methods. Extensive experiments on the DriveLM benchmark show that our approach performs well with just 5% labeled data, achieving competitive performance against models trained with full datasets. In particular, our LDM achieves 44.85% performance with limited labeled data, increasing to 54.27% when using unlabeled data, while models trained with full datasets reach 60.68% on the DriveLM benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10586v2-abstract-full').style.display = 'none'; document.getElementById('2503.10586v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICRA2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10568">arXiv:2503.10568</a> <span> [<a href="https://arxiv.org/pdf/2503.10568">pdf</a>, <a href="https://arxiv.org/format/2503.10568">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Image Generation with Randomized Parallel Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haopeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinyue Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10568v1-abstract-short" style="display: inline;"> We introduce ARPG, a novel visual autoregressive model that enables randomized parallel generation, addressing the inherent limitations of conventional raster-order approaches, which hinder inference efficiency and zero-shot generalization due to their sequential, predefined token generation order. Our key insight is that effective random-order modeling necessitates explicit guidance for determini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10568v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10568v1-abstract-full" style="display: none;"> We introduce ARPG, a novel visual autoregressive model that enables randomized parallel generation, addressing the inherent limitations of conventional raster-order approaches, which hinder inference efficiency and zero-shot generalization due to their sequential, predefined token generation order. Our key insight is that effective random-order modeling necessitates explicit guidance for determining the position of the next predicted token. To this end, we propose a novel guided decoding framework that decouples positional guidance from content representation, encoding them separately as queries and key-value pairs. By directly incorporating this guidance into the causal attention mechanism, our approach enables fully random-order training and generation, eliminating the need for bidirectional attention. Consequently, ARPG readily generalizes to zero-shot tasks such as image inpainting, outpainting, and resolution expansion. Furthermore, it supports parallel inference by concurrently processing multiple queries using a shared KV cache. On the ImageNet-1K 256 benchmark, our approach attains an FID of 1.94 with only 64 sampling steps, achieving over a 20-fold increase in throughput while reducing memory consumption by over 75% compared to representative recent autoregressive models at a similar scale. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10568v1-abstract-full').style.display = 'none'; document.getElementById('2503.10568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10150">arXiv:2503.10150</a> <span> [<a href="https://arxiv.org/pdf/2503.10150">pdf</a>, <a href="https://arxiv.org/format/2503.10150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Augmented Generation with Hierarchical Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyu Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongfeng Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junjie Yang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhenyu Pan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kaili Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">James Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10150v1-abstract-short" style="display: inline;"> Graph-based Retrieval-Augmented Generation (RAG) methods have significantly enhanced the performance of large language models (LLMs) in domain-specific tasks. However, existing RAG methods do not adequately utilize the naturally inherent hierarchical knowledge in human cognition, which limits the capabilities of RAG systems. In this paper, we introduce a new RAG approach, called HiRAG, which utili… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10150v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10150v1-abstract-full" style="display: none;"> Graph-based Retrieval-Augmented Generation (RAG) methods have significantly enhanced the performance of large language models (LLMs) in domain-specific tasks. However, existing RAG methods do not adequately utilize the naturally inherent hierarchical knowledge in human cognition, which limits the capabilities of RAG systems. In this paper, we introduce a new RAG approach, called HiRAG, which utilizes hierarchical knowledge to enhance the semantic understanding and structure capturing capabilities of RAG systems in the indexing and retrieval processes. Our extensive experiments demonstrate that HiRAG achieves significant performance improvements over the state-of-the-art baseline methods. The code of our proposed method is available at \href{https://github.com/hhy-huang/HiRAG}{https://github.com/hhy-huang/HiRAG}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10150v1-abstract-full').style.display = 'none'; document.getElementById('2503.10150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10149">arXiv:2503.10149</a> <span> [<a href="https://arxiv.org/pdf/2503.10149">pdf</a>, <a href="https://arxiv.org/format/2503.10149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unlocking Generalization Power in LiDAR Point Cloud Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhenxuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L+Y">Lin Yuanbo Wu</a>, <a href="/search/cs?searchtype=author&query=An%2C+P">Pei An</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaqi Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Ji Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10149v1-abstract-short" style="display: inline;"> In real-world environments, a LiDAR point cloud registration method with robust generalization capabilities (across varying distances and datasets) is crucial for ensuring safety in autonomous driving and other LiDAR-based applications. However, current methods fall short in achieving this level of generalization. To address these limitations, we propose UGP, a pruned framework designed to enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10149v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10149v1-abstract-full" style="display: none;"> In real-world environments, a LiDAR point cloud registration method with robust generalization capabilities (across varying distances and datasets) is crucial for ensuring safety in autonomous driving and other LiDAR-based applications. However, current methods fall short in achieving this level of generalization. To address these limitations, we propose UGP, a pruned framework designed to enhance generalization power for LiDAR point cloud registration. The core insight in UGP is the elimination of cross-attention mechanisms to improve generalization, allowing the network to concentrate on intra-frame feature extraction. Additionally, we introduce a progressive self-attention module to reduce ambiguity in large-scale scenes and integrate Bird's Eye View (BEV) features to incorporate semantic information about scene elements. Together, these enhancements significantly boost the network's generalization performance. We validated our approach through various generalization experiments in multiple outdoor scenes. In cross-distance generalization experiments on KITTI and nuScenes, UGP achieved state-of-the-art mean Registration Recall rates of 94.5% and 91.4%, respectively. In cross-dataset generalization from nuScenes to KITTI, UGP achieved a state-of-the-art mean Registration Recall of 90.9%. Code will be available at https://github.com/peakpang/UGP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10149v1-abstract-full').style.display = 'none'; document.getElementById('2503.10149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10086">arXiv:2503.10086</a> <span> [<a href="https://arxiv.org/pdf/2503.10086">pdf</a>, <a href="https://arxiv.org/format/2503.10086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Efficient Adapter Tuning for Joint Singing Voice Beat and Downbeat Tracking with Self-supervised Learning Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+Y">Yaolong Ju</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Lui%2C+S">Simon Lui</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10086v1-abstract-short" style="display: inline;"> Singing voice beat tracking is a challenging task, due to the lack of musical accompaniment that often contains robust rhythmic and harmonic patterns, something most existing beat tracking systems utilize and can be essential for estimating beats. In this paper, a novel temporal convolutional network-based beat-tracking approach featuring self-supervised learning (SSL) representations and adapter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10086v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10086v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10086v1-abstract-full" style="display: none;"> Singing voice beat tracking is a challenging task, due to the lack of musical accompaniment that often contains robust rhythmic and harmonic patterns, something most existing beat tracking systems utilize and can be essential for estimating beats. In this paper, a novel temporal convolutional network-based beat-tracking approach featuring self-supervised learning (SSL) representations and adapter tuning is proposed to track the beat and downbeat of singing voices jointly. The SSL DistilHuBERT representations are utilized to capture the semantic information of singing voices and are further fused with the generic spectral features to facilitate beat estimation. Sources of variabilities that are particularly prominent with the non-homogeneous singing voice data are reduced by the efficient adapter tuning. Extensive experiments show that feature fusion and adapter tuning improve the performance individually, and the combination of both leads to significantly better performances than the un-adapted baseline system, with up to 31.6% and 42.4% absolute F1-score improvements on beat and downbeat tracking, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10086v1-abstract-full').style.display = 'none'; document.getElementById('2503.10086v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISMIR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09814">arXiv:2503.09814</a> <span> [<a href="https://arxiv.org/pdf/2503.09814">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.cossms.2025.101214">10.1016/j.cossms.2025.101214 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A practical guide to machine learning interatomic potentials -- Status and future </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jacobs%2C+R">Ryan Jacobs</a>, <a href="/search/cs?searchtype=author&query=Morgan%2C+D">Dane Morgan</a>, <a href="/search/cs?searchtype=author&query=Attarian%2C+S">Siamak Attarian</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jun Meng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenghao Wu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C+Y">Clare Yijia Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J+H">Julia H. Yang</a>, <a href="/search/cs?searchtype=author&query=Artrith%2C+N">Nongnuch Artrith</a>, <a href="/search/cs?searchtype=author&query=Blaiszik%2C+B">Ben Blaiszik</a>, <a href="/search/cs?searchtype=author&query=Ceder%2C+G">Gerbrand Ceder</a>, <a href="/search/cs?searchtype=author&query=Choudhary%2C+K">Kamal Choudhary</a>, <a href="/search/cs?searchtype=author&query=Csanyi%2C+G">Gabor Csanyi</a>, <a href="/search/cs?searchtype=author&query=Cubuk%2C+E+D">Ekin Dogus Cubuk</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+B">Bowen Deng</a>, <a href="/search/cs?searchtype=author&query=Drautz%2C+R">Ralf Drautz</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiang Fu</a>, <a href="/search/cs?searchtype=author&query=Godwin%2C+J">Jonathan Godwin</a>, <a href="/search/cs?searchtype=author&query=Honavar%2C+V">Vasant Honavar</a>, <a href="/search/cs?searchtype=author&query=Isayev%2C+O">Olexandr Isayev</a>, <a href="/search/cs?searchtype=author&query=Johansson%2C+A">Anders Johansson</a>, <a href="/search/cs?searchtype=author&query=Kozinsky%2C+B">Boris Kozinsky</a>, <a href="/search/cs?searchtype=author&query=Martiniani%2C+S">Stefano Martiniani</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+S+P">Shyue Ping Ong</a>, <a href="/search/cs?searchtype=author&query=Poltavsky%2C+I">Igor Poltavsky</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09814v1-abstract-short" style="display: inline;"> The rapid development and large body of literature on machine learning interatomic potentials (MLIPs) can make it difficult to know how to proceed for researchers who are not experts but wish to use these tools. The spirit of this review is to help such researchers by serving as a practical, accessible guide to the state-of-the-art in MLIPs. This review paper covers a broad range of topics related… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09814v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09814v1-abstract-full" style="display: none;"> The rapid development and large body of literature on machine learning interatomic potentials (MLIPs) can make it difficult to know how to proceed for researchers who are not experts but wish to use these tools. The spirit of this review is to help such researchers by serving as a practical, accessible guide to the state-of-the-art in MLIPs. This review paper covers a broad range of topics related to MLIPs, including (i) central aspects of how and why MLIPs are enablers of many exciting advancements in molecular modeling, (ii) the main underpinnings of different types of MLIPs, including their basic structure and formalism, (iii) the potentially transformative impact of universal MLIPs for both organic and inorganic systems, including an overview of the most recent advances, capabilities, downsides, and potential applications of this nascent class of MLIPs, (iv) a practical guide for estimating and understanding the execution speed of MLIPs, including guidance for users based on hardware availability, type of MLIP used, and prospective simulation size and time, (v) a manual for what MLIP a user should choose for a given application by considering hardware resources, speed requirements, energy and force accuracy requirements, as well as guidance for choosing pre-trained potentials or fitting a new potential from scratch, (vi) discussion around MLIP infrastructure, including sources of training data, pre-trained potentials, and hardware resources for training, (vii) summary of some key limitations of present MLIPs and current approaches to mitigate such limitations, including methods of including long-range interactions, handling magnetic systems, and treatment of excited states, and finally (viii) we finish with some more speculative thoughts on what the future holds for the development and application of MLIPs over the next 3-10+ years. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09814v1-abstract-full').style.display = 'none'; document.getElementById('2503.09814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Current Opinion in Solid State and Materials Science, 35, 101214 (2025) </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository