
Search | arXiv e-print repository

Showing 1–50 of 2,837 results for author: Li, L

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.

1. arXiv:2503.19449 [pdf, other]  cs.SE cs.AI cs.LG cs.PF
   VecTrans: LLM Transformation Framework for Better Auto-vectorization on High-performance CPU
   Authors: Zhongchun Zheng, Long Cheng, Lu Li, Rodrigo C. O. Rocha, Tianyi Liu, Wei Wei, Xianwei Zhang, Yaoqing Gao
   Abstract: Large language models (LLMs) have demonstrated great capabilities in code generation, yet their effective application in compiler optimizations remains an open challenge due to issues such as hallucinations and a lack of domain-specific reasoning. Vectorization, a crucial optimization for enhancing code performance, often fails because of the compiler's inability to recognize complex code patterns, which commonly require extensive empirical expertise. LLMs, with their ability to capture intricate patterns, thus provide a promising solution to this challenge. This paper presents VecTrans, a novel framework that leverages LLMs to enhance compiler-based code vectorization. VecTrans first employs compiler analysis to identify potentially vectorizable code regions. It then utilizes an LLM to refactor these regions into patterns that are more amenable to the compiler's auto-vectorization. To ensure semantic correctness, VecTrans further integrates a hybrid validation mechanism at the intermediate representation (IR) level. With the above efforts, VecTrans combines the adaptability of LLMs with the precision of compiler vectorization, thereby effectively opening up vectorization opportunities. Experimental results show that among all 50 TSVC functions unvectorizable by Clang, GCC, and BiShengCompiler, VecTrans successfully vectorizes 23 cases (46%) and achieves an average speedup of 2.02x, greatly surpassing state-of-the-art performance.
   Submitted 25 March, 2025; originally announced March 2025.

2. arXiv:2503.19312 [pdf, other]  cs.CV
   ImageGen-CoT: Enhancing Text-to-Image In-context Learning with Chain-of-Thought Reasoning
   Authors: Jiaqi Liao, Zhengyuan Yang, Linjie Li, Dianqi Li, Kevin Lin, Yu Cheng, Lijuan Wang
   Abstract: In this work, we study the problem of Text-to-Image In-Context Learning (T2I-ICL). While Unified Multimodal LLMs (MLLMs) have advanced rapidly in recent years, they struggle with contextual reasoning in T2I-ICL scenarios. To address this limitation, we propose a novel framework that incorporates a thought process called ImageGen-CoT prior to image generation. To avoid generating unstructured ineffective reasoning steps, we develop an automatic pipeline to curate a high-quality ImageGen-CoT dataset. We then fine-tune MLLMs using this dataset to enhance their contextual reasoning capabilities. To further enhance performance, we explore test-time scale-up strategies and propose a novel hybrid scaling approach. This approach first generates multiple ImageGen-CoT chains and then produces multiple images for each chain via sampling. Extensive experiments demonstrate the effectiveness of our proposed method. Notably, fine-tuning with the ImageGen-CoT dataset leads to a substantial 80% performance gain for SEED-X on T2I-ICL tasks. See our project page at https://ImageGen-CoT.github.io/. Code and model weights will be open-sourced.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: Project Page: https://ImageGen-CoT.github.io/

3. arXiv:2503.18948 [pdf, other]  cs.CV
   Equivariant Image Modeling
   Authors: Ruixiao Dong, Mengde Xu, Zigang Geng, Li Li, Han Hu, Shuyang Gu
   Abstract: Current generative models, such as autoregressive and diffusion approaches, decompose high-dimensional data distribution learning into a series of simpler subtasks. However, inherent conflicts arise during the joint optimization of these subtasks, and existing solutions fail to resolve such conflicts without sacrificing efficiency or scalability. We propose a novel equivariant image modeling framework that inherently aligns optimization targets across subtasks by leveraging the translation invariance of natural visual signals. Our method introduces (1) column-wise tokenization which enhances translational symmetry along the horizontal axis, and (2) windowed causal attention which enforces consistent contextual relationships across positions.
   Evaluated on class-conditioned ImageNet generation at 256x256 resolution, our approach achieves performance comparable to state-of-the-art AR models while using fewer computational resources. Systematic analysis demonstrates that enhanced equivariance reduces inter-task conflicts, significantly improving zero-shot generalization and enabling ultra-long image synthesis. This work establishes the first framework for task-aligned decomposition in generative modeling, offering insights into efficient parameter sharing and conflict-free optimization. The code and models are publicly available at https://github.com/drx-code/EquivariantModeling.
   Submitted 24 March, 2025; originally announced March 2025.

4. arXiv:2503.18783 [pdf, other]  cs.CV cs.AI
   Frequency Dynamic Convolution for Dense Image Prediction
   Authors: Linwei Chen, Lin Gu, Liang Li, Chenggang Yan, Ying Fu
   Abstract: While Dynamic Convolution (DY-Conv) has shown promising performance by enabling adaptive weight selection through multiple parallel weights combined with an attention mechanism, the frequency response of these weights tends to exhibit high similarity, resulting in high parameter costs but limited adaptability.
   In this work, we introduce Frequency Dynamic Convolution (FDConv), a novel approach that mitigates these limitations by learning a fixed parameter budget in the Fourier domain. FDConv divides this budget into frequency-based groups with disjoint Fourier indices, enabling the construction of frequency-diverse weights without increasing the parameter cost. To further enhance adaptability, we propose Kernel Spatial Modulation (KSM) and Frequency Band Modulation (FBM). KSM dynamically adjusts the frequency response of each filter at the spatial level, while FBM decomposes weights into distinct frequency bands in the frequency domain and modulates them dynamically based on local content. Extensive experiments on object detection, segmentation, and classification validate the effectiveness of FDConv. We demonstrate that when applied to ResNet-50, FDConv achieves superior performance with a modest increase of +3.6M parameters, outperforming previous methods that require substantial increases in parameter budgets (e.g., CondConv +90M, KW +76.5M). Moreover, FDConv seamlessly integrates into a variety of architectures, including ConvNeXt and Swin-Transformer, offering a flexible and efficient solution for modern vision tasks. The code is made publicly available at https://github.com/Linwei-Chen/FDConv.
   Submitted 24 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.
   Comments: Accepted by CVPR 2025

5. arXiv:2503.18718 [pdf, other]  cs.CV eess.IV
   GS-Marker: Generalizable and Robust Watermarking for 3D Gaussian Splatting
   Authors: Lijiang Li, Jinglu Wang, Xiang Ming, Yan Lu
   Abstract: In the Generative AI era, safeguarding 3D models has become increasingly urgent. While invisible watermarking is well-established for 2D images with encoder-decoder frameworks, generalizable and robust solutions for 3D remain elusive. The main difficulty arises from the renderer between the 3D encoder and 2D decoder, which disrupts direct gradient flow and complicates training. Existing 3D methods typically rely on per-scene iterative optimization, resulting in time inefficiency and limited generalization. In this work, we propose a single-pass watermarking approach for 3D Gaussian Splatting (3DGS), a well-known yet underexplored representation for watermarking. We identify two major challenges: (1) ensuring effective training generalized across diverse 3D models, and (2) reliably extracting watermarks from free-view renderings, even under distortions. Our framework, named GS-Marker, incorporates a 3D encoder to embed messages, distortion layers to enhance resilience against various distortions, and a 2D decoder to extract watermarks from renderings. A key innovation is the Adaptive Marker Control mechanism that adaptively perturbs the initially optimized 3DGS, escaping local minima and improving both training stability and convergence. Extensive experiments show that GS-Marker outperforms per-scene training approaches in terms of decoding accuracy and model fidelity, while also significantly reducing computation time.
   Submitted 24 March, 2025; originally announced March 2025.

6. arXiv:2503.18461 [pdf, other]  cs.CV
   MuMA: 3D PBR Texturing via Multi-Channel Multi-View Generation and Agentic Post-Processing
   Authors: Lingting Zhu, Jingrui Ye, Runze Zhang, Zeyu Hu, Yingda Yin, Lanjiong Li, Jinnan Chen, Shengju Qian, Xin Wang, Qingmin Liao, Lequan Yu
   Abstract: Current methods for 3D generation still fall short in physically based rendering (PBR) texturing, primarily due to limited data and challenges in modeling multi-channel materials. In this work, we propose MuMA, a method for 3D PBR texturing through Multi-channel Multi-view generation and Agentic post-processing. Our approach features two key innovations: 1) We opt to model shaded and albedo appearance channels, where the shaded channels enable the integration of intrinsic decomposition modules for material properties. 2) Leveraging multimodal large language models, we emulate artists' techniques for material assessment and selection. Experiments demonstrate that MuMA achieves superior results in visual quality and material fidelity compared to existing methods.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: 17 pages, 14 figures

7. arXiv:2503.18283 [pdf, other]  cs.CV cs.AI
   Voxel-based Point Cloud Geometry Compression with Space-to-Channel Context
   Authors: Bojun Liu, Yangzhi Ma, Ao Luo, Li Li, Dong Liu
   Abstract: Voxel-based methods are among the most efficient for point cloud geometry compression, particularly with dense point clouds. However, they face limitations due to a restricted receptive field, especially when handling high-bit depth point clouds. To overcome this issue, we introduce a stage-wise Space-to-Channel (S2C) context model for both dense point clouds and low-level sparse point clouds. This model utilizes a channel-wise autoregressive strategy to effectively integrate neighborhood information at a coarse resolution. For high-level sparse point clouds, we further propose a level-wise S2C context model that addresses resolution limitations by incorporating Geometry Residual Coding (GRC) for consistent-resolution cross-level prediction.
   Additionally, we use the spherical coordinate system for its compact representation and enhance our GRC approach with a Residual Probability Approximation (RPA) module, which features a large kernel size. Experimental results show that our S2C context model not only achieves bit savings while maintaining or improving reconstruction quality but also reduces computational complexity compared to state-of-the-art voxel-based compression methods.
   Submitted 23 March, 2025; originally announced March 2025.

8. arXiv:2503.18055 [pdf, other]  cs.CV
   PolarFree: Polarization-based Reflection-free Imaging
   Authors: Mingde Yao, Menglu Wang, King-Man Tam, Lingen Li, Tianfan Xue, Jinwei Gu
   Abstract: Reflection removal is challenging due to complex light interactions, where reflections obscure important details and hinder scene understanding. Polarization naturally provides a powerful cue to distinguish between reflected and transmitted light, enabling more accurate reflection removal. However, existing methods often rely on small-scale or synthetic datasets, which fail to capture the diversity and complexity of real-world scenarios. To this end, we construct a large-scale dataset, PolaRGB, for polarization-based reflection removal of RGB images, which enables us to train models that generalize effectively across a wide range of real-world scenarios. The PolaRGB dataset contains 6,500 well-aligned mixed-transmission image pairs, 8x larger than existing polarization datasets, and is the first to include both RGB and polarization images captured across diverse indoor and outdoor environments with varying lighting conditions. Besides, to fully exploit the potential of polarization cues for reflection removal, we introduce PolarFree, which leverages a diffusion process to generate reflection-free cues for accurate reflection removal. Extensive experiments show that PolarFree significantly enhances image clarity in challenging reflective scenarios, setting a new benchmark for polarized imaging and reflection removal. Code and dataset are available at https://github.com/mdyao/PolarFree.
   Submitted 23 March, 2025; originally announced March 2025.
   Comments: Accepted to CVPR 2025

9. arXiv:2503.17870 [pdf, other]  cond-mat.mtrl-sci cond-mat.stat-mech cs.CE cs.LG
   Accelerating and enhancing thermodynamic simulations of electrochemical interfaces
   Authors: Xiaochen Du, Mengren Liu, Jiayu Peng, Hoje Chun, Alexander Hoffman, Bilge Yildiz, Lin Li, Martin Z. Bazant, Rafael Gómez-Bombarelli
   Abstract: Electrochemical interfaces are crucial in catalysis, energy storage, and corrosion, where their stability and reactivity depend on complex interactions between the electrode, adsorbates, and electrolyte. Predicting stable surface structures remains challenging, as traditional surface Pourbaix diagrams tend to either rely on expert knowledge or costly ab initio sampling, and neglect thermodynamic equilibration with the environment. Machine learning (ML) potentials can accelerate static modeling but often overlook dynamic surface transformations. Here, we extend the Virtual Surface Site Relaxation-Monte Carlo (VSSR-MC) method to autonomously sample surface reconstructions modeled under aqueous electrochemical conditions. Through fine-tuning foundational ML force fields, we accurately and efficiently predict surface energetics, recovering known Pt(111) phases and revealing new LaMnO$_3$(001) surface reconstructions. By explicitly accounting for bulk-electrolyte equilibria, our framework enhances electrochemical stability predictions, offering a scalable approach to understanding and designing materials for electrochemical applications.
   Submitted 22 March, 2025; originally announced March 2025.
   Comments: 19 pages main text, 5 figures, supplementary information (SI) in ancillary files

10. arXiv:2503.17690 [pdf, other]  cs.CV
   CountLLM: Towards Generalizable Repetitive Action Counting via Large Language Model
   Authors: Ziyu Yao, Xuxin Cheng, Zhiqi Huang, Lei Li
   Abstract: Repetitive action counting, which aims to count periodic movements in a video, is valuable for video analysis applications such as fitness monitoring. However, existing methods largely rely on regression networks with limited representational capacity, which hampers their ability to accurately capture variable periodic patterns. Additionally, their supervised learning on narrow, limited training sets leads to overfitting and restricts their ability to generalize across diverse scenarios. To address these challenges, we propose CountLLM, the first large language model (LLM)-based framework that takes video data and periodic text prompts as inputs and outputs the desired counting value. CountLLM leverages the rich clues from explicit textual instructions and the powerful representational capabilities of pre-trained LLMs for repetitive action counting. To effectively guide CountLLM, we develop a periodicity-based structured template for instructions that describes the properties of periodicity and implements a standardized answer format to ensure consistency. Additionally, we propose a progressive multimodal training paradigm to enhance the periodicity-awareness of the LLM. Empirical evaluations on widely recognized benchmarks demonstrate CountLLM's superior performance and generalization, particularly in handling novel and out-of-domain actions that deviate significantly from the training data, offering a promising avenue for repetitive action counting.
   Submitted 22 March, 2025; originally announced March 2025.
   Comments: Accepted by CVPR 2025

11. arXiv:2503.17662 [pdf, other]  cs.CL
   Enhancing Persona Consistency for LLMs' Role-Playing using Persona-Aware Contrastive Learning
   Authors: Ke Ji, Yixin Lian, Linxu Li, Jingsheng Gao, Weiyuan Li, Bin Dai
   Abstract: In recent years, large language models (LLMs) have achieved breakthrough progress in many dialogue generation tasks. However, their lack of emotion and fine-grained role awareness further limits the model's ability to provide personalized and diverse interactions. Current methods face high costs in collecting high-quality annotated data for scenarios such as role-playing, and traditional human alignment methods are difficult to deploy due to the inherent diversity of model behavior in role-playing scenarios.
   Inspired by the alignment of models for safety behaviors through RLHF (Reinforcement Learning from Human Feedback), in this paper, we revisit model role-playing behavior from the perspective of persona alignment and propose a novel annotation-free framework named Persona-Aware Contrastive Learning (PCL) to align LLMs' behavior during role-playing, enhancing the model's role consistency. Specifically, we first design a role chain method to encourage the model to self-question based on the role characteristics and dialogue context to adjust personality consistency. Then, we further enhance the model's role-playing strategy through iterative contrastive learning between using and not using role characteristics. Experiments on both black-box and white-box LLMs show that LLMs equipped with PCL significantly outperform vanilla LLMs under automatic evaluation methods (CharEval & GPT-4) and human expert evaluation.
   Submitted 25 March, 2025; v1 submitted 22 March, 2025; originally announced March 2025.
   Comments: 18 pages, 4 figures

12. arXiv:2503.17651 [pdf, other]  cs.CV
   Collaborative Temporal Consistency Learning for Point-supervised Natural Language Video Localization
   Authors: Zhuo Tao, Liang Li, Qi Chen, Yunbin Tu, Zheng-Jun Zha, Ming-Hsuan Yang, Yuankai Qi, Qingming Huang
   Abstract: Natural language video localization (NLVL) is a crucial task in video understanding that aims to localize the target moment in videos specified by a given language description. Recently, a point-supervised paradigm has been presented to address this task, requiring only a single annotated frame within the target moment rather than complete temporal boundaries. Compared with the fully-supervised paradigm, it offers a balance between localization accuracy and annotation cost. However, due to the absence of complete annotation, it is challenging to align the video content with language descriptions, consequently hindering accurate moment prediction. To address this problem, we propose a new COllaborative Temporal consistEncy Learning (COTEL) framework that leverages the synergy between saliency detection and moment localization to strengthen the video-language alignment. Specifically, we first design a frame- and a segment-level Temporal Consistency Learning (TCL) module that models semantic alignment across frame saliencies and sentence-moment pairs. Then, we design a cross-consistency guidance scheme, including a Frame-level Consistency Guidance (FCG) and a Segment-level Consistency Guidance (SCG), that enables the two temporal consistency learning paths to reinforce each other mutually. Further, we introduce a Hierarchical Contrastive Alignment Loss (HCAL) to comprehensively align the video and text query. Extensive experiments on two benchmarks demonstrate that our method performs favorably against SoTA approaches. We will release all the source code.
   Submitted 22 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17281">arXiv:2503.17281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17281">pdf</a>, <a href="https://arxiv.org/format/2503.17281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning disentangled representations for instrument-based music similarity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hashizume%2C+Y">Yuka Hashizume</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&amp;query=Miyashita%2C+A">Atsushi Miyashita</a>, <a href="/search/cs?searchtype=author&amp;query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17281v1-abstract-short" style="display: inline;"> A flexible recommendation and retrieval system requires music similarity in terms of multiple partial elements of musical pieces to allow users to select the element they want to focus on. A method for music similarity learning using multiple networks with individual instrumental signals is effective but faces the problem that using each clean instrumental signal as a query is impractical for retr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17281v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17281v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17281v1-abstract-full" style="display: none;"> A flexible recommendation and retrieval system requires music similarity in terms of multiple partial elements of musical pieces to allow users to select the element they want to focus on. A method for music similarity learning using multiple networks with individual instrumental signals is effective but faces the problem that using each clean instrumental signal as a query is impractical for retrieval systems and using separated instrumental sounds reduces accuracy owing to artifacts. In this paper, we present instrumental-part-based music similarity learning with a single network that takes mixed sounds as input instead of individual instrumental sounds. Specifically, we designed a single similarity embedding space with disentangled dimensions for each instrument, extracted by Conditional Similarity Networks, which are trained using the triplet loss with masks. 
Experimental results showed that (1) the proposed method can obtain more accurate feature representation than using individual networks using separated sounds as input in the evaluation of an instrument that had low accuracy, (2) each sub-embedding space can hold the characteristics of the corresponding instrument, and (3) the selection of similar musical pieces focusing on each instrumental sound by the proposed method can obtain human acceptance, especially when focusing on timbre. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17281v1-abstract-full').style.display = 'none'; document.getElementById('2503.17281v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2404.06682</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16929">arXiv:2503.16929</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16929">pdf</a>, <a href="https://arxiv.org/format/2503.16929">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TEMPO: Temporal Preference Optimization of Video LLMs via Difficulty Scheduling and Pre-SFT Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shicheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+K">Kun Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+S">Shuhuai Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuanxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fuzheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xu Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16929v1-abstract-short" style="display: inline;"> Video Large Language Models (Video LLMs) have achieved significant success by leveraging a two-stage paradigm: pretraining on large-scale video-text data for vision-language alignment, followed by supervised fine-tuning (SFT) for task-specific capabilities. 
However, existing approaches struggle with temporal reasoning due to weak temporal correspondence in the data and reliance on the next-token p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16929v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16929v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16929v1-abstract-full" style="display: none;"> Video Large Language Models (Video LLMs) have achieved significant success by leveraging a two-stage paradigm: pretraining on large-scale video-text data for vision-language alignment, followed by supervised fine-tuning (SFT) for task-specific capabilities. However, existing approaches struggle with temporal reasoning due to weak temporal correspondence in the data and reliance on the next-token prediction paradigm during training. To address these limitations, we propose TEMPO (TEMporal Preference Optimization), a systematic framework that enhances Video LLMs&#39; temporal reasoning capabilities through Direct Preference Optimization (DPO). To facilitate this, we introduce an automated preference data generation pipeline that systematically constructs preference pairs by selecting videos that are rich in temporal information, designing video-specific perturbation strategies, and finally evaluating model responses on clean and perturbed video inputs. Our temporal alignment features two key innovations: curriculum learning, which progressively increases perturbation difficulty to improve model robustness and adaptability; and &quot;Pre-SFT Alignment&quot;, applying preference optimization before instruction tuning to prioritize fine-grained temporal comprehension. Extensive experiments demonstrate that our approach consistently improves Video LLM performance across multiple benchmarks with a relatively small set of self-generated DPO data. We further analyze the transferability of DPO data across architectures and the role of difficulty scheduling in optimization. Our findings highlight TEMPO as a scalable and efficient complement to SFT-based methods, paving the way for developing reliable Video LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16929v1-abstract-full').style.display = 'none'; document.getElementById('2503.16929v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
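<p class="is-size-7"><span class="has-text-grey-dark">For readers unfamiliar with Direct Preference Optimization, the sketch below shows the standard DPO objective that preference-based post-training such as TEMPO builds on. It is illustrative only and not code from the paper; the tensor names and the toy usage are invented for the example.</span></p>
<pre><code class="language-python">
# Minimal sketch of the standard DPO loss, computed from per-example
# sequence log-probabilities under the trainable policy and a frozen
# reference model. Names are illustrative.
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # Log-ratios of policy to reference for preferred and dispreferred responses.
    chosen_logratios = policy_chosen_logps - ref_chosen_logps
    rejected_logratios = policy_rejected_logps - ref_rejected_logps
    # Bradley-Terry style preference objective scaled by beta.
    logits = beta * (chosen_logratios - rejected_logratios)
    return -F.logsigmoid(logits).mean()

# Toy usage with random log-probabilities for a batch of 4 preference pairs.
base = torch.randn(4)
loss = dpo_loss(base - 0.5, base - 1.2, base - 0.6, base - 0.9)
</code></pre>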
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14402">arXiv:2503.14402</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.14402">pdf</a>, <a href="https://arxiv.org/format/2503.14402">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion-based Facial Aesthetics Enhancement with 3D Structure Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lisha Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+J">Jingwen Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weide Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yuming Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jiebin Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14402v1-abstract-short" style="display: inline;"> Facial Aesthetics Enhancement (FAE) aims to improve facial attractiveness by adjusting the structure and appearance of a facial image while preserving its identity as much as possible. Most existing methods adopted deep feature-based or score-based guidance for generation models to conduct FAE. Although these methods achieved promising results, they potentially produced excessively beautified resu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14402v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14402v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14402v1-abstract-full" style="display: none;"> Facial Aesthetics Enhancement (FAE) aims to improve facial attractiveness by adjusting the structure and appearance of a facial image while preserving its identity as much as possible. Most existing methods adopted deep feature-based or score-based guidance for generation models to conduct FAE. Although these methods achieved promising results, they potentially produced excessively beautified results with lower identity consistency or insufficiently improved facial attractiveness. To enhance facial aesthetics with less loss of identity, we propose the Nearest Neighbor Structure Guidance based on Diffusion (NNSG-Diffusion), a diffusion-based FAE method that beautifies a 2D facial image with 3D structure guidance. Specifically, we propose to extract FAE guidance from a nearest neighbor reference face. To allow for less change of facial structures in the FAE process, a 3D face model is recovered by referring to both the matched 2D reference face and the 2D input face, so that the depth and contour guidance can be extracted from the 3D face model. Then the depth and contour clues can provide effective guidance to Stable Diffusion with ControlNet for FAE. Extensive experiments demonstrate that our method is superior to previous relevant methods in enhancing facial aesthetics while preserving facial identity. 
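<p class="is-size-7"><span class="has-text-grey-dark">As a rough illustration of the depth-conditioned generation that the abstract above relies on, the following sketch uses the diffusers library&#39;s ControlNet pipeline with a depth map. It is not the NNSG-Diffusion implementation; the model ids, file names, and prompt are stand-ins.</span></p>
<pre><code class="language-python">
# Illustrative only: depth-map conditioning for Stable Diffusion via ControlNet.
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from PIL import Image

# Hypothetical depth map, e.g. rendered from a recovered 3D face model.
depth_map = Image.open("face_depth.png")

controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

result = pipe("a photorealistic portrait, natural skin texture",
              image=depth_map,              # structure guidance from the depth map
              num_inference_steps=30).images[0]
result.save("enhanced_face.png")
</code></pre>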
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14402v1-abstract-full').style.display = 'none'; document.getElementById('2503.14402v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14021">arXiv:2503.14021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.14021">pdf</a>, <a href="https://arxiv.org/format/2503.14021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> MP-GUI: Modality Perception with MLLMs for GUI Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weizhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Leyang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Sheng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shengchu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+H">Hanbei Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+J">Jiongchao Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liangcheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zirui Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+J">Jiajun Bu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14021v1-abstract-short" style="display: inline;"> Graphical user interface (GUI) has become integral to modern society, making it crucial to be understood for human-centric systems. However, unlike natural images or documents, GUIs comprise artificially designed graphical elements arranged to convey specific semantic meanings. Current multi-modal large language models (MLLMs) already proficient in processing graphical and textual components suffe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14021v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14021v1-abstract-full" style="display: none;"> Graphical user interface (GUI) has become integral to modern society, making it crucial to be understood for human-centric systems. However, unlike natural images or documents, GUIs comprise artificially designed graphical elements arranged to convey specific semantic meanings. 
Current multi-modal large language models (MLLMs), already proficient in processing graphical and textual components, suffer from hurdles in GUI understanding due to the lack of explicit spatial structure modeling. Moreover, obtaining high-quality spatial structure data is challenging due to privacy issues and noisy environments. To address these challenges, we present MP-GUI, a specially designed MLLM for GUI understanding. MP-GUI features three precisely specialized perceivers to extract graphical, textual, and spatial modalities from the screen as GUI-tailored visual clues, refined with a spatial structure refinement strategy and adaptively combined via a fusion gate to meet the specific preferences of different GUI understanding tasks. To cope with the scarcity of training data, we also introduce a pipeline for automatic data collection. Extensive experiments demonstrate that MP-GUI achieves impressive results on various GUI understanding tasks with limited data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14021v1-abstract-full').style.display = 'none'; document.getElementById('2503.14021v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13957">arXiv:2503.13957</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13957">pdf</a>, <a href="https://arxiv.org/format/2503.13957">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DIFFVSGG: Diffusion-Driven Online Video Scene Graph Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liulei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenguan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13957v1-abstract-short" style="display: inline;"> Top-leading solutions for Video Scene Graph Generation (VSGG) typically adopt an offline pipeline. Though demonstrating promising performance, they remain unable to handle real-time video streams and consume large GPU memory. Moreover, these approaches fall short in temporal reasoning, merely aggregating frame-level predictions over a temporal context. 
In response, we introduce DIFFVSGG, an online&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13957v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13957v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13957v1-abstract-full" style="display: none;"> Top-leading solutions for Video Scene Graph Generation (VSGG) typically adopt an offline pipeline. Though demonstrating promising performance, they remain unable to handle real-time video streams and consume large GPU memory. Moreover, these approaches fall short in temporal reasoning, merely aggregating frame-level predictions over a temporal context. In response, we introduce DIFFVSGG, an online VSGG solution that frames this task as an iterative scene graph update problem. Drawing inspiration from Latent Diffusion Models (LDMs), which generate images by denoising a latent feature embedding, we unify the decoding of three tasks, namely object classification, bounding box regression, and graph generation, using one shared feature embedding. Then, given an embedding containing unified features of object pairs, we conduct step-wise denoising on it within LDMs, so as to deliver a clean embedding which clearly indicates the relationships between objects. This embedding then serves as the input to task-specific heads for object classification, scene graph generation, etc. DIFFVSGG further facilitates continuous temporal reasoning, where predictions for subsequent frames leverage results of past frames as the conditional inputs of LDMs, to guide the reverse diffusion process for current frames. Extensive experiments on three setups of Action Genome demonstrate the superiority of DIFFVSGG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13957v1-abstract-full').style.display = 'none'; document.getElementById('2503.13957v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025, Code: https://github.com/kagawa588/DiffVsgg</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13946">arXiv:2503.13946</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13946">pdf</a>, <a href="https://arxiv.org/format/2503.13946">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Is Discretization Fusion All You Need for Collaborative Perception? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+K">Kang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+T">Tianci Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lantao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunxu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yongcai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Deying Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13946v1-abstract-short" style="display: inline;"> Collaborative perception in multi-agent systems enhances overall perceptual capabilities by facilitating the exchange of complementary information among agents. Current mainstream collaborative perception methods rely on discretized feature maps to conduct fusion, which, however, lacks flexibility in extracting and transmitting the informative features and can hardly focus on the informative feature&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13946v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13946v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13946v1-abstract-full" style="display: none;"> Collaborative perception in multi-agent systems enhances overall perceptual capabilities by facilitating the exchange of complementary information among agents. Current mainstream collaborative perception methods rely on discretized feature maps to conduct fusion, which, however, lacks flexibility in extracting and transmitting the informative features and can hardly focus on the informative features during fusion. To address these problems, this paper proposes a novel Anchor-Centric paradigm for Collaborative Object detection (ACCO). It avoids grid precision issues and allows more flexible and efficient anchor-centric communication and fusion. ACCO is composed of three main components: (1) Anchor featuring block (AFB), which aims to generate anchor proposals and projects prepared anchor queries to image features. (2) Anchor confidence generator (ACG) is designed to minimize communication by selecting only the features in the confident anchors to transmit. (3) A local-global fusion module, in which local fusion is anchor alignment-based fusion (LAAF) and global fusion is conducted by spatial-aware cross-attention (SACA). LAAF and SACA run in multiple layers, so agents conduct anchor-centric fusion iteratively to adjust the anchor proposals. Comprehensive experiments are conducted to evaluate ACCO on OPV2V and Dair-V2X datasets, which demonstrate ACCO&#39;s superiority in reducing the communication volume, and in improving the perception range and detection performance. Code can be found at: https://github.com/sidiangongyuan/ACCO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13946v1-abstract-full').style.display = 'none'; document.getElementById('2503.13946v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
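<p class="is-size-7"><span class="has-text-grey-dark">The communication-reduction idea described above, transmitting only the features of confident anchors, can be illustrated with a small generic sketch; it is not the ACCO code, and the tensor shapes and threshold are assumptions.</span></p>
<pre><code class="language-python">
# Toy sketch of confidence-gated feature selection before transmission.
import torch

def select_confident_anchors(anchor_feats, confidences, threshold=0.5):
    # anchor_feats: (N, C) per-anchor features; confidences: (N,) scores in [0, 1].
    keep = confidences.ge(threshold)            # boolean mask of anchors worth sending
    indices = keep.nonzero(as_tuple=True)[0]    # anchor ids, shared as lightweight metadata
    return anchor_feats[indices], indices

feats = torch.randn(128, 256)                   # hypothetical anchor features of one agent
conf = torch.rand(128)                          # hypothetical confidence scores
sent_feats, sent_ids = select_confident_anchors(feats, conf, threshold=0.7)
# Only sent_feats and sent_ids would be broadcast to collaborating agents.
</code></pre>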
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13819">arXiv:2503.13819</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13819">pdf</a>, <a href="https://arxiv.org/format/2503.13819">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> LLM-Empowered IoT for 6G Networks: Architecture, Challenges, and Solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaopei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zuguang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+F">Fei Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13819v1-abstract-short" style="display: inline;"> The Internet of Things (IoT) in the sixth generation (6G) era is envisioned to evolve towards intelligence, ubiquity, and self-optimization. Large language models (LLMs) have demonstrated remarkable generalization capabilities across diverse domains, including natural language processing (NLP), computer vision (CV), and beyond. In this article, we propose an LLM-empowered IoT architecture for 6G n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13819v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13819v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13819v1-abstract-full" style="display: none;"> The Internet of Things (IoT) in the sixth generation (6G) era is envisioned to evolve towards intelligence, ubiquity, and self-optimization. Large language models (LLMs) have demonstrated remarkable generalization capabilities across diverse domains, including natural language processing (NLP), computer vision (CV), and beyond. In this article, we propose an LLM-empowered IoT architecture for 6G networks to achieve intelligent autonomy while supporting advanced IoT applications. LLMs are pushed to the edge of the 6G network to support the synergy of LLMs and IoT. LLM solutions are tailored to both IoT application requirements and IoT management needs, i.e., LLM for IoT. On the other hand, edge inference and edge fine-tuning are discussed to support the deployment of LLMs, i.e., LLM on IoT. Furthermore, we propose a memory-efficient split federated learning (SFL) framework for LLM fine-tuning on heterogeneous IoT devices that alleviates memory pressures on both IoT devices and the edge server while achieving comparable performance and convergence time. Finally, a case study is presented, followed by a discussion about open issues of LLM-empowered IoT for 6G networks. 
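<p class="is-size-7"><span class="has-text-grey-dark">To make the split-learning idea above concrete, here is a deliberately small sketch of a model cut into a device-side front end and an edge-server back end; the layer counts, dimensions, and class names are invented and far smaller than any real LLM.</span></p>
<pre><code class="language-python">
# Conceptual sketch of split learning: the IoT device runs the first blocks
# and ships hidden activations to the edge server, which runs the rest.
import torch
import torch.nn as nn

class DeviceSide(nn.Module):
    def __init__(self, dim=256, layers=2):
        super().__init__()
        self.blocks = nn.Sequential(*[
            nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
            for _ in range(layers)])
    def forward(self, x):
        return self.blocks(x)                    # activations sent over the network

class ServerSide(nn.Module):
    def __init__(self, dim=256, layers=4, vocab=32000):
        super().__init__()
        self.blocks = nn.Sequential(*[
            nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
            for _ in range(layers)])
        self.head = nn.Linear(dim, vocab)
    def forward(self, h):
        return self.head(self.blocks(h))

device_model, server_model = DeviceSide(), ServerSide()
hidden = device_model(torch.randn(1, 16, 256))   # computed on the IoT device
logits = server_model(hidden)                    # computed at the edge server
</code></pre>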
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13819v1-abstract-full').style.display = 'none'; document.getElementById('2503.13819v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13434">arXiv:2503.13434</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13434">pdf</a>, <a href="https://arxiv.org/format/2503.13434">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> BlobCtrl: A Unified and Flexible Framework for Element-level Image Generation and Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yaowei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lingen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhaoyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guangzhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongxiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cun%2C+X">Xiaodong Cun</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Y">Ying Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Y">Yuexian Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13434v1-abstract-short" style="display: inline;"> Element-level visual manipulation is essential in digital content creation, but current diffusion-based methods lack the precision and flexibility of traditional tools. In this work, we introduce BlobCtrl, a framework that unifies element-level generation and editing using a probabilistic blob-based representation. By employing blobs as visual primitives, our approach effectively decouples and rep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13434v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13434v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13434v1-abstract-full" style="display: none;"> Element-level visual manipulation is essential in digital content creation, but current diffusion-based methods lack the precision and flexibility of traditional tools. In this work, we introduce BlobCtrl, a framework that unifies element-level generation and editing using a probabilistic blob-based representation. By employing blobs as visual primitives, our approach effectively decouples and represents spatial location, semantic content, and identity information, enabling precise element-level manipulation. 
Our key contributions include: 1) a dual-branch diffusion architecture with hierarchical feature fusion for seamless foreground-background integration; 2) a self-supervised training paradigm with tailored data augmentation and score functions; and 3) controllable dropout strategies to balance fidelity and diversity. To support further research, we introduce BlobData for large-scale training and BlobBench for systematic evaluation. Experiments show that BlobCtrl excels in various element-level manipulation tasks while maintaining computational efficiency, offering a practical solution for precise and flexible visual content creation. Project page: https://liyaowei-stu.github.io/project/BlobCtrl/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13434v1-abstract-full').style.display = 'none'; document.getElementById('2503.13434v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Webpage: https://liyaowei-stu.github.io/project/BlobCtrl/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13004">arXiv:2503.13004</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13004">pdf</a>, <a href="https://arxiv.org/format/2503.13004">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TFDM: Time-Variant Frequency-Based Point Cloud Diffusion with Mamba </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaxu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shum%2C+H+P+H">Hubert P. H. Shum</a>, <a href="/search/cs?searchtype=author&amp;query=Breckon%2C+T+P">Toby P. Breckon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13004v1-abstract-short" style="display: inline;"> Diffusion models currently demonstrate impressive performance over various generative tasks. Recent work on image diffusion highlights the strong capabilities of Mamba (state space models) due to its efficient handling of long-range dependencies and sequential data modeling. Unfortunately, joint consideration of state space models with 3D point cloud generation remains limited. To harness the powe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13004v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13004v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13004v1-abstract-full" style="display: none;"> Diffusion models currently demonstrate impressive performance over various generative tasks. 
Recent work on image diffusion highlights the strong capabilities of Mamba (state space models) due to its efficient handling of long-range dependencies and sequential data modeling. Unfortunately, joint consideration of state space models with 3D point cloud generation remains limited. To harness the powerful capabilities of the Mamba model for 3D point cloud generation, we propose a novel diffusion framework containing a dual latent Mamba block (DM-Block) and a time-variant frequency encoder (TF-Encoder). The DM-Block applies a space-filling curve to reorder points into sequences suitable for Mamba state-space modeling, while operating in a latent space to mitigate the computational overhead that arises from direct 3D data processing. Meanwhile, the TF-Encoder takes advantage of the ability of the diffusion model to refine fine details in later recovery stages by prioritizing key points within the U-Net architecture. This frequency-based mechanism ensures enhanced detail quality in the final stages of generation. Experimental results on the ShapeNet-v2 dataset demonstrate that our method achieves state-of-the-art performance (ShapeNet-v2: 0.14% on 1-NNA-Abs50 EMD and 57.90% on COV EMD) on certain metrics for specific categories while reducing computational parameters and inference time by up to 10$\times$ and 9$\times$, respectively. Source code is available in Supplementary Materials and will be released upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13004v1-abstract-full').style.display = 'none'; document.getElementById('2503.13004v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12886">arXiv:2503.12886</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12886">pdf</a>, <a href="https://arxiv.org/format/2503.12886">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RGBAvatar: Reduced Gaussian Blendshapes for Online Modeling of Head Avatars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linzhou Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yumeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Weng%2C+Y">Yanlin Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Youyi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+K">Kun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12886v1-abstract-short" style="display: inline;"> We present Reduced Gaussian Blendshapes Avatar (RGBAvatar), a method for reconstructing photorealistic, animatable head avatars at speeds sufficient for on-the-fly reconstruction. 
Unlike prior approaches that utilize linear bases from 3D morphable models (3DMM) to model Gaussian blendshapes, our method maps tracked 3DMM parameters into reduced blendshape weights with an MLP, leading to a compact s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12886v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12886v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12886v1-abstract-full" style="display: none;"> We present Reduced Gaussian Blendshapes Avatar (RGBAvatar), a method for reconstructing photorealistic, animatable head avatars at speeds sufficient for on-the-fly reconstruction. Unlike prior approaches that utilize linear bases from 3D morphable models (3DMM) to model Gaussian blendshapes, our method maps tracked 3DMM parameters into reduced blendshape weights with an MLP, leading to a compact set of blendshape bases. The learned compact base composition effectively captures essential facial details for specific individuals, and does not rely on the fixed base composition weights of 3DMM, leading to enhanced reconstruction quality and higher efficiency. To further expedite the reconstruction process, we develop a novel color initialization estimation method and a batch-parallel Gaussian rasterization process, achieving state-of-the-art quality with training throughput of about 630 images per second. Moreover, we propose a local-global sampling strategy that enables direct on-the-fly reconstruction, immediately reconstructing the model as video streams in real time while achieving quality comparable to offline settings. Our source code is available at https://github.com/gapszju/RGBAvatar. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12886v1-abstract-full').style.display = 'none'; document.getElementById('2503.12886v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
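<p class="is-size-7"><span class="has-text-grey-dark">A toy sketch of the general pattern described above, an MLP that maps tracked 3DMM parameters to a small set of blendshape weights which blend learned per-Gaussian bases; every dimension and module name here is made up for illustration and does not come from the paper.</span></p>
<pre><code class="language-python">
# Toy reduced-blendshape regressor: 3DMM parameters to blended Gaussian attributes.
import torch
import torch.nn as nn

class ReducedBlendshapeHead(nn.Module):
    def __init__(self, num_params=62, num_bases=20, num_gaussians=4096, attr_dim=59):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(num_params, 128), nn.ReLU(),
            nn.Linear(128, num_bases), nn.Softmax(dim=-1))
        # Learned compact basis of per-Gaussian attribute offsets plus a mean shape.
        self.bases = nn.Parameter(torch.zeros(num_bases, num_gaussians, attr_dim))
        self.mean = nn.Parameter(torch.zeros(num_gaussians, attr_dim))

    def forward(self, params):
        weights = self.mlp(params)                              # (batch, num_bases)
        offsets = torch.einsum("bk,kna->bna", weights, self.bases)
        return self.mean + offsets                              # blended attributes per Gaussian

head = ReducedBlendshapeHead()
attrs = head(torch.randn(2, 62))    # two hypothetical frames of tracked 3DMM parameters
</code></pre>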
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12854">arXiv:2503.12854</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12854">pdf</a>, <a href="https://arxiv.org/format/2503.12854">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing LLM Reasoning with Iterative DPO: A Comprehensive Empirical Investigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tu%2C+S">Songjun Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jiahao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+X">Xiangyu Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qichao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linjing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yuqian Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+D">Dongbin Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12854v1-abstract-short" style="display: inline;"> Recent advancements in post-training methodologies for large language models (LLMs) have highlighted reinforcement learning (RL) as a critical component for enhancing reasoning. However, the substantial computational costs associated with RL-based approaches have led to growing interest in alternative paradigms, such as Direct Preference Optimization (DPO). In this study, we investigate the effect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12854v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12854v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12854v1-abstract-full" style="display: none;"> Recent advancements in post-training methodologies for large language models (LLMs) have highlighted reinforcement learning (RL) as a critical component for enhancing reasoning. However, the substantial computational costs associated with RL-based approaches have led to growing interest in alternative paradigms, such as Direct Preference Optimization (DPO). In this study, we investigate the effectiveness of DPO in facilitating self-improvement for LLMs through iterative preference-based learning. We demonstrate that a single round of DPO with coarse filtering significantly enhances mathematical reasoning performance, particularly for strong base model. Furthermore, we design an iterative enhancement framework for both the generator and the reward model (RM), enabling their mutual improvement through online interaction across multiple rounds of DPO. Finally, with simple verifiable rewards, our model DPO-VP achieves RL-level performance with significantly lower computational overhead. 
These findings highlight DPO as a scalable and cost-effective alternative to RL, offering a practical solution for enhancing LLM reasoning in resource-constrained situations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12854v1-abstract-full').style.display = 'none'; document.getElementById('2503.12854v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> Submitted to COLM 2025 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> COLM 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12847">arXiv:2503.12847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12847">pdf</a>, <a href="https://arxiv.org/format/2503.12847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Audio-Visual Segmentation via Audio-Guided Visual Convergent Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peike Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Liying Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dadong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lincheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12847v1-abstract-short" style="display: inline;"> Accurately localizing audible objects based on audio-visual cues is the core objective of audio-visual segmentation. Most previous methods emphasize spatial or temporal multi-modal modeling, yet overlook challenges from ambiguous audio-visual correspondences such as nearby visually similar but acoustically different objects and frequent shifts in objects&#39; sounding status. Consequently, they may st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12847v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12847v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12847v1-abstract-full" style="display: none;"> Accurately localizing audible objects based on audio-visual cues is the core objective of audio-visual segmentation. Most previous methods emphasize spatial or temporal multi-modal modeling, yet overlook challenges from ambiguous audio-visual correspondences such as nearby visually similar but acoustically different objects and frequent shifts in objects&#39; sounding status. 
Consequently, they may struggle to reliably correlate audio and visual cues, leading to over- or under-segmentation. To address these limitations, we propose a novel framework with two primary components: an audio-guided modality alignment (AMA) module and an uncertainty estimation (UE) module. Instead of indiscriminately correlating audio-visual cues through a global attention mechanism, AMA performs audio-visual interactions within multiple groups and consolidates group features into compact representations based on their responsiveness to audio cues, effectively directing the model&#39;s attention to audio-relevant areas. Leveraging contrastive learning, AMA further distinguishes sounding regions from silent areas by treating features with strong audio responses as positive samples and weaker responses as negatives. Additionally, UE integrates spatial and temporal information to identify high-uncertainty regions caused by frequent changes in sound state, reducing prediction errors by lowering confidence in these areas. Experimental results demonstrate that our approach achieves superior accuracy compared to existing state-of-the-art methods, particularly in challenging scenarios where traditional approaches struggle to maintain reliable segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12847v1-abstract-full').style.display = 'none'; document.getElementById('2503.12847v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12840">arXiv:2503.12840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12840">pdf</a>, <a href="https://arxiv.org/format/2503.12840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Derivation and Elimination: Audio Visual Segmentation with Enhanced Audio Semantics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Liying Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peike Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dadong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lincheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12840v1-abstract-short" style="display: inline;"> Sound-guided object segmentation has drawn considerable attention for its potential to enhance 
multimodal perception. Previous methods primarily focus on developing advanced architectures to facilitate effective audio-visual interactions, without fully addressing the inherent challenges posed by the nature of audio, i.e., (1) feature confusion due to the overlapping nature of audio signals, and (2&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12840v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12840v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12840v1-abstract-full" style="display: none;"> Sound-guided object segmentation has drawn considerable attention for its potential to enhance multimodal perception. Previous methods primarily focus on developing advanced architectures to facilitate effective audio-visual interactions, without fully addressing the inherent challenges posed by the nature of audio, i.e., (1) feature confusion due to the overlapping nature of audio signals, and (2) audio-visual matching difficulty from the varied sounds produced by the same object. To address these challenges, we propose Dynamic Derivation and Elimination (DDESeg): a novel audio-visual segmentation framework. Specifically, to mitigate feature confusion, DDESeg reconstructs the semantic content of the mixed audio signal by enriching the distinct semantic information of each individual source, deriving representations that preserve the unique characteristics of each sound. To reduce the matching difficulty, we introduce a discriminative feature learning module, which enhances the semantic distinctiveness of generated audio representations. Considering that not all derived audio representations directly correspond to visual features (e.g., off-screen sounds), we propose a dynamic elimination module to filter out non-matching elements. This module facilitates targeted interaction between sounding regions and relevant audio semantics. By scoring the interacted features, we identify and filter out irrelevant audio information, ensuring accurate audio-visual alignment. Comprehensive experiments demonstrate that our framework achieves superior performance on AVS datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12840v1-abstract-full').style.display = 'none'; document.getElementById('2503.12840v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12790">arXiv:2503.12790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12790">pdf</a>, <a href="https://arxiv.org/format/2503.12790">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Quantum-Enhanced LLM Efficient Fine Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kong%2C+X">Xiaofei Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+M">Menghan Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhaoyun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yuchun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+G">Guoping Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12790v1-abstract-short" style="display: inline;"> Low-Rank Adaptation (LoRA) enables efficient fine-tuning of pre-trained language models via low-rank matrix approximation, which is effective in many scenarios. However, its low-rank representation capacity is constrained in complex tasks or high-rank dependency settings, potentially limiting model adaptability. Addressing the expressive bottleneck of classical low-rank approximation in fine-tunin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12790v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12790v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12790v1-abstract-full" style="display: none;"> Low-Rank Adaptation (LoRA) enables efficient fine-tuning of pre-trained language models via low-rank matrix approximation, which is effective in many scenarios. However, its low-rank representation capacity is constrained in complex tasks or high-rank dependency settings, potentially limiting model adaptability. Addressing the expressive bottleneck of classical low-rank approximation in fine-tuning large language models, this paper proposes a parameter-efficient fine-tuning method based on a Quantum Weighted Tensor Hybrid Network (QWTHN), which leverages Quantum Neural Network (QNN). The study investigates quantum-classical hybrid parameter-efficient fine-tuning in low-rank spaces. QWTHN decomposes pre-trained weights into quantum neural network and tensor network representations, utilizing quantum state superposition and other methods to break through classical rank limitations. Experiments show that the proposed quantum fine-tuning technique for large models approaches or even surpasses the parameter efficiency of LoRA. 
On the CPsyCounD and R1-Distill-SFT datasets, QWTHN, compared to classical LoRA, reduces training loss by up to 15% while using 76% fewer parameters, and achieves an 8.4% performance improvement on the CPsyCounD test set. This research not only realizes lightweight and efficient adaptation of quantum resources to billion-parameter models but also validates the practical path of quantum hardware driven by large model tasks, laying the first engineering-ready technical foundation for future quantum-enhanced AGI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12790v1-abstract-full').style.display = 'none'; document.getElementById('2503.12790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12590">arXiv:2503.12590</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12590">pdf</a>, <a href="https://arxiv.org/format/2503.12590">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Personalize Anything for Free with Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+H">Haoran Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zehuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+H">Hairong Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+L">Lu Sheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12590v1-abstract-short" style="display: inline;"> Personalized image generation aims to produce images of user-specified concepts while enabling flexible editing. Recent training-free approaches, while exhibiting higher computational efficiency than training-based methods, struggle with identity preservation, applicability, and compatibility with diffusion transformers (DiTs). In this paper, we uncover the untapped potential of DiT, where simply rep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12590v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12590v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12590v1-abstract-full" style="display: none;"> Personalized image generation aims to produce images of user-specified concepts while enabling flexible editing. Recent training-free approaches, while exhibiting higher computational efficiency than training-based methods, struggle with identity preservation, applicability, and compatibility with diffusion transformers (DiTs). In this paper, we uncover the untapped potential of DiT, where simply replacing denoising tokens with those of a reference subject achieves zero-shot subject reconstruction. 
This simple yet effective feature injection technique unlocks diverse scenarios, from personalization to image editing. Building upon this observation, we propose \textbf{Personalize Anything}, a training-free framework that achieves personalized image generation in DiT through: 1) timestep-adaptive token replacement that enforces subject consistency via early-stage injection and enhances flexibility through late-stage regularization, and 2) patch perturbation strategies to boost structural diversity. Our method seamlessly supports layout-guided generation, multi-subject personalization, and mask-controlled editing. Evaluations demonstrate state-of-the-art performance in identity preservation and versatility. Our work establishes new insights into DiTs while delivering a practical paradigm for efficient personalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12590v1-abstract-full').style.display = 'none'; document.getElementById('2503.12590v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://fenghora.github.io/Personalize-Anything-Page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12042">arXiv:2503.12042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12042">pdf</a>, <a href="https://arxiv.org/format/2503.12042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Prosody-Enhanced Acoustic Pre-training and Acoustic-Disentangled Prosody Adapting for Movie Dubbing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhedong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Chenggang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunshan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hengel%2C+A+v+d">Anton van den Hengel</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Y">Yuankai Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12042v2-abstract-short" style="display: inline;"> Movie dubbing describes the process of transforming a script into speech that aligns temporally and emotionally with a given movie clip while exemplifying the speaker&#39;s voice demonstrated in a short reference audio clip. This task demands the model bridge character performances and complicated prosody structures to build a high-quality video-synchronized dubbing track. 
The limited scale of movie d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12042v2-abstract-full').style.display = 'inline'; document.getElementById('2503.12042v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12042v2-abstract-full" style="display: none;"> Movie dubbing describes the process of transforming a script into speech that aligns temporally and emotionally with a given movie clip while exemplifying the speaker&#39;s voice demonstrated in a short reference audio clip. This task demands the model bridge character performances and complicated prosody structures to build a high-quality video-synchronized dubbing track. The limited scale of movie dubbing datasets, along with the background noise inherent in audio data, hinder the acoustic modeling performance of trained models. To address these issues, we propose an acoustic-prosody disentangled two-stage method to achieve high-quality dubbing generation with precise prosody alignment. First, we propose a prosody-enhanced acoustic pre-training to develop robust acoustic modeling capabilities. Then, we freeze the pre-trained acoustic system and design a disentangled framework to model prosodic text features and dubbing style while maintaining acoustic quality. Additionally, we incorporate an in-domain emotion analysis module to reduce the impact of visual domain shifts across different movies, thereby enhancing emotion-prosody alignment. Extensive experiments show that our method performs favorably against the state-of-the-art models on two primary benchmarks. The demos are available at https://zzdoog.github.io/ProDubber/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12042v2-abstract-full').style.display = 'none'; document.getElementById('2503.12042v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
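<p class="is-size-7">The two-stage recipe above pre-trains an acoustic system and then freezes it while a disentangled prosody branch is adapted. The sketch below shows only the generic freeze-then-adapt pattern such a second stage relies on; the stand-in modules, the L1 objective, and the AdamW settings are placeholders rather than the paper's architecture.</p>
<pre><code># Generic freeze-then-adapt sketch (module names are placeholders, not the
# paper's implementation): stage 2 freezes the pre-trained acoustic system
# and optimizes only the prosody-side parameters.
import torch
import torch.nn as nn

acoustic = nn.Sequential(nn.Linear(80, 256), nn.ReLU(), nn.Linear(256, 256))   # stand-in backbone
prosody_adapter = nn.Sequential(nn.Linear(256, 256), nn.Tanh(), nn.Linear(256, 80))

# Stage 2: freeze the pre-trained acoustic parameters.
for p in acoustic.parameters():
    p.requires_grad = False
acoustic.eval()

# Only the adapter's parameters receive gradients.
optimizer = torch.optim.AdamW(prosody_adapter.parameters(), lr=1e-4)

mel = torch.randn(8, 80)          # dummy acoustic input
target = torch.randn(8, 80)       # dummy prosody-conditioned target
with torch.no_grad():
    hidden = acoustic(mel)        # frozen acoustic features
pred = prosody_adapter(hidden)
loss = nn.functional.l1_loss(pred, target)
loss.backward()
optimizer.step()
</code></pre>
<p class="is-size-7">Freezing the acoustic stage is what lets the prosody branch be trained on limited, noisy dubbing data without degrading the acoustic quality learned during pre-training.</p>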
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12016">arXiv:2503.12016</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12016">pdf</a>, <a href="https://arxiv.org/format/2503.12016">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Federated Fine-tuning of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yebo Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+C">Chunlin Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingguang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">He Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Tam%2C+K">Kahou Tam</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chengzhong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12016v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have achieved remarkable success across a wide range of tasks, with fine-tuning playing a pivotal role in adapting them to specific downstream applications. Federated Learning (FL) offers a promising approach that enables collaborative model adaptation while ensuring data privacy, i.e., FedLLM. In this survey, we provide a systematic and thorough review of the integrat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12016v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12016v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12016v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have achieved remarkable success across a wide range of tasks, with fine-tuning playing a pivotal role in adapting them to specific downstream applications. Federated Learning (FL) offers a promising approach that enables collaborative model adaptation while ensuring data privacy, i.e., FedLLM. In this survey, we provide a systematic and thorough review of the integration of LLMs with FL. Specifically, we first trace the historical evolution of both LLMs and FL, while summarizing relevant prior surveys. We then present an in-depth analysis of the fundamental challenges encountered in deploying FedLLM. Following this, we conduct an extensive study of existing parameter-efficient fine-tuning (PEFT) methods and explore their applicability in FL. Furthermore, we introduce a comprehensive evaluation benchmark to rigorously assess FedLLM performance and discuss its diverse real-world applications across multiple domains. Finally, we identify critical open challenges and outline promising research directions to drive future advancements in FedLLM. 
We maintain an active \href{https://github.com/Clin0212/Awesome-Federated-LLM-Learning}{GitHub repository} tracking cutting-edge advancements. This survey serves as a foundational resource for researchers and practitioners, offering insights into the evolving landscape of federated fine-tuning for LLMs while guiding future innovations in privacy-preserving AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12016v1-abstract-full').style.display = 'none'; document.getElementById('2503.12016v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11895">arXiv:2503.11895</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.11895">pdf</a>, <a href="https://arxiv.org/format/2503.11895">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Resolving UnderEdit &amp; OverEdit with Iterative &amp; Neighbor-Assisted Model Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Baghel%2C+B+K">Bhiman Kumar Baghel</a>, <a href="/search/cs?searchtype=author&amp;query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z+R">Zheyuan Ryan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X+L">Xiang Lorraine Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11895v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are used in various downstream language tasks, making it crucial to keep their knowledge up-to-date, but both retraining and fine-tuning the model can be costly. Model editing offers an efficient and effective alternative by a single update to only a key subset of model parameters. While being efficient, these methods are not perfect. Sometimes knowledge edits are unsu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11895v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11895v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11895v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are used in various downstream language tasks, making it crucial to keep their knowledge up-to-date, but both retraining and fine-tuning the model can be costly. Model editing offers an efficient and effective alternative by a single update to only a key subset of model parameters. While being efficient, these methods are not perfect. Sometimes knowledge edits are unsuccessful, i.e., UnderEdit, or the edit contaminated neighboring knowledge that should remain unchanged, i.e., OverEdit. 
To address these limitations, we propose iterative model editing, based on our hypothesis that a single parameter update is often insufficient, to mitigate UnderEdit, and neighbor-assisted model editing, which incorporates neighboring knowledge during editing to minimize OverEdit. Extensive experiments demonstrate that our methods effectively reduce UnderEdit up to 38 percentage points and OverEdit up to 6 percentage points across multiple model editing algorithms, LLMs, and benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11895v1-abstract-full').style.display = 'none'; document.getElementById('2503.11895v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review @ ACL&#39;25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11069">arXiv:2503.11069</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.11069">pdf</a>, <a href="https://arxiv.org/format/2503.11069">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> API Agents vs. GUI Agents: Divergence and Convergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaoyun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shilin He</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liqun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Si Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+Y">Yu Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongmei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11069v1-abstract-short" style="display: inline;"> Large language models (LLMs) have evolved beyond simple text generation to power software agents that directly translate natural language commands into tangible actions. While API-based LLM agents initially rose to prominence for their robust automation capabilities and seamless integration with programmatic endpoints, recent progress in multimodal LLM research has enabled GUI-based LLM agents tha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11069v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11069v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11069v1-abstract-full" style="display: none;"> Large language models (LLMs) have evolved beyond simple text generation to power software agents that directly translate natural language commands into tangible actions. 
While API-based LLM agents initially rose to prominence for their robust automation capabilities and seamless integration with programmatic endpoints, recent progress in multimodal LLM research has enabled GUI-based LLM agents that interact with graphical user interfaces in a human-like manner. Although these two paradigms share the goal of enabling LLM-driven task automation, they diverge significantly in architectural complexity, development workflows, and user interaction models. This paper presents the first comprehensive comparative study of API-based and GUI-based LLM agents, systematically analyzing their divergence and potential convergence. We examine key dimensions and highlight scenarios in which hybrid approaches can harness their complementary strengths. By proposing clear decision criteria and illustrating practical use cases, we aim to guide practitioners and researchers in selecting, combining, or transitioning between these paradigms. Ultimately, we indicate that continuing innovations in LLM-based automation are poised to blur the lines between API- and GUI-driven agents, paving the way for more flexible, adaptive solutions in a wide range of real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11069v1-abstract-full').style.display = 'none'; document.getElementById('2503.11069v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10677">arXiv:2503.10677</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10677">pdf</a>, <a href="https://arxiv.org/format/2503.10677">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Knowledge-Oriented Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+M">Mingyue Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yucong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+J">Jie Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Huijie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bohou Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiawei Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jie Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Daoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+E">Enhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10677v2-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) has gained 
significant attention in recent years for its potential to enhance natural language understanding and generation by combining large-scale retrieval systems with generative models. RAG leverages external knowledge sources, such as documents, databases, or structured data, to improve model performance and generate more accurate and contextually relevan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10677v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10677v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10677v2-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) has gained significant attention in recent years for its potential to enhance natural language understanding and generation by combining large-scale retrieval systems with generative models. RAG leverages external knowledge sources, such as documents, databases, or structured data, to improve model performance and generate more accurate and contextually relevant outputs. This survey aims to provide a comprehensive overview of RAG by examining its fundamental components, including retrieval mechanisms, generation processes, and the integration between the two. We discuss the key characteristics of RAG, such as its ability to augment generative models with dynamic external knowledge, and the challenges associated with aligning retrieved information with generative objectives. We also present a taxonomy that categorizes RAG methods, ranging from basic retrieval-augmented approaches to more advanced models incorporating multi-modal data and reasoning capabilities. Additionally, we review the evaluation benchmarks and datasets commonly used to assess RAG systems, along with a detailed exploration of its applications in fields such as question answering, summarization, and information retrieval. Finally, we highlight emerging research directions and opportunities for improving RAG systems, such as enhanced retrieval efficiency, model interpretability, and domain-specific adaptations. This paper concludes by outlining the prospects for RAG in addressing real-world challenges and its potential to drive further advancements in natural language processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10677v2-abstract-full').style.display = 'none'; document.getElementById('2503.10677v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
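<p class="is-size-7">The survey above decomposes RAG into a retrieval mechanism, a generation step, and their integration. As a minimal concrete reference point, the sketch below wires a toy word-overlap retriever to a placeholder generate() call; the corpus, scoring rule, and prompt template are illustrative assumptions and are not prescribed by the survey.</p>
<pre><code># Toy retrieve-then-generate loop: bag-of-words retrieval + prompt assembly.
# The corpus, scoring rule, and generate() placeholder are illustrative only.
from collections import Counter

CORPUS = [
    "RAG augments a generator with retrieved external documents.",
    "Low-rank adaptation fine-tunes large models with few parameters.",
    "Retrieval quality strongly affects the faithfulness of generated answers.",
]

def score(query, doc):
    q, d = Counter(query.lower().split()), Counter(doc.lower().split())
    return sum(min(q[w], d[w]) for w in q)   # word-overlap score

def retrieve(query, k=2):
    ranked = sorted(CORPUS, key=lambda doc: score(query, doc), reverse=True)
    return ranked[:k]

def generate(prompt):
    # Placeholder for an actual LLM call.
    return f"[generated answer conditioned on {len(prompt)} prompt characters]"

query = "How do retrieved documents help generation?"
context = "\n".join(retrieve(query))
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
print(generate(prompt))
</code></pre>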
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10618">arXiv:2503.10618</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10618">pdf</a>, <a href="https://arxiv.org/format/2503.10618">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiT-Air: Revisiting the Efficiency of Diffusion Model Architecture Design in Text to Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+R">Rui Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wenze Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+T">Tsu-Jui Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+J">Jialing Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinze Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lezhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Schwing%2C+A">Alex Schwing</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yinfei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10618v2-abstract-short" style="display: inline;"> In this work, we empirically study Diffusion Transformers (DiTs) for text-to-image generation, focusing on architectural choices, text-conditioning strategies, and training protocols. We evaluate a range of DiT-based architectures--including PixArt-style and MMDiT variants--and compare them with a standard DiT variant which directly processes concatenated text and noise inputs. Surprisingly, our f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10618v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10618v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10618v2-abstract-full" style="display: none;"> In this work, we empirically study Diffusion Transformers (DiTs) for text-to-image generation, focusing on architectural choices, text-conditioning strategies, and training protocols. We evaluate a range of DiT-based architectures--including PixArt-style and MMDiT variants--and compare them with a standard DiT variant which directly processes concatenated text and noise inputs. Surprisingly, our findings reveal that the performance of standard DiT is comparable with those specialized models, while demonstrating superior parameter-efficiency, especially when scaled up. Leveraging the layer-wise parameter sharing strategy, we achieve a further reduction of 66% in model size compared to an MMDiT architecture, with minimal performance impact. Building on an in-depth analysis of critical components such as text encoders and Variational Auto-Encoders (VAEs), we introduce DiT-Air and DiT-Air-Lite. 
With supervised and reward fine-tuning, DiT-Air achieves state-of-the-art performance on GenEval and T2I CompBench, while DiT-Air-Lite remains highly competitive, surpassing most existing models despite its compact size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10618v2-abstract-full').style.display = 'none'; document.getElementById('2503.10618v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10334">arXiv:2503.10334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10334">pdf</a>, <a href="https://arxiv.org/format/2503.10334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Enhanced View Planning for Robotic Harvesting: Tackling Occlusions with Imitation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kasaei%2C+H">Hamidreza Kasaei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10334v1-abstract-short" style="display: inline;"> In agricultural automation, inherent occlusion presents a major challenge for robotic harvesting. We propose a novel imitation learning-based viewpoint planning approach to actively adjust camera viewpoint and capture unobstructed images of the target crop. Traditional viewpoint planners and existing learning-based methods, depend on manually designed evaluation metrics or reward functions, often&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10334v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10334v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10334v1-abstract-full" style="display: none;"> In agricultural automation, inherent occlusion presents a major challenge for robotic harvesting. We propose a novel imitation learning-based viewpoint planning approach to actively adjust camera viewpoint and capture unobstructed images of the target crop. Traditional viewpoint planners and existing learning-based methods, depend on manually designed evaluation metrics or reward functions, often struggle to generalize to complex, unseen scenarios. Our method employs the Action Chunking with Transformer (ACT) algorithm to learn effective camera motion policies from expert demonstrations. This enables continuous six-degree-of-freedom (6-DoF) viewpoint adjustments that are smoother, more precise and reveal occluded targets. 
Extensive experiments in both simulated and real-world environments, featuring agricultural scenarios and a 6-DoF robot arm equipped with an RGB-D camera, demonstrate our method&#39;s superior success rate and efficiency, especially in complex occlusion conditions, as well as its ability to generalize across different crops without reprogramming. This study advances robotic harvesting by providing a practical &#34;learn from demonstration&#34; (LfD) solution to occlusion challenges, ultimately enhancing autonomous harvesting performance and productivity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10334v1-abstract-full').style.display = 'none'; document.getElementById('2503.10334v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08661">arXiv:2503.08661</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08661">pdf</a>, <a href="https://arxiv.org/format/2503.08661">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Task-Oriented Co-Design of Communication, Computing, and Control for Edge-Enabled Industrial Cyber-Physical Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Diao%2C+Y">Yufeng Diao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=De+Martini%2C+D">Daniele De Martini</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P+G">Philip Guodong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+E+L">Emma Liying Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08661v1-abstract-short" style="display: inline;"> This paper proposes a task-oriented co-design framework that integrates communication, computing, and control to address the key challenges of bandwidth limitations, noise interference, and latency in mission-critical industrial Cyber-Physical Systems (CPS). 
To improve communication efficiency and robustness, we design a task-oriented Joint Source-Channel Coding (JSCC) using Information Bottleneck&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08661v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08661v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08661v1-abstract-full" style="display: none;"> This paper proposes a task-oriented co-design framework that integrates communication, computing, and control to address the key challenges of bandwidth limitations, noise interference, and latency in mission-critical industrial Cyber-Physical Systems (CPS). To improve communication efficiency and robustness, we design a task-oriented Joint Source-Channel Coding (JSCC) using Information Bottleneck (IB) to enhance data transmission efficiency by prioritizing task-specific information. To mitigate the perceived End-to-End (E2E) delays, we develop a Delay-Aware Trajectory-Guided Control Prediction (DTCP) strategy that integrates trajectory planning with control prediction, predicting commands based on E2E delay. Moreover, the DTCP is co-designed with task-oriented JSCC, focusing on transmitting task-specific information for timely and reliable autonomous driving. Experimental results in the CARLA simulator demonstrate that, under an E2E delay of 1 second (20 time slots), the proposed framework achieves a driving score of 48.12, which is 31.59 points higher than using Better Portable Graphics (BPG) while reducing bandwidth usage by 99.19%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08661v1-abstract-full').style.display = 'none'; document.getElementById('2503.08661v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
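<p class="is-size-7">The task-oriented JSCC above uses an Information Bottleneck objective: keep the information needed for the downstream control task while compressing what is transmitted. A common variational form adds a beta-weighted KL rate term to the task loss; the NumPy sketch below computes such a loss for a Gaussian latent. The Gaussian parameterization, the closed-form KL, and the beta value are assumptions for illustration, not the paper's exact formulation.</p>
<pre><code># Illustrative variational-IB-style loss for a task-oriented encoder:
# task term + beta * KL(q(z|x) || N(0, I)).  The Gaussian latent and the
# beta weight are assumptions for illustration, not the paper's recipe.
import numpy as np

def ib_loss(task_nll, mu, log_var, beta=1e-3):
    """task_nll: scalar task loss; mu, log_var: (batch, latent_dim)."""
    kl_per_dim = 0.5 * (np.exp(log_var) + mu**2 - 1.0 - log_var)
    rate = kl_per_dim.sum(axis=1).mean()   # nats per sample sent over the channel
    return task_nll + beta * rate, rate

rng = np.random.default_rng(0)
mu = rng.normal(scale=0.1, size=(4, 8))
log_var = rng.normal(scale=0.1, size=(4, 8))
total, rate = ib_loss(task_nll=1.25, mu=mu, log_var=log_var)
print(round(float(total), 4), round(float(rate), 4))
</code></pre>
<p class="is-size-7">A smaller beta preserves more task-relevant detail at the cost of a higher transmission rate; a larger beta compresses more aggressively, which is the knob a bandwidth-limited co-design would tune.</p>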
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for publication in IEEE Journal on Selected Areas in Communications (JSAC), with publication expected in 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08156">arXiv:2503.08156</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08156">pdf</a>, <a href="https://arxiv.org/format/2503.08156">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Large-scale Chemical Reaction Image Parsing via a Multimodal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yufan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Leung%2C+C+T">Ching Ting Leung</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jianwei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linyan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Hanyu Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08156v1-abstract-short" style="display: inline;"> Artificial intelligence (AI) has demonstrated significant promise in advancing organic chemistry research; however, its effectiveness depends on the availability of high-quality chemical reaction data. Currently, most published chemical reactions are not available in machine-readable form, limiting the broader application of AI in this field. The extraction of published chemical reactions into str&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08156v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08156v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08156v1-abstract-full" style="display: none;"> Artificial intelligence (AI) has demonstrated significant promise in advancing organic chemistry research; however, its effectiveness depends on the availability of high-quality chemical reaction data. Currently, most published chemical reactions are not available in machine-readable form, limiting the broader application of AI in this field. The extraction of published chemical reactions into structured databases still relies heavily on manual curation, and robust automatic parsing of chemical reaction images into machine-readable data remains a significant challenge. To address this, we introduce the Reaction Image Multimodal large language model (RxnIM), the first multimodal large language model specifically designed to parse chemical reaction images into machine-readable reaction data. RxnIM not only extracts key chemical components from reaction images but also interprets the textual content that describes reaction conditions. 
Together with specially designed large-scale dataset generation method to support model training, our approach achieves excellent performance, with an average F1 score of 88% on various benchmarks, surpassing literature methods by 5%. This represents a crucial step toward the automatic construction of large databases of machine-readable reaction data parsed from images in the chemistry literature, providing essential data resources for AI research in chemistry. The source code, model checkpoints, and datasets developed in this work are released under permissive licenses. An instance of the RxnIM web application can be accessed at https://huggingface.co/spaces/CYF200127/RxnIM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08156v1-abstract-full').style.display = 'none'; document.getElementById('2503.08156v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07969">arXiv:2503.07969</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07969">pdf</a>, <a href="https://arxiv.org/format/2503.07969">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 7ABAW-Compound Expression Recognition via Curriculum Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+F">Feng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lincheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dadong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07969v1-abstract-short" style="display: inline;"> With the advent of deep learning, expression recognition has made significant advancements. However, due to the limited availability of annotated compound expression datasets and the subtle variations of compound expressions, Compound Emotion Recognition (CE) still holds considerable potential for exploration. To advance this task, the 7th Affective Behavior Analysis in-the-wild (ABAW) competition&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07969v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07969v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07969v1-abstract-full" style="display: none;"> With the advent of deep learning, expression recognition has made significant advancements. However, due to the limited availability of annotated compound expression datasets and the subtle variations of compound expressions, Compound Emotion Recognition (CE) still holds considerable potential for exploration. 
To advance this task, the 7th Affective Behavior Analysis in-the-wild (ABAW) competition introduces the Compound Expression Challenge based on C-EXPR-DB, a limited dataset without labels. In this paper, we present a curriculum learning-based framework that initially trains the model on single-expression tasks and subsequently incorporates multi-expression data. This design ensures that our model first masters the fundamental features of basic expressions before being exposed to the complexities of compound emotions. Specifically, our designs can be summarized as follows: 1) Single-Expression Pre-training: The model is first trained on datasets containing single expressions to learn the foundational facial features associated with basic emotions. 2) Dynamic Compound Expression Generation: Given the scarcity of annotated compound expression datasets, we employ CutMix and Mixup techniques on the original single-expression images to create hybrid images exhibiting characteristics of multiple basic emotions. 3) Incremental Multi-Expression Integration: After performing well on single-expression tasks, the model is progressively exposed to multi-expression data, allowing the model to adapt to the complexity and variability of compound expressions. The official results indicate that our method achieves the \textbf{best} performance in this competition track with an F-score of 0.6063. Our code is released at https://github.com/YenanLiu/ABAW7th. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07969v1-abstract-full').style.display = 'none'; document.getElementById('2503.07969v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
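<p class="is-size-7">Step 2 of the recipe above synthesizes compound-expression training samples by applying CutMix and Mixup to single-expression images. The snippet below shows the standard Mixup operation with soft labels, the simpler of the two; the Beta(alpha, alpha) mixing coefficient, the alpha value, and the 7-class one-hot labels are assumptions for illustration rather than the competition entry's exact settings.</p>
<pre><code># Standard Mixup used to blend two single-expression samples into a
# "compound" training sample with a soft label (alpha is an assumption).
import numpy as np

def mixup(img_a, label_a, img_b, label_b, alpha=0.4, rng=None):
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    mixed_img = lam * img_a + (1.0 - lam) * img_b
    mixed_label = lam * label_a + (1.0 - lam) * label_b   # soft, multi-emotion target
    return mixed_img, mixed_label, lam

rng = np.random.default_rng(0)
happy = rng.random((64, 64, 3))
surprise = rng.random((64, 64, 3))
onehot = np.eye(7)                  # 7 basic-expression classes (assumed)
img, lab, lam = mixup(happy, onehot[3], surprise, onehot[6], rng=rng)
print(img.shape, np.round(lab, 2), round(float(lam), 2))
</code></pre>
<p class="is-size-7">CutMix differs only in that it swaps a spatial patch between the two images instead of blending them globally, so a face can, for example, carry one expression around the eyes and another around the mouth.</p>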
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCVWorkshop as the report of the first place in 7th ABAW Track2 Competition</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07807">arXiv:2503.07807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07807">pdf</a>, <a href="https://arxiv.org/format/2503.07807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Training Domain Draft Models for Speculative Decoding: Best Practices and Insights </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hong%2C+F">Fenglu Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Raju%2C+R">Ravi Raju</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J+L">Jonathan Lingjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Thakker%2C+U">Urmish Thakker</a>, <a href="/search/cs?searchtype=author&amp;query=Ravichandran%2C+A">Avinash Ravichandran</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+S">Swayambhoo Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+C">Changran Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07807v1-abstract-short" style="display: inline;"> Speculative decoding is an effective method for accelerating inference of large language models (LLMs) by employing a small draft model to predict the output of a target model. However, when adapting speculative decoding to domain-specific target models, the acceptance rate of the generic draft model drops significantly due to domain shift. In this work, we systematically investigate knowledge dis&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07807v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07807v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07807v1-abstract-full" style="display: none;"> Speculative decoding is an effective method for accelerating inference of large language models (LLMs) by employing a small draft model to predict the output of a target model. However, when adapting speculative decoding to domain-specific target models, the acceptance rate of the generic draft model drops significantly due to domain shift. In this work, we systematically investigate knowledge distillation techniques for training domain draft models to improve their speculation accuracy. We compare white-box and black-box distillation approaches and explore their effectiveness in various data accessibility scenarios, including historical user queries, curated domain data, and synthetically generated alignment data. 
Our experiments across Function Calling, Biology, and Chinese domains show that offline distillation consistently outperforms online distillation by 11% to 25%, white-box distillation surpasses black-box distillation by 2% to 10%, and data scaling trends hold across domains. Additionally, we find that synthetic data can effectively align draft models and achieve 80% to 93% of the performance of training on historical user queries. These findings provide practical guidelines for training domain-specific draft models to improve speculative decoding efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07807v1-abstract-full').style.display = 'none'; document.getElementById('2503.07807v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a workshop paper at SCOPE - ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07703">arXiv:2503.07703</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07703">pdf</a>, <a href="https://arxiv.org/format/2503.07703">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Seedream 2.0: A Native Chinese-English Bilingual Image Generation Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gong%2C+L">Lixue Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+X">Xiaoxia Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F">Fanshi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+X">Xiaochen Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Liyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+W">Wei Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yichun Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shiqi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yu Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Z">Zhi Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Ye Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guofeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jie Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xin Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xuefeng Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linjie Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+Z">Zhonghua Zhai</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuwei Zhang</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07703v1-abstract-short" style="display: inline;"> Rapid advancement of diffusion models has catalyzed remarkable progress in the field of image generation. However, prevalent models such as Flux, SD3.5 and Midjourney, still grapple with issues like model bias, limited text rendering capabilities, and insufficient understanding of Chinese cultural nuances. To address these limitations, we present Seedream 2.0, a native Chinese-English bilingual im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07703v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07703v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07703v1-abstract-full" style="display: none;"> Rapid advancement of diffusion models has catalyzed remarkable progress in the field of image generation. However, prevalent models such as Flux, SD3.5 and Midjourney, still grapple with issues like model bias, limited text rendering capabilities, and insufficient understanding of Chinese cultural nuances. To address these limitations, we present Seedream 2.0, a native Chinese-English bilingual image generation foundation model that excels across diverse dimensions, which adeptly manages text prompt in both Chinese and English, supporting bilingual image generation and text rendering. We develop a powerful data system that facilitates knowledge integration, and a caption system that balances the accuracy and richness for image description. Particularly, Seedream is integrated with a self-developed bilingual large language model as a text encoder, allowing it to learn native knowledge directly from massive data. This enable it to generate high-fidelity images with accurate cultural nuances and aesthetic expressions described in either Chinese or English. Beside, Glyph-Aligned ByT5 is applied for flexible character-level text rendering, while a Scaled ROPE generalizes well to untrained resolutions. Multi-phase post-training optimizations, including SFT and RLHF iterations, further improve the overall capability. Through extensive experimentation, we demonstrate that Seedream 2.0 achieves state-of-the-art performance across multiple aspects, including prompt-following, aesthetics, text rendering, and structural correctness. Furthermore, Seedream 2.0 has been optimized through multiple RLHF iterations to closely align its output with human preferences, as revealed by its outstanding ELO score. In addition, it can be readily adapted to an instruction-based image editing model, such as SeedEdit, with strong editing capability that balances instruction-following and image consistency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07703v1-abstract-full').style.display = 'none'; document.getElementById('2503.07703v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Official Page: https://team.doubao.com/tech/seedream</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07695">arXiv:2503.07695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07695">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Instilling Doubts About Truth: Measuring the Impact of Tucker Carlson&#39;s Interview with Vladimir Putin Using Machine Learning and Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hagen%2C+L">Loni Hagen</a>, <a href="/search/cs?searchtype=author&amp;query=Dinh%2C+L">Ly Dinh</a>, <a href="/search/cs?searchtype=author&amp;query=Alexopoulos%2C+G">Golfo Alexopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lingyao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ford%2C+D">Diego Ford</a>, <a href="/search/cs?searchtype=author&amp;query=Chong%2C+M">Miyoung Chong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07695v1-abstract-short" style="display: inline;"> On February 7, 2024, Russian President Vladimir Putin gave a two-hour interview with conservative political commentator, Tucker Carlson. This study investigated the impact of the Carlson- Putin interview on the US X audience. We proposed a framework of social media impact using machine learning (ML) and natural language processing (NLP) by measuring changes in audience, structure, and content. Tri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07695v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07695v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07695v1-abstract-full" style="display: none;"> On February 7, 2024, Russian President Vladimir Putin gave a two-hour interview with conservative political commentator, Tucker Carlson. This study investigated the impact of the Carlson- Putin interview on the US X audience. We proposed a framework of social media impact using machine learning (ML) and natural language processing (NLP) by measuring changes in audience, structure, and content. Triangulation methods were used to validate the process and results. 
The interview had a considerable impact among segments of the American public: 1) the reach and engagement of far-right influencers increased after the interview, suggesting Kremlin narratives gained traction within these circles, 2) the communication structure became more vulnerable to disinformation spread after the interview, and 3) the public discourse changed from support for Ukraine funding to conversations about Putin, Russia, and the issue of &#34;truth&#34; or the veracity of Putin&#39;s claims. This research contributes to methods development for social media studies and aids scholars in analyzing how public opinion shapes policy debates. The Carlson-Putin interview sparked a broader discussion about truth-telling. Far from being muted, the broad impact of the interview appears considerable and poses challenges for foreign affairs leaders who depend on public support and buy-in when formulating national policy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07695v1-abstract-full').style.display = 'none'; document.getElementById('2503.07695v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07440">arXiv:2503.07440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07440">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Early signs of stuck pipe detection based on Crossformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+B">Bo Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07440v1-abstract-short" style="display: inline;"> Stuck pipe incidents are one of the major challenges in drilling engineering,leading to massive time loss and additional costs.To address the limitations of insufficient long sequence modeling capability,the difficulty in accurately establishing warning threshold,and the lack of model interpretability in existing methods,we utilize Crossformer for early signs of detection indicating potential stuc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07440v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07440v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07440v1-abstract-full" style="display: none;"> Stuck pipe incidents are one of the major challenges in drilling engineering,leading to massive time loss and additional costs.To address the limitations of insufficient long sequence modeling capability,the difficulty in accurately establishing warning threshold,and the lack of model interpretability 
arXiv:2503.07440 (https://arxiv.org/abs/2503.07440) [pdf] cs.CE
Early signs of stuck pipe detection based on Crossformer
Authors: Bo Cao, Yu Song, Jin Yang, Lei Li
Abstract: Stuck pipe incidents are one of the major challenges in drilling engineering, leading to massive time loss and additional costs. To address the limitations of insufficient long-sequence modeling capability, the difficulty of accurately establishing warning thresholds, and the lack of model interpretability in existing methods, we utilize Crossformer to detect early signs of potential stuck pipe events, in order to provide guidance for on-site drilling engineers and prevent stuck pipe incidents. The sliding window technique is integrated into Crossformer to allow it to output and display longer outputs. The improved Crossformer model is trained on normal time-series drilling data to generate predictions for various parameters at each time step. The relative reconstruction error of the model is regarded as the risk of stuck pipe, thereby treating data that the model cannot predict as anomalies, which represent the early signs of stuck pipe incidents. The multi-step prediction capability of Crossformer and the relative reconstruction error are combined to assess stuck pipe risk at each time step in advance. We partition the reconstruction error into modeling error and error due to anomalous data fluctuations; furthermore, the dynamic warning threshold and warning time for stuck pipe incidents are determined using the probability density function of reconstruction errors from normal drilling data. The results indicate that our method can effectively detect early signs of stuck pipe incidents during the drilling process. Crossformer exhibits superior modeling and predictive capabilities compared with other deep learning models, and Transformer-based models with multi-step prediction capability are more suitable for stuck pipe prediction than current single-step prediction models.
Submitted 10 March, 2025; originally announced March 2025.
Comments: 33 pages, 9 figures
Report number: 6265674
arXiv:2503.07300 (https://arxiv.org/abs/2503.07300) [pdf, other] cs.GR, cs.CV
Goal Conditioned Reinforcement Learning for Photo Finishing Tuning
Authors: Jiarui Wu, Yujin Wang, Lingen Li, Zhang Fan, Tianfan Xue
Abstract: Photo finishing tuning aims to automate the manual tuning process of a photo finishing pipeline, like Adobe Lightroom or Darktable. Previous works either use zeroth-order optimization, which is slow as the number of parameters increases, or rely on a differentiable proxy of the target finishing pipeline, which is hard to train. To overcome these challenges, we propose a novel goal-conditioned reinforcement learning framework for efficiently tuning parameters using a goal image as a condition. Unlike previous approaches, our tuning framework does not rely on any proxy and treats the photo finishing pipeline as a black box. Utilizing a trained reinforcement learning policy, it can efficiently find the desired set of parameters within just 10 queries, while optimization-based approaches normally take around 200 queries. Furthermore, our architecture utilizes a goal image to guide the iterative tuning of pipeline parameters, allowing for flexible conditioning on pixel-aligned target images, style images, or any other visually representable goals. We conduct detailed experiments on photo finishing tuning and photo stylization tuning tasks, demonstrating the advantages of our method. Project website: https://openimaginglab.github.io/RLPixTuner/
Submitted 10 March, 2025; originally announced March 2025.
Comments: 38th Conference on Neural Information Processing Systems (NeurIPS 2024)
Journal ref: Advances in Neural Information Processing Systems 37 (2024): 46294-46318
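The inference-time loop implied by this abstract might look like the following sketch: a trained goal-conditioned policy repeatedly queries the black-box pipeline and nudges its parameters toward the goal image. `policy`, `pipeline`, and `encode` are hypothetical stand-ins for the learned policy, the photo-finishing pipeline, and an image feature extractor; none of them are the authors' released components.

```python
# Hedged sketch: goal-conditioned tuning of a black-box photo-finishing pipeline.
import numpy as np

def tune(policy, pipeline, raw_image, goal_image, encode, n_queries=10):
    params = np.zeros(pipeline.num_params)            # start from neutral settings
    goal_feat = encode(goal_image)
    for _ in range(n_queries):                        # ~10 queries vs ~200 for zeroth-order search
        current = pipeline.render(raw_image, params)  # black box: no gradients needed
        state = np.concatenate([encode(current), goal_feat])
        params = params + policy.act(state)           # policy outputs a parameter update
    return params, pipeline.render(raw_image, params)
```

Because only renders and parameter vectors cross the interface, the same loop works for pixel-aligned targets, style images, or any other goal that `encode` can represent.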
arXiv:2503.07170 (https://arxiv.org/abs/2503.07170) [pdf, other] cs.CL, cs.AI
DeFine: A Decomposed and Fine-Grained Annotated Dataset for Long-form Article Generation
Authors: Ming Wang, Fang Wang, Minghao Hu, Li He, Haiyang Wang, Jun Zhang, Tianwei Yan, Li Li, Zhunchen Luo, Wei Luo, Xiaoying Bai, Guotong Geng
Abstract: Long-form article generation (LFAG) presents challenges such as maintaining logical consistency, comprehensive topic coverage, and narrative coherence across extended articles. Existing datasets often lack both the hierarchical structure and the fine-grained annotation needed to effectively decompose tasks, resulting in shallow, disorganized article generation. To address these limitations, we introduce DeFine, a Decomposed and Fine-grained annotated dataset for long-form article generation. DeFine is characterized by its hierarchical decomposition strategy and the integration of domain-specific knowledge with multi-level annotations, ensuring granular control and enhanced depth in article generation. To construct the dataset, a multi-agent collaborative pipeline is proposed, which systematically segments the generation process into four parts: Data Miner, Cite Retriever, Q&A Annotator, and Data Cleaner. To validate the effectiveness of DeFine, we designed and tested three LFAG baselines: web retrieval, local retrieval, and grounded reference. We fine-tuned the Qwen2-7b-Instruct model on the DeFine training dataset. The experimental results showed significant improvements in text quality, specifically in topic coverage, depth of information, and content fidelity. Our dataset is publicly available to facilitate future research.
Submitted 10 March, 2025; originally announced March 2025.
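For orientation only, the four construction stages named in the abstract compose roughly as below; the agent objects and their methods are hypothetical placeholders rather than the released pipeline.

```python
# Hedged sketch of the four-part DeFine construction pipeline.
def build_define_record(topic, miner, retriever, annotator, cleaner):
    raw_docs = miner.collect(topic)                 # Data Miner: gather source material
    cited = retriever.attach_citations(raw_docs)    # Cite Retriever: link claims to references
    qa_pairs = annotator.decompose(cited)           # Q&A Annotator: fine-grained multi-level annotations
    return cleaner.filter(qa_pairs)                 # Data Cleaner: drop noisy or duplicate items

# dataset = [build_define_record(t, miner, retriever, annotator, cleaner) for t in topics]
```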
arXiv:2503.07158 (https://arxiv.org/abs/2503.07158) [pdf, other] cs.AI
Generative AI in Transportation Planning: A Survey
Authors: Longchao Da, Tiejin Chen, Zhuoheng Li, Shreyas Bachiraju, Huaiyuan Yao, Li Li, Yushun Dong, Xiyang Hu, Zhengzhong Tu, Dongjie Wang, Yue Zhao, Xuanyu Zhou, Ram Pendyala, Benjamin Stabler, Yezhou Yang, Xuesong Zhou, Hua Wei
Abstract: The integration of generative artificial intelligence (GenAI) into transportation planning has the potential to revolutionize tasks such as demand forecasting, infrastructure design, policy evaluation, and traffic simulation. However, there is a critical need for a systematic framework to guide the adoption of GenAI in this interdisciplinary domain. In this survey, we, a multidisciplinary team of researchers spanning computer science and transportation engineering, present the first comprehensive framework for leveraging GenAI in transportation planning. Specifically, we introduce a new taxonomy that categorizes existing applications and methodologies into two perspectives: transportation planning tasks and computational techniques. From the transportation planning perspective, we examine the role of GenAI in automating descriptive, predictive, generative, simulation, and explainable tasks to enhance mobility systems. From the computational perspective, we detail advancements in data preparation, domain-specific fine-tuning, and inference strategies, such as retrieval-augmented generation and zero-shot learning tailored to transportation applications. Additionally, we address critical challenges, including data scarcity, explainability, bias mitigation, and the development of domain-specific evaluation frameworks that align with transportation goals like sustainability, equity, and system efficiency. This survey aims to bridge the gap between traditional transportation planning methodologies and modern AI techniques, fostering collaboration and innovation. By addressing these challenges and opportunities, we seek to inspire future research that ensures ethical, equitable, and impactful use of generative AI in transportation planning.
Submitted 18 March, 2025; v1 submitted 10 March, 2025; originally announced March 2025.
Comments: 55 pages
MSC Class: 68T99; 90B06
ACM Class: I.2.6; I.2.8; I.6.3; J.2

arXiv:2503.07094 (https://arxiv.org/abs/2503.07094) [pdf, other] cs.CL
A Novel Ophthalmic Benchmark for Evaluating Multimodal Large Language Models with Fundus Photographs and OCT Images
Authors: Xiaoyi Liang, Mouxiao Bian, Moxin Chen, Lihao Liu, Junjun He, Jie Xu, Lin Li
Abstract: In recent years, large language models (LLMs) have demonstrated remarkable potential across various medical applications. Building on this foundation, multimodal large language models (MLLMs) integrate LLMs with visual models to process diverse inputs, including clinical data and medical images. In ophthalmology, LLMs have been explored for analyzing optical coherence tomography (OCT) reports, assisting in disease classification, and even predicting treatment outcomes. However, existing MLLM benchmarks often fail to capture the complexities of real-world clinical practice, particularly in the analysis of OCT images. Many suffer from limitations such as small sample sizes, a lack of diverse OCT datasets, and insufficient expert validation. These shortcomings hinder the accurate assessment of MLLMs' ability to interpret OCT scans and their broader applicability in ophthalmology. To address this gap, we introduce a new ophthalmic benchmark: our dataset, curated through rigorous quality control and expert annotation, consists of 439 fundus images and 75 OCT images. Using a standardized API-based framework, we assessed seven mainstream MLLMs and observed significant variability in diagnostic accuracy across different diseases. While some models performed well in diagnosing conditions such as diabetic retinopathy and age-related macular degeneration, they struggled with others, including choroidal neovascularization and myopia, highlighting inconsistencies in performance and the need for further refinement. Our findings emphasize the importance of developing clinically relevant benchmarks to provide a more accurate assessment of MLLMs' capabilities. By refining these models and expanding their scope, we can enhance their potential to transform ophthalmic diagnosis and treatment.
Submitted 10 March, 2025; originally announced March 2025.
arXiv:2503.06869 (https://arxiv.org/abs/2503.06869) [pdf, other] cs.RO
Collective Behavior Clone with Visual Attention via Neural Interaction Graph Prediction
Authors: Kai Li, Zhao Ma, Liang Li, Shiyu Zhao
Abstract: In this paper, we propose a framework, collective behavioral cloning (CBC), to learn the underlying interaction mechanism and control policy of a swarm system. Given the trajectory data of a swarm system, we propose a graph variational autoencoder (GVAE) to learn the local interaction graph. Based on the interaction graph and swarm trajectory, we use behavioral cloning to learn the control policy of the swarm system. To demonstrate the practicality of CBC, we deploy it on a real-world decentralized vision-based robot swarm system. A visual attention network is trained based on the learned interaction graph for online neighbor selection. Experimental results show that our method outperforms previous approaches in predicting both the interaction graph and swarm actions with higher accuracy. This work offers a promising approach for understanding interaction mechanisms and swarm dynamics in future swarm robotics research. Code and data are available.
Submitted 9 March, 2025; originally announced March 2025.
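As a compressed view of the second stage (behavioral cloning on top of an inferred interaction graph), one might write something like the PyTorch sketch below. The GVAE that infers the graph is abstracted away: `neighbor_states` is assumed to already contain the states of the neighbors selected by the learned graph, and `expert_action` comes from the recorded swarm trajectories. This is not the authors' implementation.

```python
# Hedged sketch: behavioral cloning of a swarm control policy over selected neighbors.
import torch
import torch.nn as nn

class SwarmPolicy(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * state_dim, hidden), nn.ReLU(), nn.Linear(hidden, action_dim)
        )

    def forward(self, own_state, neighbor_states):
        # permutation-invariant aggregation of the neighbors picked by the learned graph
        agg = neighbor_states.mean(dim=0)
        return self.net(torch.cat([own_state, agg], dim=-1))

def bc_loss(policy, own_state, neighbor_states, expert_action):
    # regress the expert (recorded) action for this agent at this time step
    return nn.functional.mse_loss(policy(own_state, neighbor_states), expert_action)
```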
arXiv:2503.06687 (https://arxiv.org/abs/2503.06687) [pdf, other] cs.LG, cond-mat.mtrl-sci, cs.AI, physics.bio-ph, physics.chem-ph
UniGenX: Unified Generation of Sequence and Structure with Autoregressive Diffusion
Authors: Gongbo Zhang, Yanting Li, Renqian Luo, Pipi Hu, Zeru Zhao, Lingbo Li, Guoqing Liu, Zun Wang, Ran Bi, Kaiyuan Gao, Liya Guo, Yu Xie, Chang Liu, Jia Zhang, Tian Xie, Robert Pinsler, Claudio Zeni, Ziheng Lu, Yingce Xia, Marwin Segler, Maik Riechert, Li Yuan, Lei Chen, Haiguang Liu, Tao Qin
Abstract: Unified generation of sequence and structure for scientific data (e.g., materials, molecules, proteins) is a critical task. Existing approaches primarily rely on either autoregressive sequence models or diffusion models, each offering distinct advantages and facing notable limitations. Autoregressive models, such as GPT, Llama, and Phi-4, have demonstrated remarkable success in natural language generation and have been extended to multimodal tasks (e.g., image, video, and audio) using advanced encoders like VQ-VAE to represent complex modalities as discrete sequences. However, their direct application to scientific domains is challenging due to the high precision requirements and the diverse nature of scientific data. On the other hand, diffusion models excel at generating high-dimensional scientific data, such as protein, molecule, and material structures, with remarkable accuracy. Yet, their inability to effectively model sequences limits their potential as general-purpose multimodal foundation models. To address these challenges, we propose UniGenX, a unified framework that combines autoregressive next-token prediction with conditional diffusion models. This integration leverages the strengths of autoregressive models to ease the training of conditional diffusion models, while diffusion-based generative heads enhance the precision of autoregressive predictions. We validate the effectiveness of UniGenX on material and small molecule generation tasks, achieving a significant leap in state-of-the-art performance for material crystal structure prediction and establishing new state-of-the-art results for small molecule structure prediction, de novo design, and conditional generation. Notably, UniGenX demonstrates significant improvements, especially in handling long sequences for complex structures, showcasing its efficacy as a versatile tool for scientific data generation.
Submitted 9 March, 2025; originally announced March 2025.
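The hybrid objective suggested by the abstract, next-token cross-entropy for the discrete stream plus a conditional denoising loss for continuous values, could be toy-rendered as follows. Every module, shape, and the simplified noising schedule are illustrative assumptions, not UniGenX's actual architecture; the point is only that one backbone can feed both a softmax head and a diffusion-style head.

```python
# Hedged sketch: combining next-token prediction with a conditional diffusion head.
import torch
import torch.nn as nn
import torch.nn.functional as F

class DiffusionHead(nn.Module):
    """Predicts the noise added to a continuous value (e.g., coordinates),
    conditioned on the autoregressive backbone's hidden state."""
    def __init__(self, cond_dim, value_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(cond_dim + value_dim + 1, hidden),
            nn.ReLU(),
            nn.Linear(hidden, value_dim),
        )

    def forward(self, cond, noisy_value, t):
        return self.net(torch.cat([cond, noisy_value, t], dim=-1))

def hybrid_loss(token_logits, token_targets, cond, values, diff_head):
    # discrete stream: standard next-token cross-entropy
    ce = F.cross_entropy(token_logits.flatten(0, 1), token_targets.flatten())

    # continuous stream: denoising loss with a simplified linear noise schedule
    t = torch.rand(values.size(0), 1, device=values.device)
    noise = torch.randn_like(values)
    alpha_bar = 1.0 - t
    noisy = alpha_bar.sqrt() * values + (1.0 - alpha_bar).sqrt() * noise

    pred_noise = diff_head(cond, noisy, t)
    return ce + F.mse_loss(pred_noise, noise)
```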
arXiv:2503.06455 (https://arxiv.org/abs/2503.06455) [pdf, other] cs.LG
Privacy Protection in Prosumer Energy Management Based on Federated Learning
Authors: Yunfeng Li, Xiaolin Li Zhitao Li, Gangqiang Li
Abstract: With the booming development of prosumers, there is an urgent need for a prosumer energy management system that takes full advantage of the flexibility of prosumers while accounting for the interests of other parties. However, building such a system will undoubtedly reveal users' privacy. In this paper, by solving the non-independent and identically distributed data (Non-IID) problem in federated learning with the federated cluster average (FedClusAvg) algorithm, prosumers' information can efficiently participate in the intelligent decision-making of the system without revealing privacy. In the proposed FedClusAvg algorithm, each client performs cluster-stratified sampling and multiple iterations. Then, the average weight of the parameters at the sub-server is determined according to the degree of deviation of each parameter from the average parameter. Finally, the sub-server performs multiple local iterations and updates, and then uploads to the main server. The FedClusAvg algorithm has two main advantages. First, the accuracy of the model in the Non-IID case is improved through clustering and parameter-weighted averaging. Second, multiple local iterations and the three-tier framework can effectively reduce communication rounds.
Submitted 9 March, 2025; originally announced March 2025.
Comments: 9 pages, 8 figures
arXiv:2503.06431 (https://arxiv.org/abs/2503.06431) [pdf, other] stat.ME, cs.LG
Fairness-aware organ exchange and kidney paired donation
Authors: Mingrui Zhang, Xiaowu Dai, Lexin Li
Abstract: The kidney paired donation (KPD) program provides an innovative solution to overcome incompatibility challenges in kidney transplants by matching incompatible donor-patient pairs and facilitating kidney exchanges. To address unequal access to transplant opportunities, there are two widely used fairness criteria: group fairness and individual fairness. However, these criteria do not consider protected patient features, which refer to characteristics legally or ethically recognized as needing protection from discrimination, such as race and gender. Motivated by the calibration principle in machine learning, we introduce a new fairness criterion: the matching outcome should be conditionally independent of the protected feature, given the sensitization level. We integrate this fairness criterion as a constraint within the KPD optimization framework and propose a computationally efficient solution. Theoretically, we analyze the associated price of fairness using random graph models. Empirically, we compare our fairness criterion with group fairness and individual fairness through both simulations and a real-data example.
Submitted 8 March, 2025; originally announced March 2025.
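The proposed criterion (matching outcome conditionally independent of the protected feature given sensitization level) can at least be audited with a calibration-style check like the sketch below. The `pairs` records and field names are hypothetical, and the paper enforces the criterion as a constraint inside the KPD optimization rather than checking it after the fact.

```python
# Hedged sketch: within each sensitization level, compare match rates across protected groups.
from collections import defaultdict

def match_rates_by_level(pairs):
    # pairs: list of {"matched": bool, "protected": group, "sensitization": level}
    counts = defaultdict(lambda: [0, 0])            # (matched, total) per (level, group)
    for p in pairs:
        key = (p["sensitization"], p["protected"])
        counts[key][0] += int(p["matched"])
        counts[key][1] += 1
    return {k: m / n for k, (m, n) in counts.items()}

def max_violation(pairs):
    rates = match_rates_by_level(pairs)
    by_level = defaultdict(list)
    for (level, _group), rate in rates.items():
        by_level[level].append(rate)
    # largest within-level gap across protected groups; 0 means the
    # conditional-independence criterion holds exactly on this sample
    return max(max(rs) - min(rs) for rs in by_level.values())
```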
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05281v1-abstract-full').style.display = 'none'; document.getElementById('2503.05281v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+L&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div 
class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
