Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,417 results for author: <span class="mathjax">Yang, X</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yang, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13509">arXiv:2502.13509</a> <span> [<a href="https://arxiv.org/pdf/2502.13509">pdf</a>, <a href="https://arxiv.org/format/2502.13509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unlocking Multimodal Integration in EHRs: A Prompt Learning Framework for Language and Time Series Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+S">Shuai Niu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Liang Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhihua Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+W">Wei Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yida Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guo Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13509v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown remarkable performance in vision-language tasks, but their application in the medical field remains underexplored, particularly for integrating structured time series data with unstructured clinical notes. In clinical practice, dynamic time series data such as lab test results capture critical temporal patterns, while clinical notes provide rich semantic con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13509v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13509v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13509v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown remarkable performance in vision-language tasks, but their application in the medical field remains underexplored, particularly for integrating structured time series data with unstructured clinical notes. In clinical practice, dynamic time series data such as lab test results capture critical temporal patterns, while clinical notes provide rich semantic context. Merging these modalities is challenging due to the inherent differences between continuous signals and discrete text. To bridge this gap, we introduce ProMedTS, a novel self-supervised multimodal framework that employs prompt-guided learning to unify these heterogeneous data types. Our approach leverages lightweight anomaly detection to generate anomaly captions that serve as prompts, guiding the encoding of raw time series data into informative embeddings. These embeddings are aligned with textual representations in a shared latent space, preserving fine-grained temporal nuances alongside semantic insights. Furthermore, our framework incorporates tailored self-supervised objectives to enhance both intra- and inter-modal alignment. We evaluate ProMedTS on disease diagnosis tasks using real-world datasets, and the results demonstrate that our method consistently outperforms state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13509v1-abstract-full').style.display = 'none'; document.getElementById('2502.13509v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13474">arXiv:2502.13474</a> <span> [<a href="https://arxiv.org/pdf/2502.13474">pdf</a>, <a href="https://arxiv.org/format/2502.13474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Lightweight, Adaptive and Attribute-Aware Multi-Aspect Controllable Text Generation with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yefeng Liu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+C">Chenyang Lyu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanhua Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longyue Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weihua Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13474v1-abstract-short" style="display: inline;"> Multi-aspect controllable text generation aims to control text generation in attributes from multiple aspects, making it a complex but powerful task in natural language processing. Supervised fine-tuning methods are often employed for this task due to their simplicity and effectiveness. However, they still have some limitations: low rank adaptation (LoRA) only fine-tunes a few parameters and has s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13474v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13474v1-abstract-full" style="display: none;"> Multi-aspect controllable text generation aims to control text generation in attributes from multiple aspects, making it a complex but powerful task in natural language processing. Supervised fine-tuning methods are often employed for this task due to their simplicity and effectiveness. However, they still have some limitations: low rank adaptation (LoRA) only fine-tunes a few parameters and has suboptimal control effects, while full fine-tuning (FFT) requires significant computational resources and is susceptible to overfitting, particularly when data is limited. Moreover, existing works typically train multi-aspect controllable text generation models using only single-aspect annotated data, which results in discrepancies in data distribution; at the same time, accurately generating text with specific attributes is a challenge that requires strong attribute-aware capabilities. To address these limitations, we propose a lightweight, adaptive and attribute-aware framework for multi-aspect controllable text generation. Our framework can dynamically adjust model parameters according to different aspects of data to achieve controllable text generation, aiming to optimize performance across multiple aspects. Experimental results show that our framework outperforms other strong baselines, achieves state-of-the-art performance, adapts well to data discrepancies, and is more accurate in attribute perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13474v1-abstract-full').style.display = 'none'; document.getElementById('2502.13474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages,9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12791">arXiv:2502.12791</a> <span> [<a href="https://arxiv.org/pdf/2502.12791">pdf</a>, <a href="https://arxiv.org/format/2502.12791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Timesteps: A Novel Activation-wise Membrane Potential Propagation Mechanism for Spiking Neural Networks in 3D cloud </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+J">Jian Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Boxuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiangfei Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Donglin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12791v1-abstract-short" style="display: inline;"> Due to the similar characteristics between event-based visual data and point clouds, recent studies have emerged that treat event data as event clouds to learn based on point cloud analysis. Additionally, some works approach point clouds from the perspective of event vision, employing Spiking Neural Network (SNN) due to their asynchronous nature. However, these contributions are often domain-speci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12791v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12791v1-abstract-full" style="display: none;"> Due to the similar characteristics between event-based visual data and point clouds, recent studies have emerged that treat event data as event clouds to learn based on point cloud analysis. Additionally, some works approach point clouds from the perspective of event vision, employing Spiking Neural Network (SNN) due to their asynchronous nature. However, these contributions are often domain-specific, making it difficult to extend their applicability to other intersecting fields. Moreover, while SNN-based visual tasks have seen significant growth, the conventional timestep-wise iterative activation strategy largely limits their real-world applications by large timesteps, resulting in significant delays and increased computational costs. Although some innovative methods achieve good performance with short timesteps (<10), few have fundamentally restructured the update strategy of spiking neurons to completely overcome the limitations of timesteps. In response to these concerns, we propose a novel and general activation strategy for spiking neurons called Activation-wise Membrane Potential Propagation (AMP2). This approach extends the concept of timesteps from a manually crafted parameter within the activation function to any existing network structure. In experiments on common point cloud tasks (classification, object, and scene segmentation) and event cloud tasks (action recognition), we found that AMP2 stabilizes SNN training, maintains competitive performance, and reduces latency compared to the traditional timestep-wise activation paradigm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12791v1-abstract-full').style.display = 'none'; document.getElementById('2502.12791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12674">arXiv:2502.12674</a> <span> [<a href="https://arxiv.org/pdf/2502.12674">pdf</a>, <a href="https://arxiv.org/format/2502.12674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SATA: Safe and Adaptive Torque-Based Locomotion Policies Inspired by Animal Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+P">Peizhuo Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongyi Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Ge Sun</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jin Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinrong Yang</a>, <a href="/search/cs?searchtype=author&query=Bellegarda%2C+G">Guillaume Bellegarda</a>, <a href="/search/cs?searchtype=author&query=Shafiee%2C+M">Milad Shafiee</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuhong Cao</a>, <a href="/search/cs?searchtype=author&query=Ijspeert%2C+A">Auke Ijspeert</a>, <a href="/search/cs?searchtype=author&query=Sartoretti%2C+G">Guillaume Sartoretti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12674v1-abstract-short" style="display: inline;"> Despite recent advances in learning-based controllers for legged robots, deployments in human-centric environments remain limited by safety concerns. Most of these approaches use position-based control, where policies output target joint angles that must be processed by a low-level controller (e.g., PD or impedance controllers) to compute joint torques. Although impressive results have been achiev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12674v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12674v1-abstract-full" style="display: none;"> Despite recent advances in learning-based controllers for legged robots, deployments in human-centric environments remain limited by safety concerns. Most of these approaches use position-based control, where policies output target joint angles that must be processed by a low-level controller (e.g., PD or impedance controllers) to compute joint torques. Although impressive results have been achieved in controlled real-world scenarios, these methods often struggle with compliance and adaptability when encountering environments or disturbances unseen during training, potentially resulting in extreme or unsafe behaviors. Inspired by how animals achieve smooth and adaptive movements by controlling muscle extension and contraction, torque-based policies offer a promising alternative by enabling precise and direct control of the actuators in torque space. In principle, this approach facilitates more effective interactions with the environment, resulting in safer and more adaptable behaviors. However, challenges such as a highly nonlinear state space and inefficient exploration during training have hindered their broader adoption. To address these limitations, we propose SATA, a bio-inspired framework that mimics key biomechanical principles and adaptive learning mechanisms observed in animal locomotion. Our approach effectively addresses the inherent challenges of learning torque-based policies by significantly improving early-stage exploration, leading to high-performance final policies. Remarkably, our method achieves zero-shot sim-to-real transfer. Our experimental results indicate that SATA demonstrates remarkable compliance and safety, even in challenging environments such as soft/slippery terrain or narrow passages, and under significant external disturbances, highlighting its potential for practical deployments in human-centric and safety-critical scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12674v1-abstract-full').style.display = 'none'; document.getElementById('2502.12674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12530">arXiv:2502.12530</a> <span> [<a href="https://arxiv.org/pdf/2502.12530">pdf</a>, <a href="https://arxiv.org/format/2502.12530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Policy-to-Language: Train LLMs to Explain Decisions with Flow-Matching Generated Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+L">Liang Zeng</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Heng Dong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoran Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Huazhong Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Tambe%2C+M">Milind Tambe</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tonghan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12530v1-abstract-short" style="display: inline;"> As humans increasingly share environments with diverse agents powered by RL, LLMs, and beyond, the ability to explain their policies in natural language will be vital for reliable coexistence. In this paper, we build a model-agnostic explanation generator based on an LLM. The technical novelty is that the rewards for training this LLM are generated by a generative flow matching model. This model h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12530v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12530v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12530v1-abstract-full" style="display: none;"> As humans increasingly share environments with diverse agents powered by RL, LLMs, and beyond, the ability to explain their policies in natural language will be vital for reliable coexistence. In this paper, we build a model-agnostic explanation generator based on an LLM. The technical novelty is that the rewards for training this LLM are generated by a generative flow matching model. This model has a specially designed structure with a hidden layer merged with an LLM to harness the linguistic cues of explanations into generating appropriate rewards. Experiments on both RL and LLM tasks demonstrate that our method can generate dense and effective rewards while saving on expensive human feedback; it thus enables effective explanations and even improves the accuracy of the decisions in original tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12530v1-abstract-full').style.display = 'none'; document.getElementById('2502.12530v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12514">arXiv:2502.12514</a> <span> [<a href="https://arxiv.org/pdf/2502.12514">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Memory-updated-based Framework for 100% Reliable Flexible Flat Cables Insertion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhengrong Ling</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiong Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dong Guo</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Hongyuan Chang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tieshan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yajing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12514v1-abstract-short" style="display: inline;"> Automatic assembly lines have increasingly replaced human labor in various tasks; however, the automation of Flexible Flat Cable (FFC) insertion remains unrealized due to its high requirement for effective feedback and dynamic operation, limiting approximately 11% of global industrial capacity. Despite lots of approaches, like vision-based tactile sensors and reinforcement learning, having been pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12514v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12514v1-abstract-full" style="display: none;"> Automatic assembly lines have increasingly replaced human labor in various tasks; however, the automation of Flexible Flat Cable (FFC) insertion remains unrealized due to its high requirement for effective feedback and dynamic operation, limiting approximately 11% of global industrial capacity. Despite lots of approaches, like vision-based tactile sensors and reinforcement learning, having been proposed, the implementation of human-like high-reliable insertion (i.e., with a 100% success rate in completed insertion) remains a big challenge. Drawing inspiration from human behavior in FFC insertion, which involves sensing three-dimensional forces, translating them into physical concepts, and continuously improving estimates, we propose a novel framework. This framework includes a sensing module for collecting three-dimensional tactile data, a perception module for interpreting this data into meaningful physical signals, and a memory module based on Bayesian theory for reliability estimation and control. This strategy enables the robot to accurately assess its physical state and generate reliable status estimations and corrective actions. Experimental results demonstrate that the robot using this framework can detect alignment errors of 0.5 mm with an accuracy of 97.92% and then achieve a 100% success rate in all completed tests after a few iterations. This work addresses the challenges of unreliable perception and control in complex insertion tasks, highlighting the path toward the development of fully automated production lines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12514v1-abstract-full').style.display = 'none'; document.getElementById('2502.12514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12488">arXiv:2502.12488</a> <span> [<a href="https://arxiv.org/pdf/2502.12488">pdf</a>, <a href="https://arxiv.org/format/2502.12488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Audio-Visual Spiking Neural Networks through Semantic-Alignment and Cross-Modal Residual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xiang He</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yiting Dong</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+G">Guobin Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12488v1-abstract-short" style="display: inline;"> Humans interpret and perceive the world by integrating sensory information from multiple modalities, such as vision and hearing. Spiking Neural Networks (SNNs), as brain-inspired computational models, exhibit unique advantages in emulating the brain's information processing mechanisms. However, existing SNN models primarily focus on unimodal processing and lack efficient cross-modal information fu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12488v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12488v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12488v1-abstract-full" style="display: none;"> Humans interpret and perceive the world by integrating sensory information from multiple modalities, such as vision and hearing. Spiking Neural Networks (SNNs), as brain-inspired computational models, exhibit unique advantages in emulating the brain's information processing mechanisms. However, existing SNN models primarily focus on unimodal processing and lack efficient cross-modal information fusion, thereby limiting their effectiveness in real-world multimodal scenarios. To address this challenge, we propose a semantic-alignment cross-modal residual learning (S-CMRL) framework, a Transformer-based multimodal SNN architecture designed for effective audio-visual integration. S-CMRL leverages a spatiotemporal spiking attention mechanism to extract complementary features across modalities, and incorporates a cross-modal residual learning strategy to enhance feature integration. Additionally, a semantic alignment optimization mechanism is introduced to align cross-modal features within a shared semantic space, improving their consistency and complementarity. Extensive experiments on three benchmark datasets CREMA-D, UrbanSound8K-AV, and MNISTDVS-NTIDIGITS demonstrate that S-CMRL significantly outperforms existing multimodal SNN methods, achieving the state-of-the-art performance. The code is publicly available at https://github.com/Brain-Cog-Lab/S-CMRL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12488v1-abstract-full').style.display = 'none'; document.getElementById('2502.12488v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The manuscript is under review and the code is available https://github.com/Brain-Cog-Lab/S-CMRL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12176">arXiv:2502.12176</a> <span> [<a href="https://arxiv.org/pdf/2502.12176">pdf</a>, <a href="https://arxiv.org/format/2502.12176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ten Challenging Problems in Federated Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+T">Tao Fan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hanlin Gu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuemei Cao</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+C+S">Chee Seng Chan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yihui Feng</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yang Gu</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+J">Jiaxiang Geng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bing Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuoling Liu</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+W+K">Win Kent Ong</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Chao Ren</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jiaqi Shao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chuan Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiaoli Tang</a>, <a href="/search/cs?searchtype=author&query=Tae%2C+H+X">Hong Xi Tae</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+Y">Yongxin Tong</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shuyue Wei</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+W">Wei Xi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mingcong Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">He Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiangpeng Yan</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12176v1-abstract-short" style="display: inline;"> Federated Foundation Models (FedFMs) represent a distributed learning paradigm that fuses general competences of foundation models as well as privacy-preserving capabilities of federated learning. This combination allows the large foundation models and the small local domain models at the remote clients to learn from each other in a teacher-student learning setting. This paper provides a comprehen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12176v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12176v1-abstract-full" style="display: none;"> Federated Foundation Models (FedFMs) represent a distributed learning paradigm that fuses general competences of foundation models as well as privacy-preserving capabilities of federated learning. This combination allows the large foundation models and the small local domain models at the remote clients to learn from each other in a teacher-student learning setting. This paper provides a comprehensive summary of the ten challenging problems inherent in FedFMs, encompassing foundational theory, utilization of private data, continual learning, unlearning, Non-IID and graph data, bidirectional knowledge transfer, incentive mechanism design, game mechanism design, model watermarking, and efficiency. The ten challenging problems manifest in five pivotal aspects: ``Foundational Theory," which aims to establish a coherent and unifying theoretical framework for FedFMs. ``Data," addressing the difficulties in leveraging domain-specific knowledge from private data while maintaining privacy; ``Heterogeneity," examining variations in data, model, and computational resources across clients; ``Security and Privacy," focusing on defenses against malicious attacks and model theft; and ``Efficiency," highlighting the need for improvements in training, communication, and parameter efficiency. For each problem, we offer a clear mathematical definition on the objective function, analyze existing methods, and discuss the key challenges and potential solutions. This in-depth exploration aims to advance the theoretical foundations of FedFMs, guide practical implementations, and inspire future research to overcome these obstacles, thereby enabling the robust, efficient, and privacy-preserving FedFMs in various real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12176v1-abstract-full').style.display = 'none'; document.getElementById('2502.12176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12066">arXiv:2502.12066</a> <span> [<a href="https://arxiv.org/pdf/2502.12066">pdf</a>, <a href="https://arxiv.org/format/2502.12066">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> CONSTRUCTA: Automating Commercial Construction Schedules in Fabrication Facilities with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12066v1-abstract-short" style="display: inline;"> Automating planning with LLMs presents transformative opportunities for traditional industries, yet remains underexplored. In commercial construction, the complexity of automated scheduling often requires manual intervention to ensure precision. We propose CONSTRUCTA, a novel framework leveraging LLMs to optimize construction schedules in complex projects like semiconductor fabrication. CONSTRUCTA… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12066v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12066v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12066v1-abstract-full" style="display: none;"> Automating planning with LLMs presents transformative opportunities for traditional industries, yet remains underexplored. In commercial construction, the complexity of automated scheduling often requires manual intervention to ensure precision. We propose CONSTRUCTA, a novel framework leveraging LLMs to optimize construction schedules in complex projects like semiconductor fabrication. CONSTRUCTA addresses key challenges by: (1) integrating construction-specific knowledge through static RAG; (2) employing context-sampling techniques inspired by architectural expertise to provide relevant input; and (3) deploying Construction DPO to align schedules with expert preferences using RLHF. Experiments on proprietary data demonstrate performance improvements of +42.3% in missing value prediction, +79.1% in dependency analysis, and +28.9% in automated planning compared to baseline methods, showcasing its potential to revolutionize construction workflows and inspire domain-specific LLM advancements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12066v1-abstract-full').style.display = 'none'; document.getElementById('2502.12066v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span> [<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+A">Ailin Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Boyong Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bruce Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chao Yan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chen Hu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengli Feng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Fei Tian</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingbei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Wang You</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuerui Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Brian Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengting Feng</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11506">arXiv:2502.11506</a> <span> [<a href="https://arxiv.org/pdf/2502.11506">pdf</a>, <a href="https://arxiv.org/format/2502.11506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Learning Surrogate Potential Mean Field Games via Gaussian Processes: A Data-Driven Approach to Ill-Posed Inverse Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingguo Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianjin Yang</a>, <a href="/search/cs?searchtype=author&query=Mou%2C+C">Chenchen Mou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chao Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11506v1-abstract-short" style="display: inline;"> Mean field games (MFGs) describe the collective behavior of large populations of interacting agents. In this work, we tackle ill-posed inverse problems in potential MFGs, aiming to recover the agents' population, momentum, and environmental setup from limited, noisy measurements and partial observations. These problems are ill-posed because multiple MFG configurations can explain the same data, or… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11506v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11506v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11506v1-abstract-full" style="display: none;"> Mean field games (MFGs) describe the collective behavior of large populations of interacting agents. In this work, we tackle ill-posed inverse problems in potential MFGs, aiming to recover the agents' population, momentum, and environmental setup from limited, noisy measurements and partial observations. These problems are ill-posed because multiple MFG configurations can explain the same data, or different parameters can yield nearly identical observations. Nonetheless, they remain crucial in practice for real-world scenarios where data are inherently sparse or noisy, or where the MFG structure is not fully determined. Our focus is on finding surrogate MFGs that accurately reproduce the observed data despite these challenges. We propose two Gaussian process (GP)-based frameworks: an inf-sup formulation and a bilevel approach. The choice between them depends on whether the unknown parameters introduce concavity in the objective. In the inf-sup framework, we use the linearity of GPs and their parameterization structure to maintain convex-concave properties, allowing us to apply standard convex optimization algorithms. In the bilevel framework, we employ a gradient-descent-based algorithm and introduce two methods for computing the outer gradient. The first method leverages an existing solver for the inner potential MFG and applies automatic differentiation, while the second adopts an adjoint-based strategy that computes the outer gradient independently of the inner solver. Our numerical experiments show that when sufficient prior information is available, the unknown parameters can be accurately recovered. Otherwise, if prior information is limited, the inverse problem is ill-posed, but our frameworks can still produce surrogate MFG models that closely match observed data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11506v1-abstract-full').style.display = 'none'; document.getElementById('2502.11506v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11390">arXiv:2502.11390</a> <span> [<a href="https://arxiv.org/pdf/2502.11390">pdf</a>, <a href="https://arxiv.org/format/2502.11390">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MARS: Mesh AutoRegressive Model for 3D Shape Detailization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jingnan Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weizhe Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Weixuan Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Senbo Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xibin Song</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+T">Taizhang Shang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shenzhou Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongdong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yichao Yan</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+P">Pan Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11390v1-abstract-short" style="display: inline;"> State-of-the-art methods for mesh detailization predominantly utilize Generative Adversarial Networks (GANs) to generate detailed meshes from coarse ones. These methods typically learn a specific style code for each category or similar categories without enforcing geometry supervision across different Levels of Detail (LODs). Consequently, such methods often fail to generalize across a broader ran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11390v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11390v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11390v1-abstract-full" style="display: none;"> State-of-the-art methods for mesh detailization predominantly utilize Generative Adversarial Networks (GANs) to generate detailed meshes from coarse ones. These methods typically learn a specific style code for each category or similar categories without enforcing geometry supervision across different Levels of Detail (LODs). Consequently, such methods often fail to generalize across a broader range of categories and cannot ensure shape consistency throughout the detailization process. In this paper, we introduce MARS, a novel approach for 3D shape detailization. Our method capitalizes on a novel multi-LOD, multi-category mesh representation to learn shape-consistent mesh representations in latent space across different LODs. We further propose a mesh autoregressive model capable of generating such latent representations through next-LOD token prediction. This approach significantly enhances the realism of the generated shapes. Extensive experiments conducted on the challenging 3D Shape Detailization benchmark demonstrate that our proposed MARS model achieves state-of-the-art performance, surpassing existing methods in both qualitative and quantitative assessments. Notably, the model's capability to generate fine-grained details while preserving the overall shape integrity is particularly commendable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11390v1-abstract-full').style.display = 'none'; document.getElementById('2502.11390v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11090">arXiv:2502.11090</a> <span> [<a href="https://arxiv.org/pdf/2502.11090">pdf</a>, <a href="https://arxiv.org/format/2502.11090">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+H">Hongye Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanming Wang</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+S">Sijia Jing</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Z">Ziyue Peng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zhixin Bai</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhe Cao</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fan Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianpei Yang</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanyu Meng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chao Deng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Junlan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11090v2-abstract-short" style="display: inline;"> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM's capability of identifying and handling unsafe information in detail. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11090v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11090v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11090v2-abstract-full" style="display: none;"> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM's capability of identifying and handling unsafe information in detail. To address these issues, we propose a fine-grained benchmark SafeDialBench for evaluating the safety of LLMs across various jailbreak attacks in multi-turn dialogues. Specifically, we design a two-tier hierarchical safety taxonomy that considers 6 safety dimensions and generates more than 4000 multi-turn dialogues in both Chinese and English under 22 dialogue scenarios. We employ 7 jailbreak attack strategies, such as reference attack and purpose reverse, to enhance the dataset quality for dialogue generation. Notably, we construct an innovative assessment framework of LLMs, measuring capabilities in detecting, and handling unsafe information and maintaining consistency when facing jailbreak attacks. Experimental results across 17 LLMs reveal that Yi-34B-Chat and GLM4-9B-Chat demonstrate superior safety performance, while Llama3.1-8B-Instruct and o3-mini exhibit safety vulnerabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11090v2-abstract-full').style.display = 'none'; document.getElementById('2502.11090v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10723">arXiv:2502.10723</a> <span> [<a href="https://arxiv.org/pdf/2502.10723">pdf</a>, <a href="https://arxiv.org/format/2502.10723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Mathematics Framework of Artificial Shifted Population Risk and Its Further Understanding Related to Consistency Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiliang Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shenyang Deng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shicong Liu</a>, <a href="/search/cs?searchtype=author&query=Suo%2C+Y">Yuanchi Suo</a>, <a href="/search/cs?searchtype=author&query=NG%2C+W+W+Y">Wing. W. Y NG</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10723v1-abstract-short" style="display: inline;"> Data augmentation is an important technique in training deep neural networks as it enhances their ability to generalize and remain robust. While data augmentation is commonly used to expand the sample size and act as a consistency regularization term, there is a lack of research on the relationship between them. To address this gap, this paper introduces a more comprehensive mathematical framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10723v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10723v1-abstract-full" style="display: none;"> Data augmentation is an important technique in training deep neural networks as it enhances their ability to generalize and remain robust. While data augmentation is commonly used to expand the sample size and act as a consistency regularization term, there is a lack of research on the relationship between them. To address this gap, this paper introduces a more comprehensive mathematical framework for data augmentation. Through this framework, we establish that the expected risk of the shifted population is the sum of the original population risk and a gap term, which can be interpreted as a consistency regularization term. The paper also provides a theoretical understanding of this gap, highlighting its negative effects on the early stages of training. We also propose a method to mitigate these effects. To validate our approach, we conducted experiments using same data augmentation techniques and computing resources under several scenarios, including standard training, out-of-distribution, and imbalanced classification. The results demonstrate that our methods surpass compared methods under all scenarios in terms of generalization ability and convergence stability. We provide our code implementation at the following link: https://github.com/ydlsfhll/ASPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10723v1-abstract-full').style.display = 'none'; document.getElementById('2502.10723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10435">arXiv:2502.10435</a> <span> [<a href="https://arxiv.org/pdf/2502.10435">pdf</a>, <a href="https://arxiv.org/format/2502.10435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RAMer: Reconstruction-based Adversarial Model for Multi-party Multi-modal Multi-label Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xudong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yizhang Zhu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+N">Nan Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yuyu Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10435v1-abstract-short" style="display: inline;"> Conventional multi-modal multi-label emotion recognition (MMER) from videos typically assumes full availability of visual, textual, and acoustic modalities. However, real-world multi-party settings often violate this assumption, as non-speakers frequently lack acoustic and textual inputs, leading to a significant degradation in model performance. Existing approaches also tend to unify heterogeneou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10435v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10435v1-abstract-full" style="display: none;"> Conventional multi-modal multi-label emotion recognition (MMER) from videos typically assumes full availability of visual, textual, and acoustic modalities. However, real-world multi-party settings often violate this assumption, as non-speakers frequently lack acoustic and textual inputs, leading to a significant degradation in model performance. Existing approaches also tend to unify heterogeneous modalities into a single representation, overlooking each modality's unique characteristics. To address these challenges, we propose RAMer (Reconstruction-based Adversarial Model for Emotion Recognition), which leverages adversarial learning to refine multi-modal representations by exploring both modality commonality and specificity through reconstructed features enhanced by contrastive learning. RAMer also introduces a personality auxiliary task to complement missing modalities using modality-level attention, improving emotion reasoning. To further strengthen the model's ability to capture label and modality interdependency, we propose a stack shuffle strategy to enrich correlations between labels and modality-specific features. Experiments on three benchmarks, i.e., MEmoR, CMU-MOSEI, and $M^3$ED, demonstrate that RAMer achieves state-of-the-art performance in dyadic and multi-party MMER scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10435v1-abstract-full').style.display = 'none'; document.getElementById('2502.10435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span> [<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v2-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v2-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v2-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v2-abstract-full').style.display = 'none'; document.getElementById('2502.10248v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09471">arXiv:2502.09471</a> <span> [<a href="https://arxiv.org/pdf/2502.09471">pdf</a>, <a href="https://arxiv.org/format/2502.09471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Wholly-WOOD: Wholly Leveraging Diversified-quality Labels for Weakly-supervised Oriented Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yansheng Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a>, <a href="/search/cs?searchtype=author&query=Da%2C+F">Feipeng Da</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09471v1-abstract-short" style="display: inline;"> Accurately estimating the orientation of visual objects with compact rotated bounding boxes (RBoxes) has become a prominent demand, which challenges existing object detection paradigms that only use horizontal bounding boxes (HBoxes). To equip the detectors with orientation awareness, supervised regression/classification modules have been introduced at the high cost of rotation annotation. Meanwhi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09471v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09471v1-abstract-full" style="display: none;"> Accurately estimating the orientation of visual objects with compact rotated bounding boxes (RBoxes) has become a prominent demand, which challenges existing object detection paradigms that only use horizontal bounding boxes (HBoxes). To equip the detectors with orientation awareness, supervised regression/classification modules have been introduced at the high cost of rotation annotation. Meanwhile, some existing datasets with oriented objects are already annotated with horizontal boxes or even single points. It becomes attractive yet remains open for effectively utilizing weaker single point and horizontal annotations to train an oriented object detector (OOD). We develop Wholly-WOOD, a weakly-supervised OOD framework, capable of wholly leveraging various labeling forms (Points, HBoxes, RBoxes, and their combination) in a unified fashion. By only using HBox for training, our Wholly-WOOD achieves performance very close to that of the RBox-trained counterpart on remote sensing and other areas, significantly reducing the tedious efforts on labor-intensive annotation for oriented objects. The source codes are available at https://github.com/VisionXLab/whollywood (PyTorch-based) and https://github.com/VisionXLab/whollywood-jittor (Jittor-based). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09471v1-abstract-full').style.display = 'none'; document.getElementById('2502.09471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 9 figures, 9 tables, accepted by TPAMI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09447">arXiv:2502.09447</a> <span> [<a href="https://arxiv.org/pdf/2502.09447">pdf</a>, <a href="https://arxiv.org/format/2502.09447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Pixel-Level Reasoning Segmentation via Multi-turn Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+D">Dexian Cai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaocui Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongkang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Daling Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shi Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Poria%2C+S">Soujanya Poria</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09447v1-abstract-short" style="display: inline;"> Existing visual perception systems focus on region-level segmentation in single-turn dialogues, relying on complex and explicit query instructions. Such systems cannot reason at the pixel level and comprehend dynamic user intent that changes over interaction. Our work tackles this issue by introducing a novel task, Pixel-level Reasoning Segmentation (Pixel-level RS) based on multi-turn conversatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09447v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09447v1-abstract-full" style="display: none;"> Existing visual perception systems focus on region-level segmentation in single-turn dialogues, relying on complex and explicit query instructions. Such systems cannot reason at the pixel level and comprehend dynamic user intent that changes over interaction. Our work tackles this issue by introducing a novel task, Pixel-level Reasoning Segmentation (Pixel-level RS) based on multi-turn conversations, tracking evolving user intent via multi-turn interactions for fine-grained segmentation. To establish a benchmark for this novel task, we build a Pixel-level ReasonIng Segmentation Dataset Based on Multi-Turn Conversations (PRIST), comprising 24k utterances from 8.3k multi-turn conversational scenarios with segmentation targets. Building on PRIST, we further propose MIRAS, a Multi-turn Interactive ReAsoning Segmentation framework, integrates pixel-level segmentation with robust multi-turn conversation understanding, generating pixel-grounded explanations aligned with user intent. The PRIST dataset and MIRSA framework fill the gap in pixel-level reasoning segmentation. Experimental results on the PRIST dataset demonstrate that our method outperforms current segmentation-specific baselines in terms of segmentation and LLM-based reasoning metrics. The code and data are available at: https://github.com/ccccai239/PixelRIST. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09447v1-abstract-full').style.display = 'none'; document.getElementById('2502.09447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09313">arXiv:2502.09313</a> <span> [<a href="https://arxiv.org/pdf/2502.09313">pdf</a>, <a href="https://arxiv.org/format/2502.09313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Delay Performance Analysis with Short Packets in Intelligent Machine Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenyan Xu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhiqing Wei</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhiqun Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haotian Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Ying Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yashan Pang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09313v2-abstract-short" style="display: inline;"> With the rapid development of delay-sensitive services happened in industrial manufacturing, Internet of Vehicles, and smart logistics, more stringent delay requirements are put forward for the intelligent machine (IM) network. Short packet transmissions are widely adopted to reduce delay in IM networks. However, the delay performance of an IM network has not been sufficiently analyzed. This paper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09313v2-abstract-full').style.display = 'inline'; document.getElementById('2502.09313v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09313v2-abstract-full" style="display: none;"> With the rapid development of delay-sensitive services happened in industrial manufacturing, Internet of Vehicles, and smart logistics, more stringent delay requirements are put forward for the intelligent machine (IM) network. Short packet transmissions are widely adopted to reduce delay in IM networks. However, the delay performance of an IM network has not been sufficiently analyzed. This paper applies queuing theory and stochastic geometry to construct network model and transmission model for downlink communication, respectively, proposes and derives the following three metrics, e.g., the transmission success probability (with delay as the threshold), expected delay, and delay jitter. To accurately characterize the transmission delay with short packets, the finite blocklength capacity is used to measure the channel transmission rate. Simulation results show that the increase of packet length and IM density significantly deteriorates the three metrics. Short packets are needed to improve the three metrics, especially in high IM density scenarios. The outcomes of this paper provide an important theoretical basis for the optimization design and performance improvement of IM networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09313v2-abstract-full').style.display = 'none'; document.getElementById('2502.09313v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09296">arXiv:2502.09296</a> <span> [<a href="https://arxiv.org/pdf/2502.09296">pdf</a>, <a href="https://arxiv.org/format/2502.09296">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> A Physics-Informed Deep Learning Model for MRI Brain Motion Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Safari%2C+M">Mojtaba Safari</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shansong Wang</a>, <a href="/search/cs?searchtype=author&query=Eidex%2C+Z">Zach Eidex</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R">Richard Qiu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Chih-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D+S">David S. Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09296v1-abstract-short" style="display: inline;"> Background: MRI is crucial for brain imaging but is highly susceptible to motion artifacts due to long acquisition times. This study introduces PI-MoCoNet, a physics-informed motion correction network that integrates spatial and k-space information to remove motion artifacts without explicit motion parameter estimation, enhancing image fidelity and diagnostic reliability. Materials and Methods: PI… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09296v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09296v1-abstract-full" style="display: none;"> Background: MRI is crucial for brain imaging but is highly susceptible to motion artifacts due to long acquisition times. This study introduces PI-MoCoNet, a physics-informed motion correction network that integrates spatial and k-space information to remove motion artifacts without explicit motion parameter estimation, enhancing image fidelity and diagnostic reliability. Materials and Methods: PI-MoCoNet consists of a motion detection network (U-net with spatial averaging) to identify corrupted k-space lines and a motion correction network (U-net with Swin Transformer blocks) to reconstruct motion-free images. The correction is guided by three loss functions: reconstruction (L1), perceptual (LPIPS), and data consistency (Ldc). Motion artifacts were simulated via rigid phase encoding perturbations and evaluated on IXI and MR-ART datasets against Pix2Pix, CycleGAN, and U-net using PSNR, SSIM, and NMSE. Results: PI-MoCoNet significantly improved image quality. On IXI, for minor artifacts, PSNR increased from 34.15 dB to 45.95 dB, SSIM from 0.87 to 1.00, and NMSE reduced from 0.55% to 0.04%. For moderate artifacts, PSNR improved from 30.23 dB to 42.16 dB, SSIM from 0.80 to 0.99, and NMSE from 1.32% to 0.09%. For heavy artifacts, PSNR rose from 27.99 dB to 36.01 dB, SSIM from 0.75 to 0.97, and NMSE decreased from 2.21% to 0.36%. On MR-ART, PI-MoCoNet achieved PSNR gains of ~10 dB and SSIM improvements of up to 0.20, with NMSE reductions of ~6%. Ablation studies confirmed the importance of data consistency and perceptual losses, yielding a 1 dB PSNR gain and 0.17% NMSE reduction. Conclusions: PI-MoCoNet effectively mitigates motion artifacts in brain MRI, outperforming existing methods. Its ability to integrate spatial and k-space information makes it a promising tool for clinical use in motion-prone settings. Code: https://github.com/mosaf/PI-MoCoNet.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09296v1-abstract-full').style.display = 'none'; document.getElementById('2502.09296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08248">arXiv:2502.08248</a> <span> [<a href="https://arxiv.org/pdf/2502.08248">pdf</a>, <a href="https://arxiv.org/ps/2502.08248">ps</a>, <a href="https://arxiv.org/format/2502.08248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Theoretical Economics">econ.TH</span> </div> </div> <p class="title is-5 mathjax"> Mechanism Design in Max-Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shengyuan Huang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+W">Wenjun Mei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoguang Yang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhigang Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08248v1-abstract-short" style="display: inline;"> This paper studies allocation mechanisms in max-flow games with players' capacities as private information. We first show that no core-selection mechanism is truthful: there may exist a player whose payoff increases if she under-reports her capacity when a core-section mechanism is adopted. We then introduce five desirable properties for mechanisms in max-flow games: DSIC (truthful reporting is a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08248v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08248v1-abstract-full" style="display: none;"> This paper studies allocation mechanisms in max-flow games with players' capacities as private information. We first show that no core-selection mechanism is truthful: there may exist a player whose payoff increases if she under-reports her capacity when a core-section mechanism is adopted. We then introduce five desirable properties for mechanisms in max-flow games: DSIC (truthful reporting is a dominant strategy), SIR (individual rationality and positive payoff for each player contributing positively to at least one coalition), SP (no edge has an incentive to split into parallel edges), MP (no parallel edges have incentives to merge), and CM (a player's payoff does not decrease as another player's capacity and max-flow increase). While the Shapley value mechanism satisfies DSIC and SIR, it fails to meet SP, MP and CM. We propose a new mechanism based on minimal cuts that satisfies all five properties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08248v1-abstract-full').style.display = 'none'; document.getElementById('2502.08248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 9 figures, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07620">arXiv:2502.07620</a> <span> [<a href="https://arxiv.org/pdf/2502.07620">pdf</a>, <a href="https://arxiv.org/format/2502.07620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Causal-Informed Contrastive Learning: Towards Bias-Resilient Pre-training under Concept Drift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jie Lu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+E">En Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07620v1-abstract-short" style="display: inline;"> The evolution of large-scale contrastive pre-training propelled by top-tier datasets has reached a transition point in the scaling law. Consequently, sustaining and enhancing a model's pre-training capabilities in drift environments have surfaced as a notable challenge. In this paper, we initially uncover that contrastive pre-training methods are significantly impacted by concept drift wherein dis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07620v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07620v1-abstract-full" style="display: none;"> The evolution of large-scale contrastive pre-training propelled by top-tier datasets has reached a transition point in the scaling law. Consequently, sustaining and enhancing a model's pre-training capabilities in drift environments have surfaced as a notable challenge. In this paper, we initially uncover that contrastive pre-training methods are significantly impacted by concept drift wherein distributions change unpredictably, resulting in notable biases in the feature space of the pre-trained model. Empowered by causal inference, we construct a structural causal graph to analyze the impact of concept drift to contrastive pre-training systemically, and propose the causal interventional contrastive objective. Upon achieving this, we devise a resilient contrastive pre-training approach to accommodate the data stream of concept drift, with simple and scalable implementation. Extensive experiments on various downstream tasks demonstrate our resilient contrastive pre-training effectively mitigates the bias stemming from the concept drift data stream. Codes are available at https://anonymous.4open.science/r/ResilientCL/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07620v1-abstract-full').style.display = 'none'; document.getElementById('2502.07620v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07599">arXiv:2502.07599</a> <span> [<a href="https://arxiv.org/pdf/2502.07599">pdf</a>, <a href="https://arxiv.org/format/2502.07599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DPO-Shift: Shifting the Distribution of Direct Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiliang Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Feng Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qianen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07599v1-abstract-short" style="display: inline;"> Direct Preference Optimization (DPO) and its variants have become increasingly popular for aligning language models with human preferences. These methods aim to teach models to better distinguish between chosen (or preferred) and rejected (or dispreferred) responses. However, prior research has identified that the probability of chosen responses often decreases during training, and this phenomenon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07599v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07599v1-abstract-full" style="display: none;"> Direct Preference Optimization (DPO) and its variants have become increasingly popular for aligning language models with human preferences. These methods aim to teach models to better distinguish between chosen (or preferred) and rejected (or dispreferred) responses. However, prior research has identified that the probability of chosen responses often decreases during training, and this phenomenon is known as likelihood displacement. To tackle this challenge, in this work we introduce \method to controllably shift the distribution of the chosen probability. Then, we show that \method exhibits a fundamental trade-off between improving the chosen probability and sacrificing the reward margin, as supported by both theoretical analysis and experimental validation. Furthermore, we demonstrate the superiority of \method over DPO on downstream tasks such as MT-Bench and a designed win rate experiment. We believe this study shows that the likelihood displacement issue of DPO can be effectively mitigated with a simple, theoretically grounded solution. Our code is available at https://github.com/Meaquadddd/DPO-Shift. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07599v1-abstract-full').style.display = 'none'; document.getElementById('2502.07599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07411">arXiv:2502.07411</a> <span> [<a href="https://arxiv.org/pdf/2502.07411">pdf</a>, <a href="https://arxiv.org/format/2502.07411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> EgoTextVQA: Towards Egocentric Scene-Text Aware Video Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Sheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Junbin Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qingyun Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yicong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xun Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+A">Angela Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07411v1-abstract-short" style="display: inline;"> We introduce EgoTextVQA, a novel and rigorously constructed benchmark for egocentric QA assistance involving scene text. EgoTextVQA contains 1.5K ego-view videos and 7K scene-text aware questions that reflect real-user needs in outdoor driving and indoor house-keeping activities. The questions are designed to elicit identification and reasoning on scene text in an egocentric and dynamic environmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07411v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07411v1-abstract-full" style="display: none;"> We introduce EgoTextVQA, a novel and rigorously constructed benchmark for egocentric QA assistance involving scene text. EgoTextVQA contains 1.5K ego-view videos and 7K scene-text aware questions that reflect real-user needs in outdoor driving and indoor house-keeping activities. The questions are designed to elicit identification and reasoning on scene text in an egocentric and dynamic environment. With EgoTextVQA, we comprehensively evaluate 10 prominent multimodal large language models. Currently, all models struggle, and the best results (Gemini 1.5 Pro) are around 33% accuracy, highlighting the severe deficiency of these techniques in egocentric QA assistance. Our further investigations suggest that precise temporal grounding and multi-frame reasoning, along with high resolution and auxiliary scene-text inputs, are key for better performance. With thorough analyses and heuristic suggestions, we hope EgoTextVQA can serve as a solid testbed for research in egocentric scene-text QA assistance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07411v1-abstract-full').style.display = 'none'; document.getElementById('2502.07411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07259">arXiv:2502.07259</a> <span> [<a href="https://arxiv.org/pdf/2502.07259">pdf</a>, <a href="https://arxiv.org/format/2502.07259">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Solar and Stellar Astrophysics">astro-ph.SR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3847/1538-4357/adadff">10.3847/1538-4357/adadff <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Flat U-Net: An Efficient Ultralightweight Model for Solar Filament Segmentation in Full-disk H$伪$ Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+G">GaoFei Zhu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+G">GangHua Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+C">Cheng Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07259v1-abstract-short" style="display: inline;"> Solar filaments are one of the most prominent features observed on the Sun, and their evolutions are closely related to various solar activities, such as flares and coronal mass ejections. Real-time automated identification of solar filaments is the most effective approach to managing large volumes of data. Existing models of filament identification are characterized by large parameter sizes and h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07259v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07259v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07259v1-abstract-full" style="display: none;"> Solar filaments are one of the most prominent features observed on the Sun, and their evolutions are closely related to various solar activities, such as flares and coronal mass ejections. Real-time automated identification of solar filaments is the most effective approach to managing large volumes of data. Existing models of filament identification are characterized by large parameter sizes and high computational costs, which limit their future applications in highly integrated and intelligent ground-based and space-borne observation devices. Consequently, the design of more lightweight models will facilitate the advancement of intelligent observation equipment. In this study, we introduce Flat U-Net, a novel and highly efficient ultralightweight model that incorporates simplified channel attention (SCA) and channel self-attention (CSA) convolutional blocks for the segmentation of solar filaments in full-disk H$伪$ images. Feature information from each network layer is fully extracted to reconstruct interchannel feature representations. Each block effectively optimizes the channel features from the previous layer, significantly reducing parameters. The network architecture presents an elegant flattening, improving its efficiency, and simplifying the overall design. Experimental validation demonstrates that a model composed of pure SCAs achieves a precision of approximately 0.93, with dice similarity coefficient (DSC) and recall rates of 0.76 and 0.64, respectively, significantly outperforming the classical U-Net. Introducing a certain number of CSA blocks improves the DSC and recall rates to 0.82 and 0.74, respectively, which demonstrates a pronounced advantage, particularly concerning model weight size and detection effectiveness. The data set, models, and code are available as open-source resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07259v1-abstract-full').style.display = 'none'; document.getElementById('2502.07259v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 5 figures, 3 tables, accepted for publication in ApJ</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ApJ 980, 176 (2025) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07216">arXiv:2502.07216</a> <span> [<a href="https://arxiv.org/pdf/2502.07216">pdf</a>, <a href="https://arxiv.org/format/2502.07216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681043">10.1145/3664647.3681043 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SparseFormer: Detecting Objects in HRW Shots via Sparse Vision Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenxi Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jilai Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haozhe Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chao Ma</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lu Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07216v1-abstract-short" style="display: inline;"> Recent years have seen an increase in the use of gigapixel-level image and video capture systems and benchmarks with high-resolution wide (HRW) shots. However, unlike close-up shots in the MS COCO dataset, the higher resolution and wider field of view raise unique challenges, such as extreme sparsity and huge scale changes, causing existing close-up detectors inaccuracy and inefficiency. In this p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07216v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07216v1-abstract-full" style="display: none;"> Recent years have seen an increase in the use of gigapixel-level image and video capture systems and benchmarks with high-resolution wide (HRW) shots. However, unlike close-up shots in the MS COCO dataset, the higher resolution and wider field of view raise unique challenges, such as extreme sparsity and huge scale changes, causing existing close-up detectors inaccuracy and inefficiency. In this paper, we present a novel model-agnostic sparse vision transformer, dubbed SparseFormer, to bridge the gap of object detection between close-up and HRW shots. The proposed SparseFormer selectively uses attentive tokens to scrutinize the sparsely distributed windows that may contain objects. In this way, it can jointly explore global and local attention by fusing coarse- and fine-grained features to handle huge scale changes. SparseFormer also benefits from a novel Cross-slice non-maximum suppression (C-NMS) algorithm to precisely localize objects from noisy windows and a simple yet effective multi-scale strategy to improve accuracy. Extensive experiments on two HRW benchmarks, PANDA and DOTA-v1.0, demonstrate that the proposed SparseFormer significantly improves detection accuracy (up to 5.8%) and speed (up to 3x) over the state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07216v1-abstract-full').style.display = 'none'; document.getElementById('2502.07216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted to ACM MM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06816">arXiv:2502.06816</a> <span> [<a href="https://arxiv.org/pdf/2502.06816">pdf</a>, <a href="https://arxiv.org/format/2502.06816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepCell: Multiview Representation Learning for Post-Mapping Netlists </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhengyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Ziyang Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Lingfeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+H">Hongyang Pan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wentao Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyan Yang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+Z">Zhufei Chu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06816v1-abstract-short" style="display: inline;"> Representation learning for post-mapping (PM) netlists is a critical challenge in Electronic Design Automation (EDA), driven by the diverse and complex nature of modern circuit designs. Existing approaches focus on intermediate representations like And-Inverter Graphs (AIGs), limiting their applicability to post-synthesis stages. We introduce DeepCell, a multiview representation learning framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06816v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06816v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06816v1-abstract-full" style="display: none;"> Representation learning for post-mapping (PM) netlists is a critical challenge in Electronic Design Automation (EDA), driven by the diverse and complex nature of modern circuit designs. Existing approaches focus on intermediate representations like And-Inverter Graphs (AIGs), limiting their applicability to post-synthesis stages. We introduce DeepCell, a multiview representation learning framework that integrates structural and functional insights from both PM netlists and AIGs to learn rich, generalizable embeddings. At its core, DeepCell employs the novel Mask Circuit Modeling (MCM) mechanism, which refines PM netlist representations in a self-supervised manner using pretrained AIG encoders. DeepCell sets a new benchmark in PM netlist representation, outperforming existing methods in predictive accuracy and reconstruction fidelity. To validate its efficacy, we apply DeepCell to functional Engineering Change Orders (ECO), achieving significant reductions in patch generation costs and runtime while improving patch quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06816v1-abstract-full').style.display = 'none'; document.getElementById('2502.06816v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06572">arXiv:2502.06572</a> <span> [<a href="https://arxiv.org/pdf/2502.06572">pdf</a>, <a href="https://arxiv.org/format/2502.06572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LawGPT: Knowledge-Guided Data Generation and Its Application to Legal LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhi Zhou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kun-Yang Yu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shi-Yu Tian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao-Wen Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiang-Xin Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+P">Pengxiao Song</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yi-Xuan Jin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lan-Zhe Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu-Feng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06572v2-abstract-short" style="display: inline;"> Large language models (LLMs), both proprietary and open-source, have demonstrated remarkable capabilities across various natural language processing tasks. However, they face significant limitations in legal reasoning tasks. Proprietary models introduce data privacy risks and high inference costs, while open-source models underperform due to insufficient legal domain training data. To address thes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06572v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06572v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06572v2-abstract-full" style="display: none;"> Large language models (LLMs), both proprietary and open-source, have demonstrated remarkable capabilities across various natural language processing tasks. However, they face significant limitations in legal reasoning tasks. Proprietary models introduce data privacy risks and high inference costs, while open-source models underperform due to insufficient legal domain training data. To address these limitations, we study data generation for legal reasoning to improve the legal reasoning performance of open-source LLMs with the help of proprietary LLMs. This is challenging due to the lack of legal knowledge in proprietary LLMs and the difficulty in verifying the generated data. We propose KgDG, a knowledge-guided data generation framework for legal reasoning. Our framework enables leveraging legal knowledge to enhance generation diversity and introduces a refinement and verification process to ensure the quality of generated data. Moreover, we expand the generated dataset to further enhance the LLM reasoning capabilities. Using KgDG, we create a synthetic legal reasoning dataset containing 50K high-quality examples. Our trained model LawGPT outperforms existing legal-specific LLMs and achieves performance comparable to proprietary LLMs, demonstrating the effectiveness of KgDG and LawGPT. Our code and resources is publicly available at https://github.com/LAMDASZ-ML/Knowledge-Guide-Data-Generation . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06572v2-abstract-full').style.display = 'none'; document.getElementById('2502.06572v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05772">arXiv:2502.05772</a> <span> [<a href="https://arxiv.org/pdf/2502.05772">pdf</a>, <a href="https://arxiv.org/format/2502.05772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Effective Black-Box Multi-Faceted Attacks Breach Vision Large Language Model Guardrails </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yijun Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lichao Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05772v1-abstract-short" style="display: inline;"> Vision Large Language Models (VLLMs) integrate visual data processing, expanding their real-world applications, but also increasing the risk of generating unsafe responses. In response, leading companies have implemented Multi-Layered safety defenses, including alignment training, safety system prompts, and content moderation. However, their effectiveness against sophisticated adversarial attacks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05772v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05772v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05772v1-abstract-full" style="display: none;"> Vision Large Language Models (VLLMs) integrate visual data processing, expanding their real-world applications, but also increasing the risk of generating unsafe responses. In response, leading companies have implemented Multi-Layered safety defenses, including alignment training, safety system prompts, and content moderation. However, their effectiveness against sophisticated adversarial attacks remains largely unexplored. In this paper, we propose MultiFaceted Attack, a novel attack framework designed to systematically bypass Multi-Layered Defenses in VLLMs. It comprises three complementary attack facets: Visual Attack that exploits the multimodal nature of VLLMs to inject toxic system prompts through images; Alignment Breaking Attack that manipulates the model's alignment mechanism to prioritize the generation of contrasting responses; and Adversarial Signature that deceives content moderators by strategically placing misleading information at the end of the response. Extensive evaluations on eight commercial VLLMs in a black-box setting demonstrate that MultiFaceted Attack achieves a 61.56% attack success rate, surpassing state-of-the-art methods by at least 42.18%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05772v1-abstract-full').style.display = 'none'; document.getElementById('2502.05772v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05612">arXiv:2502.05612</a> <span> [<a href="https://arxiv.org/pdf/2502.05612">pdf</a>, <a href="https://arxiv.org/format/2502.05612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Rambler in the Wild: A Diary Study of LLM-Assisted Writing With Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wengxi Li</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M+G">Matthew G. Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoyang Li</a>, <a href="/search/cs?searchtype=author&query=Zamfirescu-Pereira%2C+J+D">J. D. Zamfirescu-Pereira</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Can Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05612v1-abstract-short" style="display: inline;"> Speech-to-text technologies have been shown to improve text input efficiency and potentially lower the barriers to writing. Recent LLM-assisted dictation tools aim to support writing with speech by bridging the gaps between speaking and traditional writing. This case study reports on the real-world writing experiences of twelve academic or creative writers using one such tool, Rambler, to write va… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05612v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05612v1-abstract-full" style="display: none;"> Speech-to-text technologies have been shown to improve text input efficiency and potentially lower the barriers to writing. Recent LLM-assisted dictation tools aim to support writing with speech by bridging the gaps between speaking and traditional writing. This case study reports on the real-world writing experiences of twelve academic or creative writers using one such tool, Rambler, to write various pieces such as blog posts, diaries, screenplays, notes, or fictional stories, etc. Through a ten-day diary study, we identified the participants' in-context writing strategies using Rambler, such as how they expanded from an outline or organized their loose thoughts for different writing goals. The interviews uncovered the psychological and productivity affordances of writing with speech, pointing to future directions of designing for this writing modality and the utilization of AI support. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05612v1-abstract-full').style.display = 'none'; document.getElementById('2502.05612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05431">arXiv:2502.05431</a> <span> [<a href="https://arxiv.org/pdf/2502.05431">pdf</a>, <a href="https://arxiv.org/format/2502.05431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> APE: Faster and Longer Context-Augmented Generation via Adaptive Parallel Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinyu Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianqi Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Beidi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05431v2-abstract-short" style="display: inline;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05431v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05431v2-abstract-full" style="display: none;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding to independently pre-compute and cache each context's KV states. This approach enables the direct loading of cached states during inference while accommodating more contexts through position reuse across contexts. However, due to misalignments in attention distribution, directly applying parallel encoding results in a significant performance drop. To enable effective and efficient CAG, we propose Adaptive Parallel Encoding ($\textbf{APE}$), which brings shared prefix, attention temperature, and scaling factor to align the distribution of parallel encoding with sequential encoding. Results on RAG and ICL tasks demonstrate that APE can preserve 98% and 93% sequential encoding performance using the same inputs while outperforming parallel encoding by 3.6% and 7.9%, respectively. It also scales to many-shot CAG, effectively encoding hundreds of contexts in parallel. Efficiency evaluation shows that APE can achieve an end-to-end 4.5$\times$ speedup by reducing 28$\times$ prefilling time for a 128K-length context. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'none'; document.getElementById('2502.05431v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05236">arXiv:2502.05236</a> <span> [<a href="https://arxiv.org/pdf/2502.05236">pdf</a>, <a href="https://arxiv.org/format/2502.05236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Koel-TTS: Enhancing LLM based Speech Generation with Preference Alignment and Classifier Free Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hussain%2C+S">Shehzeen Hussain</a>, <a href="/search/cs?searchtype=author&query=Neekhara%2C+P">Paarth Neekhara</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuesong Yang</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/cs?searchtype=author&query=Desta%2C+M+T">Mikyas T. Desta</a>, <a href="/search/cs?searchtype=author&query=Fejgin%2C+R">Roy Fejgin</a>, <a href="/search/cs?searchtype=author&query=Valle%2C+R">Rafael Valle</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jason Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05236v1-abstract-short" style="display: inline;"> While autoregressive speech token generation models produce speech with remarkable variety and naturalness, their inherent lack of controllability often results in issues such as hallucinations and undesired vocalizations that do not conform to conditioning inputs. We introduce Koel-TTS, a suite of enhanced encoder-decoder Transformer TTS models that address these challenges by incorporating prefe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05236v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05236v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05236v1-abstract-full" style="display: none;"> While autoregressive speech token generation models produce speech with remarkable variety and naturalness, their inherent lack of controllability often results in issues such as hallucinations and undesired vocalizations that do not conform to conditioning inputs. We introduce Koel-TTS, a suite of enhanced encoder-decoder Transformer TTS models that address these challenges by incorporating preference alignment techniques guided by automatic speech recognition and speaker verification models. Additionally, we incorporate classifier-free guidance to further improve synthesis adherence to the transcript and reference speaker audio. Our experiments demonstrate that these optimizations significantly enhance target speaker similarity, intelligibility, and naturalness of synthesized speech. Notably, Koel-TTS directly maps text and context audio to acoustic tokens, and on the aforementioned metrics, outperforms state-of-the-art TTS models, despite being trained on a significantly smaller dataset. Audio samples and demos are available on our website. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05236v1-abstract-full').style.display = 'none'; document.getElementById('2502.05236v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05174">arXiv:2502.05174</a> <span> [<a href="https://arxiv.org/pdf/2502.05174">pdf</a>, <a href="https://arxiv.org/format/2502.05174">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MELON: Indirect Prompt Injection Defense via Masked Re-execution and Tool Comparison </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kaijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianjun Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jindong Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+W">Wenbo Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05174v1-abstract-short" style="display: inline;"> Recent research has explored that LLM agents are vulnerable to indirect prompt injection (IPI) attacks, where malicious tasks embedded in tool-retrieved information can redirect the agent to take unauthorized actions. Existing defenses against IPI have significant limitations: either require essential model training resources, lack effectiveness against sophisticated attacks, or harm the normal ut… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05174v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05174v1-abstract-full" style="display: none;"> Recent research has explored that LLM agents are vulnerable to indirect prompt injection (IPI) attacks, where malicious tasks embedded in tool-retrieved information can redirect the agent to take unauthorized actions. Existing defenses against IPI have significant limitations: either require essential model training resources, lack effectiveness against sophisticated attacks, or harm the normal utilities. We present MELON (Masked re-Execution and TooL comparisON), a novel IPI defense. Our approach builds on the observation that under a successful attack, the agent's next action becomes less dependent on user tasks and more on malicious tasks. Following this, we design MELON to detect attacks by re-executing the agent's trajectory with a masked user prompt modified through a masking function. We identify an attack if the actions generated in the original and masked executions are similar. We also include three key designs to reduce the potential false positives and false negatives. Extensive evaluation on the IPI benchmark AgentDojo demonstrates that MELON outperforms SOTA defenses in both attack prevention and utility preservation. Moreover, we show that combining MELON with a SOTA prompt augmentation defense (denoted as MELON-Aug) further improves its performance. We also conduct a detailed ablation study to validate our key designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05174v1-abstract-full').style.display = 'none'; document.getElementById('2502.05174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04719">arXiv:2502.04719</a> <span> [<a href="https://arxiv.org/pdf/2502.04719">pdf</a>, <a href="https://arxiv.org/format/2502.04719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Tolerance-Aware Deep Optics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jun Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liqun Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinge Yang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuyao Hu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04719v1-abstract-short" style="display: inline;"> Deep optics has emerged as a promising approach by co-designing optical elements with deep learning algorithms. However, current research typically overlooks the analysis and optimization of manufacturing and assembly tolerances. This oversight creates a significant performance gap between designed and fabricated optical systems. To address this challenge, we present the first end-to-end tolerance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04719v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04719v1-abstract-full" style="display: none;"> Deep optics has emerged as a promising approach by co-designing optical elements with deep learning algorithms. However, current research typically overlooks the analysis and optimization of manufacturing and assembly tolerances. This oversight creates a significant performance gap between designed and fabricated optical systems. To address this challenge, we present the first end-to-end tolerance-aware optimization framework that incorporates multiple tolerance types into the deep optics design pipeline. Our method combines physics-informed modelling with data-driven training to enhance optical design by accounting for and compensating for structural deviations in manufacturing and assembly. We validate our approach through computational imaging applications, demonstrating results in both simulations and real-world experiments. We further examine how our proposed solution improves the robustness of optical systems and vision algorithms against tolerances through qualitative and quantitative analyses. Code and additional visual results are available at openimaginglab.github.io/LensTolerance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04719v1-abstract-full').style.display = 'none'; document.getElementById('2502.04719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04404">arXiv:2502.04404</a> <span> [<a href="https://arxiv.org/pdf/2502.04404">pdf</a>, <a href="https://arxiv.org/format/2502.04404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Step Back to Leap Forward: Self-Backtracking for Boosting Reasoning of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao-Wen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xuan-Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+W">Wen-Da Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Ding-Chu Zhang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jie-Jing Shao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhi Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lan-Zhe Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu-Feng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04404v1-abstract-short" style="display: inline;"> The integration of slow-thinking mechanisms into large language models (LLMs) offers a promising way toward achieving Level 2 AGI Reasoners, as exemplified by systems like OpenAI's o1. However, several significant challenges remain, including inefficient overthinking and an overreliance on auxiliary reward models. We point out that these limitations stem from LLMs' inability to internalize the sea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04404v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04404v1-abstract-full" style="display: none;"> The integration of slow-thinking mechanisms into large language models (LLMs) offers a promising way toward achieving Level 2 AGI Reasoners, as exemplified by systems like OpenAI's o1. However, several significant challenges remain, including inefficient overthinking and an overreliance on auxiliary reward models. We point out that these limitations stem from LLMs' inability to internalize the search process, a key component of effective reasoning. A critical step toward addressing this issue is enabling LLMs to autonomously determine when and where to backtrack, a fundamental operation in traditional search algorithms. To this end, we propose a self-backtracking mechanism that equips LLMs with the ability to backtrack during both training and inference. This mechanism not only enhances reasoning ability but also efficiency by transforming slow-thinking processes into fast-thinking through self-improvement. Empirical evaluations demonstrate that our proposal significantly enhances the reasoning capabilities of LLMs, achieving a performance gain of over 40 percent compared to the optimal-path supervised fine-tuning method. We believe this study introduces a novel and promising pathway for developing more advanced and robust Reasoners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04404v1-abstract-full').style.display = 'none'; document.getElementById('2502.04404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is a preprint under review, 15 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04268">arXiv:2502.04268</a> <span> [<a href="https://arxiv.org/pdf/2502.04268">pdf</a>, <a href="https://arxiv.org/format/2502.04268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Point2RBox-v2: Rethinking Point-supervised Oriented Object Detection with Spatial Layout Among Instances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+B">Botao Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingxin Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Junwei Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Da%2C+F">Feipeng Da</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04268v2-abstract-short" style="display: inline;"> With the rapidly increasing demand for oriented object detection (OOD), recent research involving weakly-supervised detectors for learning OOD from point annotations has gained great attention. In this paper, we rethink this challenging task setting with the layout among instances and present Point2RBox-v2. At the core are three principles: 1) Gaussian overlap loss. It learns an upper bound for ea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04268v2-abstract-full').style.display = 'inline'; document.getElementById('2502.04268v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04268v2-abstract-full" style="display: none;"> With the rapidly increasing demand for oriented object detection (OOD), recent research involving weakly-supervised detectors for learning OOD from point annotations has gained great attention. In this paper, we rethink this challenging task setting with the layout among instances and present Point2RBox-v2. At the core are three principles: 1) Gaussian overlap loss. It learns an upper bound for each instance by treating objects as 2D Gaussian distributions and minimizing their overlap. 2) Voronoi watershed loss. It learns a lower bound for each instance through watershed on Voronoi tessellation. 3) Consistency loss. It learns the size/rotation variation between two output sets with respect to an input image and its augmented view. Supplemented by a few devised techniques, e.g. edge loss and copy-paste, the detector is further enhanced. To our best knowledge, Point2RBox-v2 is the first approach to explore the spatial layout among instances for learning point-supervised OOD. Our solution is elegant and lightweight, yet it is expected to give a competitive performance especially in densely packed scenes: 62.61%/86.15%/34.71% on DOTA/HRSC/FAIR1M. Code is available at https://github.com/VisionXLab/point2rbox-v2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04268v2-abstract-full').style.display = 'none'; document.getElementById('2502.04268v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures, 10 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03544">arXiv:2502.03544</a> <span> [<a href="https://arxiv.org/pdf/2502.03544">pdf</a>, <a href="https://arxiv.org/format/2502.03544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Gold-medalist Performance in Solving Olympiad Geometry with AlphaGeometry2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chervonyi%2C+Y">Yuri Chervonyi</a>, <a href="/search/cs?searchtype=author&query=Trinh%2C+T+H">Trieu H. Trinh</a>, <a href="/search/cs?searchtype=author&query=Ol%C5%A1%C3%A1k%2C+M">Miroslav Ol拧谩k</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+H">Hoang Nguyen</a>, <a href="/search/cs?searchtype=author&query=Menegali%2C+M">Marcelo Menegali</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+J">Junehyuk Jung</a>, <a href="/search/cs?searchtype=author&query=Verma%2C+V">Vikas Verma</a>, <a href="/search/cs?searchtype=author&query=Le%2C+Q+V">Quoc V. Le</a>, <a href="/search/cs?searchtype=author&query=Luong%2C+T">Thang Luong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03544v1-abstract-short" style="display: inline;"> We present AlphaGeometry2, a significantly improved version of AlphaGeometry introduced in Trinh et al. (2024), which has now surpassed an average gold medalist in solving Olympiad geometry problems. To achieve this, we first extend the original AlphaGeometry language to tackle harder problems involving movements of objects, and problems containing linear equations of angles, ratios, and distances… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03544v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03544v1-abstract-full" style="display: none;"> We present AlphaGeometry2, a significantly improved version of AlphaGeometry introduced in Trinh et al. (2024), which has now surpassed an average gold medalist in solving Olympiad geometry problems. To achieve this, we first extend the original AlphaGeometry language to tackle harder problems involving movements of objects, and problems containing linear equations of angles, ratios, and distances. This, together with other additions, has markedly improved the coverage rate of the AlphaGeometry language on International Math Olympiads (IMO) 2000-2024 geometry problems from 66% to 88%. The search process of AlphaGeometry2 has also been greatly improved through the use of Gemini architecture for better language modeling, and a novel knowledge-sharing mechanism that combines multiple search trees. Together with further enhancements to the symbolic engine and synthetic data generation, we have significantly boosted the overall solving rate of AlphaGeometry2 to 84% for $\textit{all}$ geometry problems over the last 25 years, compared to 54% previously. AlphaGeometry2 was also part of the system that achieved silver-medal standard at IMO 2024 https://dpmd.ai/imo-silver. Last but not least, we report progress towards using AlphaGeometry2 as a part of a fully automated system that reliably solves geometry problems directly from natural language input. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03544v1-abstract-full').style.display = 'none'; document.getElementById('2502.03544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02384">arXiv:2502.02384</a> <span> [<a href="https://arxiv.org/pdf/2502.02384">pdf</a>, <a href="https://arxiv.org/format/2502.02384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STAIR: Improving Safety Alignment with Introspective Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yao Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Zeyu Xia</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhengwei Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+D">Dong Yan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02384v1-abstract-short" style="display: inline;"> Ensuring the safety and harmlessness of Large Language Models (LLMs) has become equally critical as their performance in applications. However, existing safety alignment methods typically suffer from safety-performance trade-offs and the susceptibility to jailbreak attacks, primarily due to their reliance on direct refusals for malicious queries. In this paper, we propose STAIR, a novel framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02384v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02384v1-abstract-full" style="display: none;"> Ensuring the safety and harmlessness of Large Language Models (LLMs) has become equally critical as their performance in applications. However, existing safety alignment methods typically suffer from safety-performance trade-offs and the susceptibility to jailbreak attacks, primarily due to their reliance on direct refusals for malicious queries. In this paper, we propose STAIR, a novel framework that integrates SafeTy Alignment with Itrospective Reasoning. We enable LLMs to identify safety risks through step-by-step analysis by self-improving chain-of-thought (CoT) reasoning with safety awareness. STAIR first equips the model with a structured reasoning capability and then advances safety alignment via iterative preference optimization on step-level reasoning data generated using our newly proposed Safety-Informed Monte Carlo Tree Search (SI-MCTS). We further train a process reward model on this data to guide test-time searches for improved responses. Extensive experiments show that STAIR effectively mitigates harmful outputs while better preserving helpfulness, compared to instinctive alignment strategies. With test-time scaling, STAIR achieves a safety performance comparable to Claude-3.5 against popular jailbreak attacks. Relevant resources in this work are available at https://github.com/thu-ml/STAIR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02384v1-abstract-full').style.display = 'none'; document.getElementById('2502.02384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02225">arXiv:2502.02225</a> <span> [<a href="https://arxiv.org/pdf/2502.02225">pdf</a>, <a href="https://arxiv.org/format/2502.02225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Exploring the latent space of diffusion models directly through singular value decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Li Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Boyan Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanran Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhao Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaosong Yang</a>, <a href="/search/cs?searchtype=author&query=Clifton%2C+D+A">David A. Clifton</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02225v1-abstract-short" style="display: inline;"> Despite the groundbreaking success of diffusion models in generating high-fidelity images, their latent space remains relatively under-explored, even though it holds significant promise for enabling versatile and interpretable image editing capabilities. The complicated denoising trajectory and high dimensionality of the latent space make it extremely challenging to interpret. Existing methods mai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02225v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02225v1-abstract-full" style="display: none;"> Despite the groundbreaking success of diffusion models in generating high-fidelity images, their latent space remains relatively under-explored, even though it holds significant promise for enabling versatile and interpretable image editing capabilities. The complicated denoising trajectory and high dimensionality of the latent space make it extremely challenging to interpret. Existing methods mainly explore the feature space of U-Net in Diffusion Models (DMs) instead of the latent space itself. In contrast, we directly investigate the latent space via Singular Value Decomposition (SVD) and discover three useful properties that can be used to control generation results without the requirements of data collection and maintain identity fidelity generated images. Based on these properties, we propose a novel image editing framework that is capable of learning arbitrary attributes from one pair of latent codes destined by text prompts in Stable Diffusion Models. To validate our approach, extensive experiments are conducted to demonstrate its effectiveness and flexibility in image editing. We will release our codes soon to foster further research and applications in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02225v1-abstract-full').style.display = 'none'; document.getElementById('2502.02225v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02088">arXiv:2502.02088</a> <span> [<a href="https://arxiv.org/pdf/2502.02088">pdf</a>, <a href="https://arxiv.org/format/2502.02088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IPO: Iterative Preference Optimization for Text-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyu Tan</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+X">Xuecheng Nie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02088v2-abstract-short" style="display: inline;"> Video foundation models have achieved significant advancement with the help of network upgrade as well as model scale-up. However, they are still hard to meet requirements of applications due to unsatisfied generation quality. To solve this problem, we propose to align video foundation models with human preferences from the perspective of post-training in this paper. Consequently, we introduce an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02088v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02088v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02088v2-abstract-full" style="display: none;"> Video foundation models have achieved significant advancement with the help of network upgrade as well as model scale-up. However, they are still hard to meet requirements of applications due to unsatisfied generation quality. To solve this problem, we propose to align video foundation models with human preferences from the perspective of post-training in this paper. Consequently, we introduce an Iterative Preference Optimization strategy to enhance generated video quality by incorporating human feedback. Specifically, IPO exploits a critic model to justify video generations for pairwise ranking as in Direct Preference Optimization or point-wise scoring as in Kahneman-Tversky Optimization. Given this, IPO optimizes video foundation models with guidance of signals from preference feedback, which helps improve generated video quality in subject consistency, motion smoothness and aesthetic quality, etc. In addition, IPO incorporates the critic model with the multi-modality large language model, which enables it to automatically assign preference labels without need of retraining or relabeling. In this way, IPO can efficiently perform multi-round preference optimization in an iterative manner, without the need of tediously manual labeling. Comprehensive experiments demonstrate that the proposed IPO can effectively improve the video generation quality of a pretrained model and help a model with only 2B parameters surpass the one with 5B parameters. Besides, IPO achieves new state-of-the-art performance on VBench benchmark. We will release our source codes, models as well as dataset to advance future research and applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02088v2-abstract-full').style.display = 'none'; document.getElementById('2502.02088v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01986">arXiv:2502.01986</a> <span> [<a href="https://arxiv.org/pdf/2502.01986">pdf</a>, <a href="https://arxiv.org/format/2502.01986">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> DCT-Mamba3D: Spectral Decorrelation and Spatial-Spectral Feature Extraction for Hyperspectral Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+W">Weijia Cao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofei Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yicong Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01986v1-abstract-short" style="display: inline;"> Hyperspectral image classification presents challenges due to spectral redundancy and complex spatial-spectral dependencies. This paper proposes a novel framework, DCT-Mamba3D, for hyperspectral image classification. DCT-Mamba3D incorporates: (1) a 3D spectral-spatial decorrelation module that applies 3D discrete cosine transform basis functions to reduce both spectral and spatial redundancy, enha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01986v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01986v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01986v1-abstract-full" style="display: none;"> Hyperspectral image classification presents challenges due to spectral redundancy and complex spatial-spectral dependencies. This paper proposes a novel framework, DCT-Mamba3D, for hyperspectral image classification. DCT-Mamba3D incorporates: (1) a 3D spectral-spatial decorrelation module that applies 3D discrete cosine transform basis functions to reduce both spectral and spatial redundancy, enhancing feature clarity across dimensions; (2) a 3D-Mamba module that leverages a bidirectional state-space model to capture intricate spatial-spectral dependencies; and (3) a global residual enhancement module that stabilizes feature representation, improving robustness and convergence. Extensive experiments on benchmark datasets show that our DCT-Mamba3D outperforms the state-of-the-art methods in challenging scenarios such as the same object in different spectra and different objects in the same spectra. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01986v1-abstract-full').style.display = 'none'; document.getElementById('2502.01986v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01662">arXiv:2502.01662</a> <span> [<a href="https://arxiv.org/pdf/2502.01662">pdf</a>, <a href="https://arxiv.org/format/2502.01662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Speculative Ensemble: Fast Large Language Model Ensemble via Speculation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jiale Fu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuchu Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junkai Chen</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiaming Fan</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xin Geng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01662v1-abstract-short" style="display: inline;"> Ensemble methods enhance Large Language Models (LLMs) by combining multiple models but suffer from high computational costs. In this paper, we introduce Speculative Ensemble, a novel framework that accelerates LLM ensembles without sacrificing performance, inspired by Speculative Decoding-where a small proposal model generates tokens sequentially, and a larger target model verifies them in paralle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01662v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01662v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01662v1-abstract-full" style="display: none;"> Ensemble methods enhance Large Language Models (LLMs) by combining multiple models but suffer from high computational costs. In this paper, we introduce Speculative Ensemble, a novel framework that accelerates LLM ensembles without sacrificing performance, inspired by Speculative Decoding-where a small proposal model generates tokens sequentially, and a larger target model verifies them in parallel. Our approach builds on two key insights: (1) the verification distribution can be the ensemble distribution of both the proposal and target models, and (2) alternating each model as the proposer and verifier can further enhance efficiency. We generalize this method to ensembles with n models and theoretically prove that SE is never slower than a standard ensemble, typically achieving faster speed. Extensive experiments demonstrate speed improvements of 1.11x-2.23x over standard ensemble techniques without compromising generation quality. Our code is available at https://github.com/Kamichanw/Speculative-Ensemble/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01662v1-abstract-full').style.display = 'none'; document.getElementById('2502.01662v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01655">arXiv:2502.01655</a> <span> [<a href="https://arxiv.org/pdf/2502.01655">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> A binary PSO based ensemble under-sampling model for rebalancing imbalanced training data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinyan Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yaoyang Wu</a>, <a href="/search/cs?searchtype=author&query=Fong%2C+S">Simon Fong</a>, <a href="/search/cs?searchtype=author&query=Tall%C3%B3n-Ballesteros%2C+A+J">Antonio J. Tall贸n-Ballesteros</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin-she Yang</a>, <a href="/search/cs?searchtype=author&query=Mohammed%2C+S">Sabah Mohammed</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01655v1-abstract-short" style="display: inline;"> Ensemble technique and under-sampling technique are both effective tools used for imbalanced dataset classification problems. In this paper, a novel ensemble method combining the advantages of both ensemble learning for biasing classifiers and a new under-sampling method is proposed. The under-sampling method is named Binary PSO instance selection; it gathers with ensemble classifiers to find the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01655v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01655v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01655v1-abstract-full" style="display: none;"> Ensemble technique and under-sampling technique are both effective tools used for imbalanced dataset classification problems. In this paper, a novel ensemble method combining the advantages of both ensemble learning for biasing classifiers and a new under-sampling method is proposed. The under-sampling method is named Binary PSO instance selection; it gathers with ensemble classifiers to find the most suitable length and combination of the majority class samples to build a new dataset with minority class samples. The proposed method adopts multi-objective strategy, and contribution of this method is a notable improvement of the performances of imbalanced classification, and in the meantime guaranteeing a best integrity possible for the original dataset. We experimented the proposed method and compared its performance of processing imbalanced datasets with several other conventional basic ensemble methods. Experiment is also conducted on these imbalanced datasets using an improved version where ensemble classifiers are wrapped in the Binary PSO instance selection. According to experimental results, our proposed methods outperform single ensemble methods, state-of-the-art under-sampling methods, and also combinations of these methods with the traditional PSO instance selection algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01655v1-abstract-full').style.display = 'none'; document.getElementById('2502.01655v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 18 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Li, J., Wu, Y., Fong, S., Tall贸n-Ballesteros, A. J., Yang, X. S., Mohammed, S., & Wu, F. (2022). A binary PSO-based ensemble under-sampling model for rebalancing imbalanced training data. The Journal of Supercomputing, 1-36 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01411">arXiv:2502.01411</a> <span> [<a href="https://arxiv.org/pdf/2502.01411">pdf</a>, <a href="https://arxiv.org/format/2502.01411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Human Body Restoration with One-Step Diffusion Model and A New Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+J">Jue Gong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zheng Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xing Liu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hong Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01411v1-abstract-short" style="display: inline;"> Human body restoration, as a specific application of image restoration, is widely applied in practice and plays a vital role across diverse fields. However, thorough research remains difficult, particularly due to the lack of benchmark datasets. In this study, we propose a high-quality dataset automated cropping and filtering (HQ-ACF) pipeline. This pipeline leverages existing object detection dat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01411v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01411v1-abstract-full" style="display: none;"> Human body restoration, as a specific application of image restoration, is widely applied in practice and plays a vital role across diverse fields. However, thorough research remains difficult, particularly due to the lack of benchmark datasets. In this study, we propose a high-quality dataset automated cropping and filtering (HQ-ACF) pipeline. This pipeline leverages existing object detection datasets and other unlabeled images to automatically crop and filter high-quality human images. Using this pipeline, we constructed a person-based restoration with sophisticated objects and natural activities (\emph{PERSONA}) dataset, which includes training, validation, and test sets. The dataset significantly surpasses other human-related datasets in both quality and content richness. Finally, we propose \emph{OSDHuman}, a novel one-step diffusion model for human body restoration. Specifically, we propose a high-fidelity image embedder (HFIE) as the prompt generator to better guide the model with low-quality human image information, effectively avoiding misleading prompts. Experimental results show that OSDHuman outperforms existing methods in both visual quality and quantitative metrics. The dataset and code will at https://github.com/gobunu/OSDHuman. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01411v1-abstract-full').style.display = 'none'; document.getElementById('2502.01411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures. The code and model will be available at https://github.com/gobunu/OSDHuman</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01403">arXiv:2502.01403</a> <span> [<a href="https://arxiv.org/pdf/2502.01403">pdf</a>, <a href="https://arxiv.org/format/2502.01403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AdaSVD: Adaptive Singular Value Decomposition for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiteng Li</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Mingyuan Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+Z">Zheng Hui</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Linghe Kong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01403v2-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved remarkable success in natural language processing (NLP) tasks, yet their substantial memory requirements present significant challenges for deployment on resource-constrained devices. Singular Value Decomposition (SVD) has emerged as a promising compression technique for LLMs, offering considerable reductions in memory overhead. However, existing SVD-base… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01403v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01403v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01403v2-abstract-full" style="display: none;"> Large language models (LLMs) have achieved remarkable success in natural language processing (NLP) tasks, yet their substantial memory requirements present significant challenges for deployment on resource-constrained devices. Singular Value Decomposition (SVD) has emerged as a promising compression technique for LLMs, offering considerable reductions in memory overhead. However, existing SVD-based methods often struggle to effectively mitigate the errors introduced by SVD truncation, leading to a noticeable performance gap when compared to the original models. Furthermore, applying a uniform compression ratio across all transformer layers fails to account for the varying importance of different layers. To address these challenges, we propose AdaSVD, an adaptive SVD-based LLM compression approach. Specifically, AdaSVD introduces adaComp, which adaptively compensates for SVD truncation errors by alternately updating the singular matrices U and V^T. Additionally, AdaSVD introduces adaCR, which adaptively assigns layer-specific compression ratios based on the relative importance of each layer. Extensive experiments across multiple LLM families and evaluation metrics demonstrate that AdaSVD consistently outperforms state-of-the-art (SOTA) SVD-based methods, achieving superior performance with significantly reduced memory requirements. The code and models will be available at https://github.com/ZHITENGLI/AdaSVD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01403v2-abstract-full').style.display = 'none'; document.getElementById('2502.01403v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code and models will be available at https://github.com/ZHITENGLI/AdaSVD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00858">arXiv:2502.00858</a> <span> [<a href="https://arxiv.org/pdf/2502.00858">pdf</a>, <a href="https://arxiv.org/format/2502.00858">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Learning to Plan with Personalized Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Manjie Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinyi Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wei Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yixin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00858v1-abstract-short" style="display: inline;"> Effective integration of AI agents into daily life requires them to understand and adapt to individual human preferences, particularly in collaborative roles. Although recent studies on embodied intelligence have advanced significantly, they typically adopt generalized approaches that overlook personal preferences in planning. We address this limitation by developing agents that not only learn pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00858v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00858v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00858v1-abstract-full" style="display: none;"> Effective integration of AI agents into daily life requires them to understand and adapt to individual human preferences, particularly in collaborative roles. Although recent studies on embodied intelligence have advanced significantly, they typically adopt generalized approaches that overlook personal preferences in planning. We address this limitation by developing agents that not only learn preferences from few demonstrations but also learn to adapt their planning strategies based on these preferences. Our research leverages the observation that preferences, though implicitly expressed through minimal demonstrations, can generalize across diverse planning scenarios. To systematically evaluate this hypothesis, we introduce Preference-based Planning (PbP) benchmark, an embodied benchmark featuring hundreds of diverse preferences spanning from atomic actions to complex sequences. Our evaluation of SOTA methods reveals that while symbol-based approaches show promise in scalability, significant challenges remain in learning to generate and execute plans that satisfy personalized preferences. We further demonstrate that incorporating learned preferences as intermediate representations in planning significantly improves the agent's ability to construct personalized plans. These findings establish preferences as a valuable abstraction layer for adaptive planning, opening new directions for research in preference-guided plan generation and execution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00858v1-abstract-full').style.display = 'none'; document.getElementById('2502.00858v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00847">arXiv:2502.00847</a> <span> [<a href="https://arxiv.org/pdf/2502.00847">pdf</a>, <a href="https://arxiv.org/format/2502.00847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SecPE: Secure Prompt Ensembling for Private and Robust Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejia Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+J">Jian Lou</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mingli Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaohu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00847v1-abstract-short" style="display: inline;"> With the growing popularity of LLMs among the general public users, privacy-preserving and adversarial robustness have become two pressing demands for LLM-based services, which have largely been pursued separately but rarely jointly. In this paper, to the best of our knowledge, we are among the first attempts towards robust and private LLM inference by tightly integrating two disconnected fields:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00847v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00847v1-abstract-full" style="display: none;"> With the growing popularity of LLMs among the general public users, privacy-preserving and adversarial robustness have become two pressing demands for LLM-based services, which have largely been pursued separately but rarely jointly. In this paper, to the best of our knowledge, we are among the first attempts towards robust and private LLM inference by tightly integrating two disconnected fields: private inference and prompt ensembling. The former protects users' privacy by encrypting inference data transmitted and processed by LLMs, while the latter enhances adversarial robustness by yielding an aggregated output from multiple prompted LLM responses. Although widely recognized as effective individually, private inference for prompt ensembling together entails new challenges that render the naive combination of existing techniques inefficient. To overcome the hurdles, we propose SecPE, which designs efficient fully homomorphic encryption (FHE) counterparts for the core algorithmic building blocks of prompt ensembling. We conduct extensive experiments on 8 tasks to evaluate the accuracy, robustness, and efficiency of SecPE. The results show that SecPE maintains high clean accuracy and offers better robustness at the expense of merely $2.5\%$ efficiency overhead compared to baseline private inference methods, indicating a satisfactory ``accuracy-robustness-efficiency'' tradeoff. For the efficiency of the encrypted Argmax operation that incurs major slowdown for prompt ensembling, SecPE is 35.4x faster than the state-of-the-art peers, which can be of independent interest beyond this work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00847v1-abstract-full').style.display = 'none'; document.getElementById('2502.00847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00840">arXiv:2502.00840</a> <span> [<a href="https://arxiv.org/pdf/2502.00840">pdf</a>, <a href="https://arxiv.org/format/2502.00840">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejia Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lipeng He</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+J">Jian Lou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dan Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mingli Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+K">Kui Ren</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaohu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00840v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have showcased remarkable capabilities across various domains. Accompanying the evolving capabilities and expanding deployment scenarios of LLMs, their deployment challenges escalate due to their sheer scale and the advanced yet complex activation designs prevalent in notable model series, such as Llama, Gemma, and Mistral. These challenges have become particularly pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00840v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00840v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00840v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have showcased remarkable capabilities across various domains. Accompanying the evolving capabilities and expanding deployment scenarios of LLMs, their deployment challenges escalate due to their sheer scale and the advanced yet complex activation designs prevalent in notable model series, such as Llama, Gemma, and Mistral. These challenges have become particularly pronounced in resource-constrained deployment scenarios, where mitigating inference efficiency bottlenecks is imperative. Among various recent efforts, activation approximation has emerged as a promising avenue for pursuing inference efficiency, sometimes considered indispensable in applications such as private inference. Despite achieving substantial speedups with minimal impact on utility, even appearing sound and practical for real-world deployment, the safety implications of activation approximations remain unclear. In this work, we fill this critical gap in LLM safety by conducting the first systematic safety evaluation of activation approximations. Our safety vetting spans seven sota techniques across three popular categories, revealing consistent safety degradation across ten safety-aligned LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00840v1-abstract-full').style.display = 'none'; document.getElementById('2502.00840v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00663">arXiv:2502.00663</a> <span> [<a href="https://arxiv.org/pdf/2502.00663">pdf</a>, <a href="https://arxiv.org/format/2502.00663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Convolutional Neural Networks for Improved Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoran Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shuhan Yu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenxi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00663v1-abstract-short" style="display: inline;"> Image classification is a fundamental task in computer vision with diverse applications, ranging from autonomous systems to medical imaging. The CIFAR-10 dataset is a widely used benchmark to evaluate the performance of classification models on small-scale, multi-class datasets. Convolutional Neural Networks (CNNs) have demonstrated state-of-the-art results; however, they often suffer from overfit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00663v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00663v1-abstract-full" style="display: none;"> Image classification is a fundamental task in computer vision with diverse applications, ranging from autonomous systems to medical imaging. The CIFAR-10 dataset is a widely used benchmark to evaluate the performance of classification models on small-scale, multi-class datasets. Convolutional Neural Networks (CNNs) have demonstrated state-of-the-art results; however, they often suffer from overfitting and suboptimal feature representation when applied to challenging datasets like CIFAR-10. In this paper, we propose an enhanced CNN architecture that integrates deeper convolutional blocks, batch normalization, and dropout regularization to achieve superior performance. The proposed model achieves a test accuracy of 84.95%, outperforming baseline CNN architectures. Through detailed ablation studies, we demonstrate the effectiveness of the enhancements and analyze the hierarchical feature representations. This work highlights the potential of refined CNN architectures for tackling small-scale image classification problems effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00663v1-abstract-full').style.display = 'none'; document.getElementById('2502.00663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00262">arXiv:2502.00262</a> <span> [<a href="https://arxiv.org/pdf/2502.00262">pdf</a>, <a href="https://arxiv.org/format/2502.00262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> INSIGHT: Enhancing Autonomous Driving Safety through Vision-Language Models on Context-Aware Hazard Detection and Edge Case Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dianwei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuchen Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X+T">Xianfeng Terry Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00262v2-abstract-short" style="display: inline;"> Autonomous driving systems face significant challenges in handling unpredictable edge-case scenarios, such as adversarial pedestrian movements, dangerous vehicle maneuvers, and sudden environmental changes. Current end-to-end driving models struggle with generalization to these rare events due to limitations in traditional detection and prediction approaches. To address this, we propose INSIGHT (I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00262v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00262v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00262v2-abstract-full" style="display: none;"> Autonomous driving systems face significant challenges in handling unpredictable edge-case scenarios, such as adversarial pedestrian movements, dangerous vehicle maneuvers, and sudden environmental changes. Current end-to-end driving models struggle with generalization to these rare events due to limitations in traditional detection and prediction approaches. To address this, we propose INSIGHT (Integration of Semantic and Visual Inputs for Generalized Hazard Tracking), a hierarchical vision-language model (VLM) framework designed to enhance hazard detection and edge-case evaluation. By using multimodal data fusion, our approach integrates semantic and visual representations, enabling precise interpretation of driving scenarios and accurate forecasting of potential dangers. Through supervised fine-tuning of VLMs, we optimize spatial hazard localization using attention-based mechanisms and coordinate regression techniques. Experimental results on the BDD100K dataset demonstrate a substantial improvement in hazard prediction straightforwardness and accuracy over existing models, achieving a notable increase in generalization performance. This advancement enhances the robustness and safety of autonomous driving systems, ensuring improved situational awareness and potential decision-making in complex real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00262v2-abstract-full').style.display = 'none'; document.getElementById('2502.00262v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository