Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 892 results for author: <span class="mathjax">Liu, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Liu%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Liu, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Liu%2C+P&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Liu, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+P&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18834">arXiv:2502.18834</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18834">pdf</a>, <a href="https://arxiv.org/format/2502.18834">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FinTSB: A Comprehensive and Practical Benchmark for Financial Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yifan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuante Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuxia Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Naiqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+D">Dawei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Changjun Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18834v1-abstract-short" style="display: inline;"> Financial time series (FinTS) record the behavior of human-brain-augmented decision-making, capturing valuable 
historical information that can be leveraged for profitable investment strategies. Not surprisingly, this area has attracted considerable attention from researchers, who have proposed a wide range of methods based on various backbones. However, the evaluation of the area often exhibits th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18834v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18834v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18834v1-abstract-full" style="display: none;"> Financial time series (FinTS) record the behavior of human-brain-augmented decision-making, capturing valuable historical information that can be leveraged for profitable investment strategies. Not surprisingly, this area has attracted considerable attention from researchers, who have proposed a wide range of methods based on various backbones. However, the evaluation of the area often exhibits three systemic limitations: 1. Failure to account for the full spectrum of stock movement patterns observed in dynamic financial markets. (Diversity Gap), 2. The absence of unified assessment protocols undermines the validity of cross-study performance comparisons. (Standardization Deficit), and 3. Neglect of critical market structure factors, resulting in inflated performance metrics that lack practical applicability. (Real-World Mismatch). Addressing these limitations, we propose FinTSB, a comprehensive and practical benchmark for financial time series forecasting (FinTSF). To increase the variety, we categorize movement patterns into four specific parts, tokenize and pre-process the data, and assess the data quality based on some sequence characteristics. To eliminate biases due to different evaluation settings, we standardize the metrics across three dimensions and build a user-friendly, lightweight pipeline incorporating methods from various backbones. To accurately simulate real-world trading scenarios and facilitate practical implementation, we extensively model various regulatory constraints, including transaction fees, among others. Finally, we conduct extensive experiments on FinTSB, highlighting key insights to guide model selection under varying market conditions. Overall, FinTSB provides researchers with a novel and comprehensive platform for improving and evaluating FinTSF methods. The code is available at https://github.com/TongjiFinLab/FinTSBenchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18834v1-abstract-full').style.display = 'none'; document.getElementById('2502.18834v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
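The real-world-mismatch point above is concrete enough to illustrate. The sketch below is a minimal, hypothetical fee-aware backtest (not FinTSB's actual cost model; the function name, fee rate, and signal are illustrative) showing how proportional transaction fees deflate a signal-driven strategy's gross returns:

    import numpy as np

    def backtest_net_of_fees(signal, returns, fee_rate=0.001):
        # Long/flat strategy: hold when signal > 0, pay a proportional fee
        # on every position change. Illustrative only; FinTSB models fees
        # as one of several market-structure/regulatory constraints.
        position = (signal > 0).astype(float)          # 1 = long, 0 = flat
        turnover = np.abs(np.diff(position, prepend=0.0))
        gross = position * returns                     # per-period strategy return
        net = gross - fee_rate * turnover              # net of trading costs
        return gross.sum(), net.sum()

    rng = np.random.default_rng(0)
    rets = rng.normal(0.0005, 0.01, size=250)          # synthetic daily returns
    sig = np.roll(rets, 1)                             # naive momentum signal
    gross, net = backtest_net_of_fees(sig, rets)
    print(f"gross={gross:.4f}  net={net:.4f}")         # fees shrink the net figure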
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18489">arXiv:2502.18489</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18489">pdf</a>, <a href="https://arxiv.org/format/2502.18489">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM4EFFI: Leveraging Large Language Models to Enhance Code Efficiency and Correctness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+T">Tong Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weigang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+T">Tengfei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peiyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+J">Jianwei Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenhai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18489v1-abstract-short" style="display: inline;"> Large Language Models (LLMs), particularly Code LLMs, have demonstrated impressive performance in code generation. Current research primarily focuses on the correctness of generated code, while efficiency remains less explored. Recent works have focused on modifying the initial version of the code to improve its efficiency. However, such refinements are limited by the algorithmic design and overal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18489v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18489v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18489v1-abstract-full" style="display: none;"> Large Language Models (LLMs), particularly Code LLMs, have demonstrated impressive performance in code generation. Current research primarily focuses on the correctness of generated code, while efficiency remains less explored. Recent works have focused on modifying the initial version of the code to improve its efficiency. However, such refinements are limited by the algorithmic design and overall logic of the initial code, resulting in only incremental improvements. In contrast, when human developers write high-quality code, they typically begin by designing several potential solutions at the logical level, evaluating various algorithms and their complexities, and then proceeding to implement and optimize the solution. In this study, we introduce \tool: \uline{L}arge \uline{L}anguage \uline{M}odel for Code \uline{Effi}ciency, a novel framework that enables LLMs to generate code that balances both efficiency and correctness. Specifically, \tool divides the efficiency optimization process into two domains: algorithmic exploration in the logic domain and implementation optimization in the code domain. The correctness of the code is then guaranteed through a synthetic test case refinement process. 
This approach, which prioritizes efficiency before ensuring correctness, offers a new paradigm for efficient code generation. Experiments demonstrate that \tool consistently improves both efficiency and correctness, achieving new state-of-the-art performance in code efficiency benchmarks across various LLM backbones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18489v1-abstract-full').style.display = 'none'; document.getElementById('2502.18489v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18042">arXiv:2502.18042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18042">pdf</a>, <a href="https://arxiv.org/format/2502.18042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VLM-E2E: Enhancing End-to-End Autonomous Driving with Multimodal Driver Attention Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haipeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haichao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jinxin Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18042v1-abstract-short" style="display: inline;"> Human drivers adeptly navigate complex scenarios by utilizing rich attentional semantics, but the current autonomous systems struggle to replicate this ability, as they often lose critical semantic information when converting 2D observations into 3D space. In this sense, it hinders their effective deployment in dynamic and complex environments. Leveraging the superior scene understanding and reaso&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18042v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18042v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18042v1-abstract-full" style="display: none;"> Human drivers adeptly navigate complex scenarios by utilizing rich attentional semantics, but the current autonomous systems struggle to replicate this ability, as they often lose critical semantic information when converting 2D observations into 3D space. In this sense, it hinders their effective deployment in dynamic and complex environments. Leveraging the superior scene understanding and reasoning abilities of Vision-Language Models (VLMs), we propose VLM-E2E, a novel framework that uses the VLMs to enhance training by providing attentional cues. 
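The two-domain workflow described above (explore algorithms first, then gate candidates on synthetic tests and keep the fastest correct implementation) can be sketched without an LLM in the loop. Below, two hand-written candidates stand in for LLM-generated solutions; this is a hypothetical reduction of the pattern, not LLM4EFFI's implementation:

    import timeit

    # Two candidate implementations of the same task, standing in for
    # solutions explored at the "logic" level: an O(n) loop and an O(1)
    # closed form for the sum 1 + 2 + ... + n.
    def candidate_loop(n):
        total = 0
        for i in range(1, n + 1):
            total += i
        return total

    def candidate_formula(n):
        return n * (n + 1) // 2

    def select_fastest_correct(candidates, tests):
        # Correctness gate first, then pick the fastest survivor.
        correct = [f for f in candidates
                   if all(f(x) == y for x, y in tests)]
        return min(correct,
                   key=lambda f: timeit.timeit(lambda: f(10_000), number=200))

    tests = [(0, 0), (1, 1), (10, 55)]   # synthetic test cases act as the gate
    best = select_fastest_correct([candidate_loop, candidate_formula], tests)
    print(best.__name__)                 # candidate_formula wins on speed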
3. arXiv:2502.18042 [cs.CV, cs.AI]
   Title: VLM-E2E: Enhancing End-to-End Autonomous Driving with Multimodal Driver Attention Fusion
   Authors: Pei Liu, Haipeng Liu, Haichao Liu, Xin Liu, Jinxin Ni, Jun Ma
   Abstract: Human drivers adeptly navigate complex scenarios by utilizing rich attentional semantics, but current autonomous systems struggle to replicate this ability, as they often lose critical semantic information when converting 2D observations into 3D space. This loss hinders their effective deployment in dynamic and complex environments. Leveraging the superior scene understanding and reasoning abilities of Vision-Language Models (VLMs), we propose VLM-E2E, a novel framework that uses VLMs to enhance training by providing attentional cues. Our method integrates textual representations into Bird's-Eye-View (BEV) features for semantic supervision, which enables the model to learn richer feature representations that explicitly capture the driver's attentional semantics. By focusing on attentional semantics, VLM-E2E better aligns with human-like driving behavior, which is critical for navigating dynamic and complex environments. Furthermore, we introduce a BEV-Text learnable weighted fusion strategy to address the issue of modality importance imbalance in fusing multimodal information. This approach dynamically balances the contributions of BEV and text features, ensuring that the complementary information from the visual and textual modalities is effectively utilized. By explicitly addressing the imbalance in multimodal fusion, our method facilitates a more holistic and robust representation of driving environments. We evaluate VLM-E2E on the nuScenes dataset and demonstrate its superiority over state-of-the-art approaches, showcasing significant improvements in performance.
   Submitted 25 February, 2025; originally announced February 2025.
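A "learnable weighted fusion" of two modalities lends itself to a compact sketch. The module below is one plausible reading (a learned scalar gate between BEV features and projected text features); the paper's actual architecture and dimensions are not given in the abstract, so treat every name and size here as an assumption:

    import torch
    import torch.nn as nn

    class WeightedBEVTextFusion(nn.Module):
        # Fuses BEV and text features with a learned gate. A guess at the
        # general shape of a learnable weighted fusion, not VLM-E2E's module.
        def __init__(self, bev_dim, text_dim, out_dim):
            super().__init__()
            self.text_proj = nn.Linear(text_dim, bev_dim)   # align modalities
            self.gate = nn.Sequential(nn.Linear(2 * bev_dim, 1), nn.Sigmoid())
            self.out = nn.Linear(bev_dim, out_dim)

        def forward(self, bev_feat, text_feat):
            text_aligned = self.text_proj(text_feat)
            w = self.gate(torch.cat([bev_feat, text_aligned], dim=-1))
            fused = w * bev_feat + (1 - w) * text_aligned   # dynamic balance
            return self.out(fused)

    fusion = WeightedBEVTextFusion(bev_dim=256, text_dim=512, out_dim=256)
    out = fusion(torch.randn(4, 256), torch.randn(4, 512))
    print(out.shape)   # torch.Size([4, 256])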
4. arXiv:2502.12499 [cs.LG, cs.DS]
   DOI: 10.1016/j.jpdc.2025.105053
   Title: GPU Memory Usage Optimization for Backward Propagation in Deep Network Training
   Authors: Ding-Yong Hong, Tzu-Hsien Tsai, Ning Wang, Pangfeng Liu, Jan-Jan Wu
   Abstract: In modern Deep Learning, it has been a trend to design larger Deep Neural Networks (DNNs) for the execution of more complex tasks and better accuracy. On the other hand, Convolutional Neural Networks (CNNs) have become the standard method for most computer vision tasks. However, the memory allocation for the intermediate data in convolution layers can cause severe memory pressure during model training. Many solutions have been proposed to resolve the problem. Besides hardware-dependent solutions, a general methodology, rematerialization, can reduce GPU memory usage by trading computation for memory efficiently. The idea is to select a set of intermediate results during the forward phase as checkpoints, and only save them in memory to reduce memory usage. The backward phase recomputes the intermediate data from the closest checkpoints in memory as needed. This recomputation increases execution time but saves memory by not storing all intermediate results during the forward phase. In this paper, we focus on efficiently finding the optimal checkpoint subset that achieves the least peak memory usage during model training. We first describe the theoretical background of neural network training using mathematical equations, and use these equations to identify all essential data required during both the forward and backward phases to compute the gradient of the model's weights. We then formalize the checkpoint selection problem and propose a dynamic programming algorithm with time complexity O(n^3) for finding the optimal checkpoint subset. With extensive experiments, we formulate a more accurate description of the problem using our theoretical analysis, revise the objective function based on tracing, and propose an O(n)-time algorithm for finding the optimal checkpoint subset.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: To appear in JPDC 2025.
5. arXiv:2502.12102 [cs.AI, cs.ET]
   Title: Relational Norms for Human-AI Cooperation
   Authors: Brian D. Earp, Sebastian Porsdam Mann, Mateo Aboy, Edmond Awad, Monika Betzler, Marietjie Botes, Rachel Calcott, Mina Caraccio, Nick Chater, Mark Coeckelbergh, Mihaela Constantinescu, Hossein Dabbagh, Kate Devlin, Xiaojun Ding, Vilius Dranseika, Jim A. C. Everett, Ruiping Fan, Faisal Feroz, Kathryn B. Francis, Cindy Friedman, Orsolya Friedrich, Iason Gabriel, Ivar Hannikainen, Julie Hellmann, Arasj Khodadade Jahrome, et al. (37 additional authors not shown)
   Abstract: How we should design and interact with social artificial intelligence depends on the socio-relational role the AI is meant to emulate or occupy. In human society, relationships such as teacher-student, parent-child, neighbors, siblings, or employer-employee are governed by specific norms that prescribe or proscribe cooperative functions including hierarchy, care, transaction, and mating. These norms shape our judgments of what is appropriate for each partner. For example, workplace norms may allow a boss to give orders to an employee, but not vice versa, reflecting hierarchical and transactional expectations. As AI agents and chatbots powered by large language models are increasingly designed to serve roles analogous to human positions - such as assistant, mental health provider, tutor, or romantic partner - it is imperative to examine whether and how human relational norms should extend to human-AI interactions. Our analysis explores how differences between AI systems and humans, such as the absence of conscious experience and immunity to fatigue, may affect an AI's capacity to fulfill relationship-specific functions and adhere to corresponding norms. This analysis, which is a collaborative effort by philosophers, psychologists, relationship scientists, ethicists, legal experts, and AI researchers, carries important implications for AI systems design, user behavior, and regulation. While we accept that AI systems can offer significant benefits such as increased availability and consistency in certain socio-relational roles, they also risk fostering unhealthy dependencies or unrealistic expectations that could spill over into human-human relationships. We propose that understanding and thoughtfully shaping (or implementing) suitable human-AI relational norms will be crucial for ensuring that human-AI interactions are ethical, trustworthy, and favorable to human well-being.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: 76 pages, 2 figures.

6. arXiv:2502.11946 [cs.CL, cs.AI, cs.HC, cs.SD, eess.AS]
   Title: Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction
   Authors: Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, Peng Liu, Ruihang Miao, Wang You, Xi Chen, Xuerui Yang, Yechang Huang, Yuxiang Zhang, Zheng Gong, Zixin Zhang, Hongyu Zhou, Jianjian Sun, Brian Li, Chengting Feng, Changyi Wan, Hanpeng Hu, et al. (120 additional authors not shown)
   Abstract: Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: (1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; (2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; (3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; (4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks such as LLaMA Question, it shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio.
   Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
7. arXiv:2502.11886 [cs.LG, cs.AI, cs.CL]
   Title: LIMR: Less is More for RL Scaling
   Authors: Xuefeng Li, Haoyang Zou, Pengfei Liu
   Abstract: In this paper, we ask: what truly determines the effectiveness of RL training data for enhancing language models' reasoning capabilities? While recent advances like o1, Deepseek R1, and Kimi1.5 demonstrate RL's potential, the lack of transparency about training data requirements has hindered systematic progress. Starting directly from base models without distillation, we challenge the assumption that scaling up RL training data inherently improves performance. We demonstrate that a strategically selected subset of just 1,389 samples can outperform the full 8,523-sample dataset. We introduce Learning Impact Measurement (LIM), an automated method to evaluate and prioritize training samples based on their alignment with model learning trajectories, enabling efficient resource utilization and scalable implementation. Our method achieves comparable or even superior performance using only 1,389 samples versus the full 8,523-sample dataset. Notably, while recent data-efficient approaches (e.g., LIMO and s1) show promise with 32B-scale models, we find they significantly underperform at the 7B scale through supervised fine-tuning (SFT). In contrast, our RL-based LIMR achieves 16.7% higher accuracy on AIME24 and outperforms LIMO and s1 by 13.0% and 22.2% on MATH500. These results fundamentally reshape our understanding of RL scaling in LLMs, demonstrating that precise sample selection, rather than data scale, may be the key to unlocking enhanced reasoning capabilities. For reproducible research and future innovation, we are open-sourcing LIMR, including the implementation of LIM, training and evaluation code, curated datasets, and trained models, at https://github.com/GAIR-NLP/LIMR.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: 6 pages.
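The abstract does not give LIM's formula, but "alignment with model learning trajectories" suggests something in the spirit of the sketch below, which scores each sample's reward trace by correlation with the model's average learning curve and keeps the best-aligned slice. The scoring rule and all names are assumptions, not the paper's method:

    import numpy as np

    def lim_scores(sample_rewards):
        # sample_rewards: (n_samples, n_checkpoints) per-sample reward traces
        # across training. Score each sample by how well its trace tracks the
        # average learning trajectory (Pearson correlation). A guess at the
        # spirit of LIM, not its exact definition.
        avg_curve = sample_rewards.mean(axis=0)
        return np.array([np.corrcoef(row, avg_curve)[0, 1]
                         for row in sample_rewards])

    rng = np.random.default_rng(1)
    traces = np.cumsum(rng.normal(0.01, 0.05, size=(1000, 20)), axis=1)
    scores = lim_scores(traces)
    keep = np.argsort(scores)[-163:]   # keep ~16%, echoing 1,389 of 8,523
    print(len(keep), scores[keep].min())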
8. arXiv:2502.09768 [cs.SI, physics.soc-ph]
   DOI: 10.1109/TSMC.2025.3525465
   Title: Complex Network Modelling with Power-law Activating Patterns and Its Evolutionary Dynamics
   Authors: Ziyan Zeng, Minyu Feng, Pengfei Liu, Jurgen Kurths
   Abstract: Complex network theory provides a unifying framework for the study of structured dynamic systems. The current literature emphasizes a widely reported phenomenon of intermittent interaction among network vertices. In this paper, we introduce a complex network model that considers the stochastic switching of individuals between activated and quiescent states at power-law rates, together with the corresponding evolutionary dynamics. Using Markov chain and renewal theory, we discover a homogeneous stationary distribution of activated sizes in the network with power-law activating patterns and infer some statistical characteristics. To better understand the effect of power-law activating patterns, we study the two-person-two-strategy evolutionary game dynamics, demonstrate the absorbability of strategies, and obtain the critical cooperation conditions for prisoner's dilemmas in homogeneous networks without mutation. The evolutionary dynamics in real networks are also discussed. Our results provide a new perspective for analyzing and understanding social physics in time-evolving network systems.
   Submitted 13 February, 2025; originally announced February 2025.
   Comments: 13 pages, 9 figures.
   Journal ref: IEEE Transactions on Systems, Man, and Cybernetics: Systems (2025).
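"Power-law activating patterns" can be illustrated with a toy renewal process: each individual alternates between activated and quiescent states, drawing every sojourn time from a Pareto distribution. This is a schematic rendering under assumed parameters, not the paper's exact model:

    import numpy as np

    rng = np.random.default_rng(2)

    def pareto_wait(alpha, size):
        # Power-law (Pareto) waiting time with tail exponent alpha, minimum 1.
        return (1.0 - rng.random(size)) ** (-1.0 / alpha)

    # N individuals flip between activated (1) and quiescent (0) states,
    # each sojourn lasting a power-law-distributed time.
    N, T = 500, 2000
    state = rng.integers(0, 2, size=N)
    next_switch = pareto_wait(1.5, N)
    activated_fraction = []
    for t in range(T):
        due = next_switch <= t
        state[due] ^= 1                          # flip activated <-> quiescent
        next_switch[due] = t + pareto_wait(1.5, due.sum())
        activated_fraction.append(state.mean())

    print(np.mean(activated_fraction[-500:]))    # stationary activated share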
9. arXiv:2502.08658 [cs.RO, cs.AI]
   Title: Analyzable Parameters Dominated Vehicle Platoon Dynamics Modeling and Analysis: A Physics-Encoded Deep Learning Approach
   Authors: Hao Lyu, Yanyong Guo, Pan Liu, Shuo Feng, Weilin Ren, Quansheng Yue
   Abstract: Recently, artificial intelligence (AI)-enabled nonlinear vehicle platoon dynamics modeling has come to play a crucial role in predicting and optimizing the interactions between vehicles. Existing efforts lack the extraction and capture of vehicle behavior interaction features at the platoon scale. More importantly, maintaining high modeling accuracy without losing physical analyzability remains an unsolved problem. To this end, this paper proposes a novel physics-encoded deep learning network, named PeMTFLN, to model nonlinear vehicle platoon dynamics. Specifically, an analyzable parameters encoded computational graph (APeCG) is designed to guide the platoon's response to the driving behavior of the lead vehicle while ensuring local stability. Besides, a multi-scale trajectory feature learning network (MTFLN) is constructed to capture platoon following patterns and infer from trajectory data the physical parameters required by APeCG. Human-driven vehicle trajectory datasets (HIGHSIM) were used to train the proposed PeMTFLN. Trajectory prediction experiments show that PeMTFLN is superior to the baseline models in predictive accuracy for speed and gap. The stability analysis shows that the physical parameters in APeCG are able to reproduce platoon stability under real-world conditions. In simulation experiments, PeMTFLN attains low inference error in platoon trajectory generation. Moreover, PeMTFLN accurately reproduces ground-truth safety statistics. The code of the proposed PeMTFLN is open source.
   Submitted 9 February, 2025; originally announced February 2025.
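The local-stability property that APeCG is designed to guarantee can be illustrated with a classical linear car-following controller, whose gains play the role of "analyzable parameters". This is a stand-in model under assumed gains, not PeMTFLN itself:

    import numpy as np

    # Each follower adjusts acceleration from gap error and speed difference
    # to the vehicle ahead. Gains k_gap, k_vel are hypothetical parameters.
    k_gap, k_vel, dt = 0.1, 0.5, 0.1
    n, steps, desired_gap = 5, 600, 20.0

    pos = np.arange(n)[::-1] * desired_gap     # leader first, 20 m spacing
    vel = np.full(n, 15.0)
    for t in range(steps):
        acc = np.zeros(n)
        acc[0] = -2.0 if 100 <= t < 130 else 0.0   # lead vehicle brakes
        for i in range(1, n):
            gap = pos[i - 1] - pos[i] - desired_gap
            acc[i] = k_gap * gap + k_vel * (vel[i - 1] - vel[i])
        vel += acc * dt
        pos += vel * dt

    gaps = pos[:-1] - pos[1:]
    print(np.round(gaps, 2))   # gaps relax back toward desired_gap when stable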
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08021">arXiv:2502.08021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08021">pdf</a>, <a href="https://arxiv.org/format/2502.08021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Model Selection for Off-policy Evaluation: New Algorithms and Experimental Protocol </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Lingfeng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+S">Shivangi Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jinghan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+A">Audrey Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Amortila%2C+P">Philip Amortila</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+N">Nan Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08021v1-abstract-short" style="display: inline;"> Holdout validation and hyperparameter tuning from data is a long-standing problem in offline reinforcement learning (RL). A standard framework is to use off-policy evaluation (OPE) methods to evaluate and select the policies, but OPE either incurs exponential variance (e.g., importance sampling) or has hyperparameters on their own (e.g., FQE and model-based). In this work we focus on hyperparamete&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08021v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08021v1-abstract-full" style="display: none;"> Holdout validation and hyperparameter tuning from data is a long-standing problem in offline reinforcement learning (RL). A standard framework is to use off-policy evaluation (OPE) methods to evaluate and select the policies, but OPE either incurs exponential variance (e.g., importance sampling) or has hyperparameters on their own (e.g., FQE and model-based). In this work we focus on hyperparameter tuning for OPE itself, which is even more under-investigated. Concretely, we select among candidate value functions (&#34;model-free&#34;) or dynamics (&#34;model-based&#34;) to best assess the performance of a target policy. Our contributions are two fold. We develop: (1) new model-free and model-based selectors with theoretical guarantees, and (2) a new experimental protocol for empirically evaluating them. Compared to the model-free protocol in prior works, our new protocol allows for more stable generation of candidate value functions, better control of misspecification, and evaluation of model-free and model-based methods alike. 
arXiv:2502.07239 (https://arxiv.org/abs/2502.07239) [cs.CV, cs.AI]
Title: Contextual Gesture: Co-Speech Gesture Video Generation through Context-aware Gesture Representation
Authors: Pinxin Liu, Pengfei Zhang, Hyeongwoo Kim, Pablo Garrido, Ari Sharpio, Kyle Olszewski

Abstract: Co-speech gesture generation is crucial for creating lifelike avatars and enhancing human-computer interaction by synchronizing gestures with speech. Despite recent advances, existing methods struggle to accurately identify the rhythmic or semantic triggers from audio for generating contextualized gesture patterns, and to achieve pixel-level realism. To address these challenges, we introduce Contextual Gesture, a framework that improves co-speech gesture video generation through three innovative components: (1) a chronological speech-gesture alignment that temporally connects the two modalities, (2) a contextualized gesture tokenization that incorporates speech context into motion pattern representations through distillation, and (3) a structure-aware refinement module that employs edge connections to link gesture keypoints and improve video generation. Extensive experiments demonstrate that Contextual Gesture not only produces realistic, speech-aligned gesture videos but also supports long-sequence generation and video gesture editing applications. Project page: https://andypinxinliu.github.io/Contextual-Gesture/

Submitted 10 February, 2025; originally announced February 2025.

arXiv:2502.07007 (https://arxiv.org/abs/2502.07007) [cs.CV]
Title: Grounding Creativity in Physics: A Brief Survey of Physical Priors in AIGC
Authors: Siwei Meng, Yawei Luo, Ping Liu

Abstract: Recent advances in AI-generated content have significantly improved the realism of 3D and 4D generation. However, most existing methods prioritize appearance consistency while neglecting underlying physical principles, leading to artifacts such as unrealistic deformations, unstable dynamics, and implausible object interactions. Incorporating physics priors into generative models has therefore become a crucial research direction for enhancing structural integrity and motion realism. This survey reviews physics-aware generative methods, systematically analyzing how physical constraints are integrated into 3D and 4D generation. First, we examine recent work incorporating physical priors into static and dynamic 3D generation, categorizing methods by representation type: vision-based, NeRF-based, and Gaussian-Splatting-based approaches. Second, we explore emerging techniques in 4D generation, focusing on methods that model temporal dynamics with physical simulations. Finally, we conduct a comparative analysis of major methods, highlighting their strengths, limitations, and suitability for different materials and motion dynamics. By presenting an in-depth analysis of physics-grounded AIGC, this survey aims to bridge the gap between generative models and physical realism, providing insights that inspire future research in physically consistent content generation.

Submitted 10 February, 2025; originally announced February 2025.

arXiv:2502.06707 (https://arxiv.org/abs/2502.06707) [cs.CE]
Title: FinMamba: Market-Aware Graph Enhanced Multi-Level Mamba for Stock Movement Prediction
Authors: Yifan Hu, Peiyuan Liu, Yuante Li, Dawei Cheng, Naiqi Li, Tao Dai, Jigang Bao, Shu-Tao Xia

Abstract: Recently, combining stock features with inter-stock correlations has become a common and effective approach for stock movement prediction. However, financial data present significant challenges due to their low signal-to-noise ratio and the dynamic complexity of the market, which give rise to two key limitations in existing methods. First, the relationships between stocks are influenced by multifaceted factors, including macroeconomic market dynamics, and current models fail to adaptively capture these evolving interactions under specific market conditions. Second, given the accuracy and timeliness required by real-world trading, existing financial data mining methods struggle to extract beneficial pattern-oriented dependencies from long historical data while maintaining high efficiency and low memory consumption. To address these limitations, we propose FinMamba, a Mamba-GNN-based framework for market-aware, multi-level hybrid stock movement prediction. Specifically, we devise a dynamic graph that learns the changing representations of inter-stock relationships by integrating a pruning module that adapts to market trends. With a selective mechanism, the multi-level Mamba then discards irrelevant information and resets states to recall historical patterns across multiple time scales at linear time cost, which are jointly optimized for reliable prediction. Extensive experiments on U.S. and Chinese stock markets demonstrate the effectiveness of FinMamba, which achieves state-of-the-art prediction accuracy and trading profitability while maintaining low computational complexity. The code is available at https://github.com/TROUBADOUR000/FinMamba.

Submitted 10 February, 2025; originally announced February 2025.
arXiv:2502.05673 (https://arxiv.org/abs/2502.05673) [cs.CV]
Title: The Evolution of Dataset Distillation: Toward Scalable and Generalizable Solutions
Authors: Ping Liu, Jiawei Du

Abstract: Dataset distillation, which condenses large-scale datasets into compact synthetic representations, has emerged as a critical solution for training modern deep learning models efficiently. While prior surveys focus on developments before 2023, this work comprehensively reviews recent advances, emphasizing scalability to large-scale datasets such as ImageNet-1K and ImageNet-21K. We categorize progress into a few key methodologies: trajectory matching, gradient matching, distribution matching, scalable generative approaches, and decoupled optimization mechanisms. As a comprehensive examination of recent advances in dataset distillation, this survey highlights breakthrough innovations: the SRe2L framework for efficient and effective condensation, soft-label strategies that significantly enhance model accuracy, and lossless distillation techniques that maximize compression while maintaining performance. Beyond these methodological advances, we address critical challenges, including robustness against adversarial and backdoor attacks and effective handling of non-IID data distributions. Additionally, we explore emerging applications in video and audio processing, multi-modal learning, medical imaging, and scientific computing, highlighting the domain versatility of dataset distillation. By offering extensive performance comparisons and actionable research directions, this survey equips researchers and practitioners with practical insights to advance efficient and generalizable dataset distillation, paving the way for future innovations.

Submitted 8 February, 2025; originally announced February 2025.
arXiv:2502.05641 (https://arxiv.org/abs/2502.05641) [cs.RO, cs.AI] doi: https://doi.org/10.1007/978-3-031-73033-7_1
Title: Generating Physically Realistic and Directable Human Motions from Multi-Modal Inputs
Authors: Aayam Shrestha, Pan Liu, German Ros, Kai Yuan, Alan Fern

Abstract: This work focuses on generating realistic, physically based human behaviors from multi-modal inputs that may only partially specify the desired motion. For example, the input may come from a VR controller providing arm motion and body velocity, partial key-point animation, computer vision applied to video, or even higher-level motion goals. This requires a versatile low-level humanoid controller that can handle sparse, under-specified guidance, seamlessly switch between skills, and recover from failures. Current approaches to learning humanoid controllers from demonstration data capture some of these characteristics, but none achieve them all. To this end, we introduce the Masked Humanoid Controller (MHC), a novel approach that applies multi-objective imitation learning to augmented and selectively masked motion demonstrations. The training methodology yields an MHC with three key capabilities: catching up to out-of-sync input commands, combining elements from multiple motion sequences, and completing unspecified parts of motions from sparse multi-modal input. We demonstrate these capabilities for an MHC learned over a dataset of 87 diverse skills and showcase different multi-modal use cases, including integration with planning frameworks, highlighting MHC's ability to solve new user-defined tasks without any fine-tuning.

Submitted 8 February, 2025; originally announced February 2025.
Journal ref: The European Conference on Computer Vision (ECCV), 2024

arXiv:2502.03387 (https://arxiv.org/abs/2502.03387) [cs.CL, cs.AI]
Title: LIMO: Less is More for Reasoning
Authors: Yixin Ye, Zhen Huang, Yang Xiao, Ethan Chern, Shijie Xia, Pengfei Liu

Abstract: We present a fundamental discovery that challenges our understanding of how complex reasoning emerges in large language models. While conventional wisdom suggests that sophisticated reasoning tasks demand extensive training data (>100,000 examples), we demonstrate that complex mathematical reasoning abilities can be effectively elicited with surprisingly few examples. Through comprehensive experiments, our proposed model LIMO demonstrates unprecedented performance in mathematical reasoning. With merely 817 curated training samples, LIMO achieves 57.1% accuracy on AIME and 94.8% on MATH, improving from 6.5% and 59.2% for previous SFT-based models, while using only 1% of the training data required by previous approaches. LIMO also demonstrates exceptional out-of-distribution generalization, achieving a 40.5% absolute improvement across 10 diverse benchmarks and outperforming models trained on 100x more data, challenging the notion that SFT leads to memorization rather than generalization. Based on these results, we propose the Less-Is-More Reasoning Hypothesis (LIMO Hypothesis): in foundation models where domain knowledge has been comprehensively encoded during pre-training, sophisticated reasoning capabilities can emerge through minimal but precisely orchestrated demonstrations of cognitive processes. The hypothesis posits that the elicitation threshold for complex reasoning is determined by two key factors: (1) the completeness of the model's encoded knowledge foundation during pre-training, and (2) the effectiveness of post-training examples as "cognitive templates" that show the model how to utilize its knowledge base to solve complex reasoning tasks. To facilitate reproducibility and future research in data-efficient reasoning, we release LIMO as a comprehensive open-source suite at https://github.com/GAIR-NLP/LIMO.

Submitted 5 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01770">arXiv:2502.01770</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01770">pdf</a>, <a href="https://arxiv.org/format/2502.01770">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Hamming Attention Distillation: Binarizing Keys and Queries for Efficient Long-Context Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Horton%2C+M">Mark Horton</a>, <a href="/search/cs?searchtype=author&amp;query=Molom-Ochir%2C+T">Tergel Molom-Ochir</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peter Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gopal%2C+B">Bhavna Gopal</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+C">Chiyue Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Cong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Taylor%2C+B">Brady Taylor</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+D">Deliang Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S+X">Shan X. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01770v1-abstract-short" style="display: inline;"> Pre-trained transformer models with extended context windows are notoriously expensive to run at scale, often limiting real-world deployment due to their high computational and memory requirements. In this paper, we introduce Hamming Attention Distillation (HAD), a novel framework that binarizes keys and queries in the attention mechanism to achieve significant efficiency gains. By converting keys&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01770v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01770v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01770v1-abstract-full" style="display: none;"> Pre-trained transformer models with extended context windows are notoriously expensive to run at scale, often limiting real-world deployment due to their high computational and memory requirements. In this paper, we introduce Hamming Attention Distillation (HAD), a novel framework that binarizes keys and queries in the attention mechanism to achieve significant efficiency gains. By converting keys and queries into {-1, +1} vectors and replacing dot-product operations with efficient Hamming distance computations, our method drastically reduces computational overhead. 
Additionally, we incorporate attention matrix sparsification to prune low-impact activations, which further reduces the cost of processing long-context sequences. \par Despite these aggressive compression strategies, our distilled approach preserves a high degree of representational power, leading to substantially improved accuracy compared to prior transformer binarization methods. We evaluate HAD on a range of tasks and models, including the GLUE benchmark, ImageNet, and QuALITY, demonstrating state-of-the-art performance among binarized Transformers while drastically reducing the computational costs of long-context inference. \par We implement HAD in custom hardware simulations, demonstrating superior performance characteristics compared to a custom hardware implementation of standard attention. HAD achieves just $\mathbf{1.78}\%$ performance losses on GLUE compared to $9.08\%$ in state-of-the-art binarization work, and $\mathbf{2.5}\%$ performance losses on ImageNet compared to $12.14\%$, all while targeting custom hardware with a $\mathbf{79}\%$ area reduction and $\mathbf{87}\%$ power reduction compared to its standard attention counterpart. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01770v1-abstract-full').style.display = 'none'; document.getElementById('2502.01770v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00043">arXiv:2502.00043</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00043">pdf</a>, <a href="https://arxiv.org/ps/2502.00043">ps</a>, <a href="https://arxiv.org/format/2502.00043">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A scalable adaptive deep Koopman predictive controller for real-time optimization of mixed traffic flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+H">Hao Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yanyong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+N">Nan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Ting Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00043v1-abstract-short" style="display: inline;"> The use of connected automated vehicle (CAV) is advocated to mitigate traffic oscillations in mixed traffic flow consisting of CAVs and human driven vehicles (HDVs). This study proposes an adaptive deep Koopman predictive control framework (AdapKoopPC) for regulating mixed traffic flow. 
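The identity HAD exploits is worth spelling out: for d-dimensional {-1, +1} vectors, the dot product equals d minus twice the Hamming distance, so attention logits reduce to XOR-and-popcount operations in hardware. A minimal NumPy check of the identity (illustrative only, not the authors' implementation):

    import numpy as np

    def binarize(x):
        # Sign binarization to {-1, +1}; the distillation step trains the
        # model to tolerate this, here we simply apply it.
        return np.where(x >= 0, 1, -1).astype(np.int32)

    def scores_via_hamming(Q, K):
        # For +/-1 vectors of length d: dot(q, k) = d - 2 * hamming(q, k),
        # since each agreeing position contributes +1 and each mismatch -1.
        d = Q.shape[-1]
        hamming = (Q[:, None, :] != K[None, :, :]).sum(-1)  # pairwise mismatches
        return d - 2 * hamming

    Q = binarize(np.random.randn(4, 16))
    K = binarize(np.random.randn(6, 16))
    assert np.array_equal(scores_via_hamming(Q, K), Q @ K.T)

With keys and queries packed into machine words, the mismatch count becomes XOR plus popcount, which is where the reported area and power savings come from.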
arXiv:2502.00043 (https://arxiv.org/abs/2502.00043) [eess.SY, cs.AI]
Title: A scalable adaptive deep Koopman predictive controller for real-time optimization of mixed traffic flow
Authors: Hao Lyu, Yanyong Guo, Pan Liu, Nan Zheng, Ting Wang

Abstract: The use of connected automated vehicles (CAVs) is advocated to mitigate traffic oscillations in mixed traffic flow consisting of CAVs and human-driven vehicles (HDVs). This study proposes an adaptive deep Koopman predictive control framework (AdapKoopPC) for regulating mixed traffic flow. First, a Koopman-theory-based adaptive trajectory prediction deep network (AdapKoopnet) is designed to model HDVs' car-following behavior; AdapKoopnet represents HDV behavior with a linear model in a high-dimensional space. Second, model predictive control is employed to smooth the mixed traffic flow, where the combination of the linear dynamic model of CAVs and the linear prediction blocks from AdapKoopnet is embedded as the predictive model in AdapKoopPC. Finally, the predictive performance of the proposed AdapKoopnet is verified on the HighD naturalistic driving dataset, and the control performance of AdapKoopPC is validated by numerical simulations. Results demonstrate that AdapKoopnet provides more accurate predicted HDV trajectories than the baseline nonlinear models, and that AdapKoopPC achieves more effective control at lower computational cost than baselines in mitigating traffic oscillations, especially at low CAV penetration rates. The code of the proposed AdapKoopPC is open source.

Submitted 27 January, 2025; originally announced February 2025.
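The Koopman idea behind AdapKoopnet, representing nonlinear behavior with a linear model in a lifted space, can be illustrated with classic EDMD, where the observables are hand-picked instead of learned by a deep network (the toy dynamics f and the monomial features below are assumptions of this sketch):

    import numpy as np

    def lift(x):
        # Hand-picked observables; deep Koopman methods learn this map.
        return np.stack([x, x**2, x**3])

    f = lambda x: 0.9 * x + 0.05 * np.sin(x)          # toy nonlinear dynamics
    x = np.random.default_rng(0).uniform(-1, 1, 500)   # sampled states
    X, Y = lift(x), lift(f(x))                         # snapshot pairs

    # EDMD: least-squares fit of a linear operator K with Y ~ K X, so the
    # nonlinear system evolves (approximately) linearly in lifted coordinates.
    K = Y @ np.linalg.pinv(X)

    x0 = np.array([0.7])
    print((K @ lift(x0))[0, 0], f(x0)[0])  # lifted-linear one-step vs. truth

The payoff, as in the abstract, is that a linear predictor in the lifted space can be dropped directly into standard (linear) model predictive control.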
arXiv:2501.18898 (https://arxiv.org/abs/2501.18898) [cs.CV, cs.GR]
Title: GestureLSM: Latent Shortcut based Co-Speech Gesture Generation with Spatial-Temporal Modeling
Authors: Pinxin Liu, Luchuan Song, Junhua Huang, Chenliang Xu

Abstract: Controlling human gestures based on speech signals presents a significant challenge in computer vision. While existing works have made preliminary studies of generating holistic co-speech gestures from speech, the spatial interaction of body regions during speech remains barely explored, leading to implausible body-part interactions for a given speech signal. Furthermore, slow generation speed limits the construction of real-world digital avatars. To resolve these problems, we propose GestureLSM, a latent-shortcut-based approach to co-speech gesture generation with spatial-temporal modeling. We tokenize various body regions and explicitly model their interactions with spatial and temporal attention. To achieve real-time gesture generation, we examine the denoising patterns and design an effective time distribution that speeds up sampling while improving generation quality for the shortcut model. Extensive quantitative and qualitative experiments demonstrate the effectiveness of GestureLSM, showcasing its potential for various applications in the development of digital humans and embodied agents. Project page: https://andypinxinliu.github.io/GestureLSM

Submitted 31 January, 2025; originally announced January 2025.
arXiv:2501.18794 (https://arxiv.org/abs/2501.18794) [q-bio.GN, cs.AI]
Title: Survey and Improvement Strategies for Gene Prioritization with Large Language Models
Authors: Matthew Neeley, Guantong Qi, Guanchu Wang, Ruixiang Tang, Dongxue Mao, Chaozhong Liu, Sasidhar Pasupuleti, Bo Yuan, Fan Xia, Pengfei Liu, Zhandong Liu, Xia Hu

Abstract: Rare diseases are challenging to diagnose due to limited patient data and genetic diversity. Despite advances in variant prioritization, many cases remain undiagnosed. While large language models (LLMs) have performed well on medical exams, their effectiveness in diagnosing rare genetic diseases has not been assessed. To identify causal genes, we benchmarked various LLMs for gene prioritization. Using multi-agent techniques and Human Phenotype Ontology (HPO) classification, we categorized patients by phenotype and solvability level. Because LLM performance deteriorated as gene set size increased, we used a divide-and-conquer strategy to break the task into smaller subsets. At baseline, GPT-4 outperformed the other LLMs, achieving nearly 30% accuracy in correctly ranking causal genes. The multi-agent and HPO approaches helped distinguish confidently solved cases from challenging ones, highlighting the importance of known gene-phenotype associations and phenotype specificity: cases with specific phenotypes or clear associations were solved more accurately. However, we observed biases toward well-studied genes and sensitivity to input order, both of which hindered gene prioritization; our divide-and-conquer strategy improved accuracy by mitigating these biases. By utilizing HPO classification, novel multi-agent techniques, and our LLM strategy, we improved causal gene identification accuracy over our baseline evaluation. This approach streamlines rare disease diagnosis, facilitates reanalysis of unsolved cases, and accelerates gene discovery, supporting the development of targeted diagnostics and therapies.

Submitted 30 January, 2025; originally announced January 2025.
Comments: 11 pages, 4 figures, 10 pages of supplementary figures
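The divide-and-conquer strategy described above can be sketched generically. In this sketch, rank_genes_with_llm is a hypothetical placeholder for an actual LLM call, not an API from the paper, and the chunk/keep sizes are arbitrary assumptions:

    def rank_genes_with_llm(phenotypes, genes):
        # Hypothetical: returns `genes` ordered by LLM-judged relevance
        # to the patient's phenotypes.
        raise NotImplementedError

    def prioritize(phenotypes, genes, chunk=10, keep=3):
        # Rank small subsets independently, keep each subset's top hits,
        # then recurse on the survivors. Smaller prompts sidestep the
        # accuracy drop observed with large gene sets and reduce the
        # influence of input ordering.
        if len(genes) <= chunk:
            return rank_genes_with_llm(phenotypes, genes)
        survivors = []
        for i in range(0, len(genes), chunk):
            ranked = rank_genes_with_llm(phenotypes, genes[i:i + chunk])
            survivors.extend(ranked[:keep])
        return prioritize(phenotypes, survivors, chunk, keep)

Because keep < chunk, each round shrinks the candidate list, so the recursion terminates with a single small ranking call.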
arXiv:2501.16349 (https://arxiv.org/abs/2501.16349) [cs.LG, cs.AI]
Title: Risk-Informed Diffusion Transformer for Long-Tail Trajectory Prediction in the Crash Scenario
Authors: Junlan Chen, Pei Liu, Zihao Zhang, Hongyi Zhao, Yufei Ji, Ziyuan Pu

Abstract: Trajectory prediction methods have been widely applied in autonomous driving technologies. Although overall prediction accuracy is relatively high, the lack of critical-scenario trajectories in training data leads to a long-tail phenomenon: tail trajectories are typically more critical and harder to predict, and may include rare scenarios such as crashes. To address this problem, we extracted trajectory data from real-world crash scenarios, which contain more long-tail data. Based on these data, we integrated graph-based risk information and diffusion with a transformer and propose the Risk-Informed Diffusion Transformer (RI-DiT) trajectory prediction method. Extensive experiments on the real-world crash-scenario trajectories show that the proposed algorithm performs well: when predicting the tail 10% (Top 10%) of the data, the minADE/minFDE indicators are 0.016/2.667 m. We also examined trajectories across different long-tail distributions; the closer the data lie to the tail of the distribution, the less smooth the trajectories are. Our work extends the methods available for overcoming long-tail challenges in trajectory prediction. RI-DiT integrates inverse time-to-collision (ITTC) and traffic flow features, enabling more accurate prediction of long-tail trajectories and improving the safety of autonomous driving systems.

Submitted 18 January, 2025; originally announced January 2025.
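Inverse time-to-collision, which RI-DiT uses as risk information, is a standard surrogate safety measure: the closing speed divided by the gap. A small helper makes the definition concrete (how the paper encodes it into its graph is not shown here):

    import numpy as np

    def inverse_ttc(gap_m, v_follow, v_lead):
        # Inverse time-to-collision (1/s): closing speed over gap. Zero when
        # the gap is opening; larger values mean higher collision risk, and
        # unlike raw TTC it stays finite as the closing speed goes to zero.
        closing = np.maximum(v_follow - v_lead, 0.0)
        return closing / np.maximum(gap_m, 1e-6)

    print(inverse_ttc(20.0, 15.0, 10.0))  # 0.25 /s, i.e. TTC = 4 s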
arXiv:2501.15338 (https://arxiv.org/abs/2501.15338) [cs.GT, cs.LG, stat.ML]
Title: Fairness-aware Contextual Dynamic Pricing with Strategic Buyers
Authors: Pangpang Liu, Will Wei Sun

Abstract: Contextual pricing strategies are prevalent in online retailing, where the seller adjusts prices based on products' attributes and buyers' characteristics. Although such strategies can enhance sellers' profits, they raise fairness concerns when significant price disparities emerge among specific groups, such as gender or race. These disparities can lead to adverse perceptions of fairness among buyers and may even violate laws and regulations. Conversely, price differences can incentivize disadvantaged buyers to strategically manipulate their group identity to obtain a lower price. In this paper, we investigate contextual dynamic pricing with fairness constraints, taking into account buyers' strategic behavior when their group status is private and unobservable to the seller. We propose a dynamic pricing policy that simultaneously achieves price fairness and discourages strategic behavior. Our policy achieves an upper bound of $O(\sqrt{T}+H(T))$ regret over $T$ time horizons, where the term $H(T)$ arises from buyers' assessment of the fairness of the pricing policy based on their learned price difference. When buyers are able to learn the fairness of the price policy, this upper bound reduces to $O(\sqrt{T})$. We also prove an $\Omega(\sqrt{T})$ regret lower bound for any pricing policy under our problem setting. We support our findings with extensive experimental evidence, showcasing our policy's effectiveness. In our real-data analysis, we observe price discrimination against race in loan applications even after accounting for other contextual information; our proposed pricing policy achieves a 35.06% reduction in regret compared to the benchmark policy.

Submitted 25 January, 2025; originally announced January 2025.
We propose a fluid-driven anomaly randomization method that generates an unlimited number of realistic pathology profiles on-the-fly. UNA is trained on a combination of synthetic and real data, and can be applied directly to real images with potential pathology without the need for fine-tuning. We demonstrate UNA&#39;s effectiveness in reconstructing healthy brain anatomy and showcase its direct application to anomaly detection, using both simulated and real images from 3D healthy and stroke datasets, including CT and MRI scans. By bridging the gap between healthy and diseased images, UNA enables the use of general-purpose models on diseased images, opening up new opportunities for large-scale analysis of uncurated clinical images in the presence of pathology. Code is available at https://github.com/peirong26/UNA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13370v1-abstract-full').style.display = 'none'; document.getElementById('2501.13370v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13041">arXiv:2501.13041</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13041">pdf</a>, <a href="https://arxiv.org/format/2501.13041">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TimeFilter: Patch-Specific Spatial-Temporal Graph Filtration for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yifan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Guibin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+D">Disen Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Naiqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+D">Dawei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Shirui Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13041v1-abstract-short" style="display: inline;"> Current time series forecasting methods can be broadly classified into two categories: Channel Independent (CI) and Channel Dependent (CD) strategies, both aiming to capture the complex dependencies within time series data. 
Abstract: Current time series forecasting methods can be broadly classified into two categories: Channel Independent (CI) and Channel Dependent (CD) strategies, both aiming to capture the complex dependencies within time series data. However, the CI strategy fails to exploit highly correlated covariate information, while the CD strategy integrates all dependencies, including irrelevant or noisy ones, thus compromising generalization. To mitigate these issues, recent works have introduced the Channel Clustering (CC) strategy, which groups channels with similar characteristics and applies different modeling techniques to each cluster. However, coarse-grained clustering cannot flexibly capture complex, time-varying interactions. To address these challenges, we propose TimeFilter, a graph-based framework for adaptive and fine-grained dependency modeling. Specifically, after constructing a graph from the input sequence, TimeFilter filters out irrelevant correlations and preserves the most critical ones through patch-specific filtering. Extensive experiments on 13 real-world datasets from various application domains demonstrate the state-of-the-art performance of TimeFilter. The code is available at https://github.com/TROUBADOUR000/TimeFilter.
Submitted: 22 January, 2025; originally announced January 2025.
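To make the patch-specific filtration idea above concrete, here is a minimal sketch, built only from the abstract rather than the released code: channels are split into patches, patch affinities define a graph, and only the top-k edges per patch survive. Patch length and k are arbitrary choices here.

```python
import torch

def patch_specific_filter(x, patch_len=16, k=4):
    """Toy patch-specific graph filtration (illustrative, not TimeFilter's code):
    split each channel into patches, score pairwise patch affinity, and keep
    only the top-k neighbors per patch so noisy dependencies are dropped.

    x: (channels, time) tensor. Returns a (num_patches, num_patches) adjacency.
    """
    patches = x.unfold(1, patch_len, patch_len)   # (channels, n_patches, patch_len)
    patches = patches.reshape(-1, patch_len)      # flatten patches into graph nodes
    z = torch.nn.functional.normalize(patches, dim=1)
    affinity = z @ z.T                            # cosine similarity between patches
    affinity.fill_diagonal_(-1.0)                 # forbid self-loops
    topk = affinity.topk(k, dim=1)
    adj = torch.zeros_like(affinity)
    adj.scatter_(1, topk.indices, topk.values)    # per-node (patch-specific) filtration
    return adj

adj = patch_specific_filter(torch.randn(7, 96))
print(adj.shape, (adj != 0).sum(dim=1))           # k retained edges per patch
```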
arXiv:2501.10017 (https://arxiv.org/abs/2501.10017) [pdf]
Subjects: cs.AI (Artificial Intelligence); cs.DB (Databases)
Title: Enhancing Crash Frequency Modeling Based on Augmented Multi-Type Data by Hybrid VAE-Diffusion-Based Generative Neural Networks
Authors: Junlan Chen, Qijie He, Pei Liu, Wei Ma, Ziyuan Pu
Abstract: Crash frequency modelling analyzes the impact of factors like traffic volume, road geometry, and environmental conditions on crash occurrences. Inaccurate predictions can distort our understanding of these factors, leading to misguided policies and wasted resources, which jeopardize traffic safety. A key challenge in crash frequency modelling is the prevalence of excessive zero observations, caused by underreporting, the low probability of crashes, and high data collection costs. These zero observations often reduce model accuracy and introduce bias, complicating safety decision making. While existing approaches, such as statistical methods, data aggregation, and resampling, attempt to address this issue, they either rely on restrictive assumptions or result in significant information loss, distorting crash data. To overcome these limitations, we propose a hybrid VAE-Diffusion neural network designed to reduce zero observations and handle the complexities of multi-type tabular crash data (count, ordinal, nominal, and real-valued variables). We assess the quality of the synthetic data generated by this model through metrics such as similarity, accuracy, diversity, and structural consistency, and compare its predictive performance against traditional statistical models.
Our findings demonstrate that the hybrid VAE-Diffusion model outperforms baseline models across all metrics, offering a more effective approach to augmenting crash data and improving the accuracy of crash frequency predictions. This study highlights the potential of synthetic data to enhance traffic safety by improving crash frequency modelling and informing better policy decisions.
Submitted: 17 January, 2025; originally announced January 2025.
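The two-stage recipe the abstract describes (a VAE to embed mixed-type tabular records, a diffusion model trained in the latent space) can be sketched generically. The snippet below illustrates that pattern under simplifying assumptions only: purely real-valued features, stub data, and arbitrary dimensions; it is not the paper's architecture.

```python
import torch, torch.nn as nn

D, Z, T = 12, 4, 100                       # feature dim, latent dim, diffusion steps

enc = nn.Sequential(nn.Linear(D, 32), nn.ReLU(), nn.Linear(32, 2 * Z))
dec = nn.Sequential(nn.Linear(Z, 32), nn.ReLU(), nn.Linear(32, D))
denoiser = nn.Sequential(nn.Linear(Z + 1, 64), nn.ReLU(), nn.Linear(64, Z))

betas = torch.linspace(1e-4, 0.02, T)
abar = torch.cumprod(1 - betas, 0)          # cumulative alpha-bar schedule

x = torch.randn(256, D)                     # stand-in for encoded crash records

# Stage 1: VAE reconstruction + KL loss on the tabular rows.
mu, logvar = enc(x).chunk(2, dim=1)
z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()
kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(1).mean()
vae_loss = (dec(z) - x).pow(2).mean() + 1e-3 * kl

# Stage 2: denoising-diffusion loss in the VAE latent space.
t = torch.randint(0, T, (z.size(0),))
noise = torch.randn_like(z)
z_t = abar[t].sqrt()[:, None] * z.detach() + (1 - abar[t]).sqrt()[:, None] * noise
pred = denoiser(torch.cat([z_t, t[:, None] / T], dim=1))
diff_loss = (pred - noise).pow(2).mean()
print(vae_loss.item(), diff_loss.item())    # each stage would be optimized in turn
```

Sampling would run the diffusion chain backwards in latent space and decode with the VAE; the paper's per-type output heads for count, ordinal, and nominal columns are omitted here.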
arXiv:2501.09279 (https://arxiv.org/abs/2501.09279) [pdf]
Subjects: cs.AI (Artificial Intelligence)
Title: Text Semantics to Flexible Design: A Residential Layout Generation Method Based on Stable Diffusion Model
Authors: Zijin Qiu, Jiepeng Liu, Yi Xia, Hongtuo Qi, Pengkun Liu
Abstract: Flexibility in AI-based residential layout design remains a significant challenge, as traditional methods like rule-based heuristics and graph-based generation often lack flexibility and require substantial design knowledge from users. To address these limitations, we propose a cross-modal design approach based on the Stable Diffusion model for generating flexible residential layouts. The method offers multiple input types for learning objectives, allowing users to specify both boundaries and layouts. It incorporates natural language as design constraints and introduces ControlNet to enable stable layout generation through two distinct pathways. We also present a scheme that encapsulates design expertise within a knowledge graph and translates it into natural language, providing an interpretable representation of design knowledge. This comprehensibility and diversity of input options enable professionals and non-professionals alike to express design requirements directly, enhancing flexibility and controllability. Finally, experiments verify that the proposed method remains more flexible under multimodal constraints than state-of-the-art models, even when specific semantic information about room areas or connections is incomplete.
Submitted: 15 January, 2025; originally announced January 2025.
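For readers unfamiliar with the ControlNet pathway mentioned above, generic text-plus-spatial conditioning in diffusers looks roughly like this. The public canny-conditioned weights stand in for the paper's own trained boundary/layout conditions, and the file names are hypothetical:

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image

# Generic ControlNet conditioning: a text prompt plus a spatial condition image.
# Requires a CUDA GPU; the checkpoints below are public stand-ins, not the paper's.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet,
    torch_dtype=torch.float16).to("cuda")

boundary = load_image("boundary_condition.png")   # hypothetical boundary sketch
layout = pipe(
    "a two-bedroom residential floor plan with a south-facing living room",
    image=boundary, num_inference_steps=30).images[0]
layout.save("layout.png")
```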
arXiv:2501.06697 (https://arxiv.org/abs/2501.06697) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Mamba-MOC: A Multicategory Remote Object Counting via State Space Model
Authors: Peng Liu, Sen Lei, Heng-Chao Li
Abstract: Multicategory remote object counting is a fundamental task in computer vision, aimed at accurately estimating the number of objects of various categories in remote images. Existing methods rely on CNNs and Transformers, but CNNs struggle to capture global dependencies, and Transformers are computationally expensive, which limits their effectiveness in remote applications. Recently, Mamba has emerged as a promising solution in the field of computer vision, offering linear complexity for modeling global dependencies. To this end, we propose Mamba-MOC, a Mamba-based network designed for multicategory remote object counting, which represents the first application of Mamba to remote sensing object counting. Specifically, we propose a cross-scale interaction module to facilitate the deep integration of hierarchical features, and we design a context state space model to capture both global and local contextual information and provide local neighborhood information during the scan process. Experimental results in large-scale realistic scenarios demonstrate that our proposed method achieves state-of-the-art performance compared with mainstream counting algorithms.
Submitted: 11 January, 2025; originally announced January 2025.
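The state-space scan underlying Mamba-style models reduces to a simple linear recurrence. The sketch below shows only that primitive; real Mamba makes the parameters input-dependent and uses a hardware-aware parallel scan rather than a Python loop:

```python
import torch

def ssm_scan(x, A, B, C):
    """Minimal linear state-space recurrence:
        h_t = A h_{t-1} + B x_t ;  y_t = C h_t
    x: (T, d_in); A: (n, n); B: (n, d_in); C: (d_out, n).
    Cost is linear in sequence length T, which is the appeal over attention."""
    h = torch.zeros(A.shape[0])
    ys = []
    for x_t in x:
        h = A @ h + B @ x_t
        ys.append(C @ h)
    return torch.stack(ys)                 # (T, d_out)

T, d, n = 64, 3, 8
y = ssm_scan(torch.randn(T, d), 0.9 * torch.eye(n),
             torch.randn(n, d), torch.randn(2, n))
print(y.shape)                             # torch.Size([64, 2])
```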
arXiv:2501.06458 (https://arxiv.org/abs/2501.06458) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: O1 Replication Journey -- Part 3: Inference-time Scaling for Medical Reasoning
Authors: Zhongzhen Huang, Gui Geng, Shengyi Hua, Zhen Huang, Haoyang Zou, Shaoting Zhang, Pengfei Liu, Xiaofan Zhang
Abstract: Building upon our previous investigations of O1 replication (Part 1: Journey Learning [Qin et al., 2024] and Part 2: Distillation [Huang et al., 2024]), this work explores the potential of inference-time scaling in large language models (LLMs) for medical reasoning tasks, ranging from diagnostic decision-making to treatment planning. Through extensive experiments on medical benchmarks of varying complexity (MedQA, Medbullets, and JAMA Clinical Challenges), our investigation reveals several key insights: (1) Increasing inference time does lead to improved performance; with a modest training set of 500 samples, our model yields substantial performance improvements of 6%-11%. (2) Task complexity directly correlates with the required length of reasoning chains, confirming the necessity of extended thought processes for challenging problems. (3) The differential diagnoses generated by our model adhere to the principles of the hypothetico-deductive method, producing a list of potential conditions that may explain a patient's symptoms and systematically narrowing these possibilities by evaluating the evidence. These findings demonstrate the promising synergy between inference-time scaling and journey learning in advancing LLMs' real-world clinical reasoning capabilities.
Submitted: 11 January, 2025; originally announced January 2025.
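Inference-time scaling takes several forms; one easily illustrated variant is repeated sampling with majority voting (self-consistency). The sketch below shows that variant only; the paper's approach of scaling reasoning-chain length is related but distinct, and the stub model is hypothetical:

```python
from collections import Counter
import random

def scale_inference(question, sample_fn, n_samples=16):
    """Spend extra test-time compute by sampling several reasoning chains and
    majority-voting the final answers. sample_fn(question) returns one sampled
    answer string; more samples buys accuracy at linear extra cost."""
    answers = [sample_fn(question) for _ in range(n_samples)]
    best, count = Counter(answers).most_common(1)[0]
    return best, count / n_samples         # answer plus rough self-agreement

# Stub "model" that answers correctly ("B") about 70% of the time.
stub = lambda q: random.choices(["B", "A"], weights=[0.7, 0.3])[0]
print(scale_inference("Which drug interacts with warfarin?", stub))
```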
arXiv:2501.06250 (https://arxiv.org/abs/2501.06250) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction)
Title: Generative AI for Cel-Animation: A Survey
Authors: Yunlong Tang, Junjia Guo, Pinxin Liu, Zhiyuan Wang, Hang Hua, Jia-Xing Zhong, Yunzhong Xiao, Chao Huang, Luchuan Song, Susan Liang, Yizhi Song, Liu He, Jing Bi, Mingqian Feng, Xinyang Li, Zeliang Zhang, Chenliang Xu
Abstract: The traditional celluloid (Cel) animation production pipeline encompasses multiple essential steps, including storyboarding, layout design, keyframe animation, inbetweening, and colorization, which demand substantial manual effort, technical expertise, and significant time investment. These challenges have historically impeded the efficiency and scalability of Cel-Animation production.
The rise of generative artificial intelligence (GenAI), encompassing large language models, multimodal models, and diffusion models, offers innovative solutions by automating tasks such as inbetween frame generation, colorization, and storyboard creation. This survey explores how GenAI integration is revolutionizing traditional animation workflows by lowering technical barriers, broadening accessibility for a wider range of creators through tools like AniDoc, ToonCrafter, and AniSora, and enabling artists to focus more on creative expression and artistic innovation. Despite its potential, issues such as maintaining visual consistency, ensuring stylistic coherence, and addressing ethical considerations continue to pose challenges. Furthermore, this paper discusses future directions and explores potential advancements in AI-assisted animation. For further exploration and resources, please visit our GitHub repository: https://github.com/yunlong10/Awesome-AI4Animation
Submitted: 8 January, 2025; originally announced January 2025.
Comments: 20 pages

arXiv:2501.04908 (https://arxiv.org/abs/2501.04908) [pdf, other]
Subjects: cs.PL (Programming Languages); cs.AR (Hardware Architecture)
Title: HaVen: Hallucination-Mitigated LLM for Verilog Code Generation Aligned with HDL Engineers
Authors: Yiyao Yang, Fu Teng, Pengju Liu, Mengnan Qi, Chenyang Lv, Ji Li, Xuhong Zhang, Zhezhi He
Abstract: Recently, the use of large language models (LLMs) for Verilog code generation has attracted great research interest as a route to hardware design automation. However, previous works have shown a gap between the abilities of LLMs and the practical demands of hardware description language (HDL) engineering. This gap includes differences in how engineers phrase questions and hallucinations in the generated code. To address these challenges, we introduce HaVen, a novel LLM framework designed to mitigate hallucinations and align Verilog code generation with the practices of HDL engineers. HaVen tackles hallucination issues by proposing a comprehensive taxonomy and employing a chain-of-thought (CoT) mechanism to translate symbolic modalities (e.g. truth tables, state diagrams, etc.) into accurate natural language descriptions. Furthermore, HaVen bridges this gap by using a data augmentation strategy: it synthesizes high-quality instruction-code pairs that match real HDL engineering practices. Our experiments demonstrate that HaVen significantly improves the correctness of Verilog code generation, outperforming state-of-the-art LLM-based Verilog generation methods on the VerilogEval and RTLLM benchmarks. HaVen is publicly available at https://github.com/Intelligent-Computing-Research-Group/HaVen.
Submitted: 8 January, 2025; originally announced January 2025.
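The CoT step of translating a symbolic modality into natural language can be illustrated with a toy truth-table renderer; the helper, table, and wording below are hypothetical rather than HaVen's:

```python
def truth_table_to_text(inputs, output, rows):
    """Render a truth table as prose, the kind of symbolic-to-natural-language
    step the abstract describes. rows: list of ((input values), output value)."""
    lines = [f"The module has inputs {', '.join(inputs)} and output {output}."]
    for values, out in rows:
        cond = " and ".join(f"{name}={v}" for name, v in zip(inputs, values))
        lines.append(f"When {cond}, {output} is {out}.")
    return " ".join(lines)

# 2-input XOR as an example table.
xor_rows = [((0, 0), 0), ((0, 1), 1), ((1, 0), 1), ((1, 1), 0)]
print(truth_table_to_text(["a", "b"], "y", xor_rows))
```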
arXiv:2501.02010 (https://arxiv.org/abs/2501.02010) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: Explainable Neural Networks with Guarantees: A Sparse Estimation Approach
Authors: Antoine Ledent, Peng Liu
Abstract: Balancing predictive power and interpretability has long been a challenging research area, particularly in powerful yet complex models like neural networks, where nonlinearity obstructs direct interpretation. This paper introduces a novel approach to constructing an explainable neural network that harmonizes predictiveness and explainability. Our model, termed SparXnet, is designed as a linear combination of a sparse set of jointly learned features, each derived from a different trainable function applied to a single 1-dimensional input feature. Leveraging the ability to learn arbitrarily complex relationships, our architecture enables automatic selection of a sparse set of important features, with the final prediction being a linear combination of rescaled versions of these features. Through extensive experiments on synthetic and real-world datasets, we demonstrate the ability to select significant features while maintaining comparable predictive performance and direct interpretability. We also provide theoretical analysis of the generalization bounds of our framework, which are favorably linear in the number of selected features and only logarithmic in the number of input features. We further lift any dependence of the sample complexity on the number of parameters or the architectural details under very mild conditions. Our research paves the way for further work on sparse and explainable neural networks with guarantees.
Submitted: 18 February, 2025; v1 submitted 2 January, 2025; originally announced January 2025.
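A minimal reading of the SparXnet design as described in this abstract: one small trainable function per scalar input, combined linearly, with an L1 penalty driving feature selection. The sketch below fills in unstated details (network sizes, penalty weight) with guesses:

```python
import torch, torch.nn as nn

class SparXnetSketch(nn.Module):
    """Sketch of the abstract's design: a tiny net per 1-D input feature,
    combined linearly; an L1 penalty on the combination weights drives
    automatic feature selection. Architectural details are guesses."""
    def __init__(self, n_features, hidden=16):
        super().__init__()
        self.feature_nets = nn.ModuleList(
            nn.Sequential(nn.Linear(1, hidden), nn.Tanh(), nn.Linear(hidden, 1))
            for _ in range(n_features))
        self.weights = nn.Parameter(torch.zeros(n_features))

    def forward(self, x):                        # x: (batch, n_features)
        feats = torch.cat([f(x[:, j:j + 1]) for j, f in
                           enumerate(self.feature_nets)], dim=1)
        return feats @ self.weights              # linear, hence interpretable

model = SparXnetSketch(n_features=10)
x, y = torch.randn(64, 10), torch.randn(64)
loss = (model(x) - y).pow(2).mean() + 1e-2 * model.weights.abs().sum()
loss.backward()                                  # the L1 term sparsifies the weights
```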
arXiv:2501.00747 (https://arxiv.org/abs/2501.00747) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: DIVE: Diversified Iterative Self-Improvement
Authors: Yiwei Qin, Yixiu Liu, Pengfei Liu
Abstract: Recent advances in large language models (LLMs) have demonstrated the effectiveness of Iterative Self-Improvement (ISI) techniques. However, continuous training on self-generated data leads to reduced output diversity, a limitation that is particularly critical in reasoning tasks, where diverse solution paths are essential. We present DIVE (Diversified Iterative Self-Improvement), a novel framework that addresses this challenge through two key components: Sample Pool Expansion for broader solution exploration, and Data Selection for balancing diversity and quality in preference pairs. Experiments on the MATH and GSM8k datasets show that DIVE achieves a 10% to 45% relative increase in output diversity metrics while maintaining performance quality compared to vanilla ISI. Our ablation studies confirm both components' significance in achieving these improvements. Code is available at https://github.com/qinyiwei/DIVE.
Submitted: 1 January, 2025; originally announced January 2025.
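Selecting a diverse subset from an expanded sample pool can be illustrated with greedy farthest-point selection; DIVE's actual Data Selection criterion may differ, so treat this as the generic principle only:

```python
import numpy as np

def diverse_subset(embeddings, k):
    """Greedy farthest-point selection: repeatedly pick the candidate farthest
    from everything chosen so far, a simple way to trade quality for diversity.
    embeddings: (n, d) array of solution embeddings."""
    chosen = [0]
    dists = np.linalg.norm(embeddings - embeddings[0], axis=1)
    for _ in range(k - 1):
        nxt = int(dists.argmax())                # farthest from the chosen set
        chosen.append(nxt)
        dists = np.minimum(
            dists, np.linalg.norm(embeddings - embeddings[nxt], axis=1))
    return chosen

print(diverse_subset(np.random.randn(100, 8), k=5))
```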
arXiv:2501.00009 (https://arxiv.org/abs/2501.00009) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence)
Title: Model-Driven Deep Neural Network for Enhanced AoA Estimation Using 5G gNB
Authors: Shengheng Liu, Xingkang Li, Zihuan Mao, Peng Liu, Yongming Huang
Abstract: High-accuracy positioning has become a fundamental enabler for intelligent connected devices. Nevertheless, present wireless networks still rely on model-driven approaches to achieve positioning functionality, which are susceptible to performance degradation in practical scenarios, primarily due to hardware impairments. Integrating artificial intelligence into the positioning framework presents a promising solution to revolutionize the accuracy and robustness of location-based services. In this study, we address this challenge by reformulating the problem of angle-of-arrival (AoA) estimation as image reconstruction of the spatial spectrum. To this end, we design a model-driven deep neural network (MoD-DNN) that can automatically calibrate the angular-dependent phase error. The proposed MoD-DNN approach employs an iterative optimization scheme alternating between a convolutional neural network and a sparse conjugate gradient algorithm. Simulation and experimental results demonstrate the effectiveness of the proposed method in enhancing spectrum calibration and AoA estimation.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00009v1-abstract-full').style.display = 'none'; document.getElementById('2501.00009v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at AAAI 2024 (Main Technical Track)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20891">arXiv:2412.20891</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20891">pdf</a>, <a href="https://arxiv.org/format/2412.20891">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DoTA: Weight-Decomposed Tensor Adaptation for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiaolin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+X">Xiang Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peiyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luan%2C+J">Jian Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20891v1-abstract-short" style="display: inline;"> Low-rank adaptation (LoRA) reduces the computational and memory demands of fine-tuning large language models (LLMs) by approximating updates with low-rank matrices. However, low-rank approximation in two-dimensional space fails to capture high-dimensional structures within the target matrix. Recently, tensor decomposition methods have been explored for fine-tuning LLMs, leveraging their ability to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20891v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20891v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20891v1-abstract-full" style="display: none;"> Low-rank adaptation (LoRA) reduces the computational and memory demands of fine-tuning large language models (LLMs) by approximating updates with low-rank matrices. However, low-rank approximation in two-dimensional space fails to capture high-dimensional structures within the target matrix. Recently, tensor decomposition methods have been explored for fine-tuning LLMs, leveraging their ability to extract structured information. Yet, these approaches primarily rely on random initialization, and the impact of initialization on tensor adaptation remains underexplored. 
arXiv:2412.20891 (https://arxiv.org/abs/2412.20891) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: DoTA: Weight-Decomposed Tensor Adaptation for Large Language Models
Authors: Xiaolin Hu, Xiang Cheng, Peiyu Liu, Wei Liu, Jian Luan, Bin Wang, Yong Liu
Abstract: Low-rank adaptation (LoRA) reduces the computational and memory demands of fine-tuning large language models (LLMs) by approximating updates with low-rank matrices. However, low-rank approximation in two-dimensional space fails to capture high-dimensional structures within the target matrix. Recently, tensor decomposition methods have been explored for fine-tuning LLMs, leveraging their ability to extract structured information. Yet these approaches primarily rely on random initialization, and the impact of initialization on tensor adaptation remains underexplored. In this paper, we show that random initialization leads to validation loss that diverges significantly from that of full fine-tuning. To address this, we propose Weight-Decomposed Tensor Adaptation (DoTA), which leverages the Matrix Product Operator (MPO) decomposition of pre-trained weights for effective initialization when fine-tuning LLMs. We also introduce QDoTA, a version of DoTA designed for 4-bit quantization. Experiments on commonsense and arithmetic reasoning tasks show that DoTA outperforms random-initialization methods with fewer parameters. QDoTA further reduces memory consumption and achieves comparable performance to DoTA on commonsense reasoning tasks. We will release our code to support future research.
Submitted: 30 December, 2024; originally announced December 2024.
Comments: 12 pages, 6 figures
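The key initialization idea (factorize the pre-trained weight rather than start from random) can be shown with a truncated SVD standing in for the Matrix Product Operator decomposition, which uses more cores and tensor modes:

```python
import torch

def spectral_init(w, rank=8):
    """Split a pre-trained weight into two adapter factors via truncated SVD,
    a baby stand-in for MPO factorization: the point shown is only
    'initialize the adapter from the weight instead of randomly'."""
    u, s, vh = torch.linalg.svd(w, full_matrices=False)
    r = min(rank, s.numel())
    a = u[:, :r] * s[:r].sqrt()            # first factor,  (out_dim, r)
    b = s[:r].sqrt()[:, None] * vh[:r]     # second factor, (r, in_dim)
    return a, b

w = torch.randn(32, 32)                    # stand-in pre-trained weight
a, b = spectral_init(w)
print((w - a @ b).norm() / w.norm())       # relative reconstruction error at rank 8
```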
arXiv:2412.19483 (https://arxiv.org/abs/2412.19483) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Learning Radiance Fields from a Single Snapshot Compressive Image
Authors: Yunhao Li, Xiang Liu, Xiaodong Wang, Xin Yuan, Peidong Liu
Abstract: In this paper, we explore the potential of the Snapshot Compressive Imaging (SCI) technique for recovering the underlying 3D scene structure from a single temporally compressed image. SCI is a cost-effective method that enables the recording of high-dimensional data, such as hyperspectral or temporal information, into a single image using low-cost 2D imaging sensors. To achieve this, a series of specially designed 2D masks is usually employed, reducing storage and transmission requirements and offering potential privacy protection. Inspired by this, we take one step further and recover the encoded 3D scene information by leveraging the powerful 3D scene representation capabilities of neural radiance fields (NeRF). Specifically, we propose SCINeRF, which formulates the physical imaging process of SCI as part of the training of NeRF, allowing us to exploit its impressive performance in capturing complex scene structures. In addition, we integrate the popular 3D Gaussian Splatting (3DGS) framework and propose SCISplat to improve 3D scene reconstruction quality and training/rendering speed by explicitly optimizing point clouds into 3D Gaussian representations. To assess the effectiveness of our method, we conduct extensive evaluations using both synthetic data and real data captured by our SCI system. Experimental results demonstrate that our proposed approach surpasses state-of-the-art methods in terms of image reconstruction and novel view synthesis. Moreover, our method can render high-frame-rate, multi-view consistent images in real time by leveraging SCI and the rendering capabilities of 3DGS. Code will be available at https://github.com/WU-CVGL/SCISplat.
Submitted: 27 December, 2024; originally announced December 2024.
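The SCI measurement model that such reconstructions invert is compact enough to state directly: T frames are modulated by known per-frame masks and summed into one snapshot, y = sum_t M_t * x_t (elementwise). A minimal simulation of the forward process:

```python
import numpy as np

# Video SCI forward model: T frames, each multiplied elementwise by a known
# binary coding mask, summed into a single 2D snapshot. Reconstruction must
# then infer the 3D (space-time) content from y and the masks alone.
rng = np.random.default_rng(0)
T, H, W = 8, 64, 64
x = rng.random((T, H, W))                  # latent frames (unknown at test time)
masks = rng.integers(0, 2, (T, H, W))      # known per-frame coding masks
y = (masks * x).sum(axis=0)                # the single compressed measurement
print(y.shape)                             # (64, 64): one image encodes 8 frames
```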
arXiv:2412.18551 (https://arxiv.org/abs/2412.18551) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Libra-Leaderboard: Towards Responsible AI through a Balanced Leaderboard of Safety and Capability
Authors: Haonan Li, Xudong Han, Zenan Zhai, Honglin Mu, Hao Wang, Zhenxuan Zhang, Yilin Geng, Shom Lin, Renxi Wang, Artem Shelmanov, Xiangyu Qi, Yuxia Wang, Donghai Hong, Youliang Yuan, Meng Chen, Haoqin Tu, Fajri Koto, Tatsuki Kuribayashi, Cong Zeng, Rishabh Bhardwaj, Bingchen Zhao, Yawen Duan, Yi Liu, Emad A. Alghamdi, Yaodong Yang, et al. (10 additional authors not shown)
Abstract: To address this gap, we introduce Libra-Leaderboard, a comprehensive framework designed to rank LLMs through a balanced evaluation of performance and safety. Combining a dynamic leaderboard with an interactive LLM arena, Libra-Leaderboard encourages the joint optimization of capability and safety.
Unlike traditional approaches that average performance and safety metrics, Libra-Leaderboard uses a distance-to-optimal-score method to calculate the overall rankings. This approach incentivizes models to achieve a balance rather than excelling in one dimension at the expense of the others. In its first release, Libra-Leaderboard evaluates 26 mainstream LLMs from 14 leading organizations, identifying critical safety challenges even in state-of-the-art models.
Submitted: 24 December, 2024; originally announced December 2024.
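A distance-to-optimal score can be made concrete as closeness to the ideal point (1, 1) in the (capability, safety) plane; the exact normalization Libra-Leaderboard uses may differ from this sketch:

```python
import math

def overall_score(capability, safety):
    """Score by distance to the ideal point (1, 1) rather than by averaging,
    so a model cannot buy rank in one axis with a deficit in the other.
    Rescaled to give 1 at the optimum and 0 at (0, 0)."""
    d = math.dist((capability, safety), (1.0, 1.0))
    return 1 - d / math.sqrt(2)

# A plain average ranks these two models the same (0.7); distance-to-optimal
# rewards the balanced one.
print(overall_score(0.9, 0.5), overall_score(0.7, 0.7))
```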
<span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18194v1-abstract-short" style="display: inline;"> General-purposed embodied agents are designed to understand the users&#39; natural instructions or intentions and act precisely to complete universal tasks. Recently, methods based on foundation models especially Vision-Language-Action models (VLAs) have shown a substantial potential to solve language-conditioned manipulation (LCM) tasks well. However, existing benchmarks do not adequately meet the ne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18194v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18194v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18194v1-abstract-full" style="display: none;"> General-purposed embodied agents are designed to understand the users&#39; natural instructions or intentions and act precisely to complete universal tasks. Recently, methods based on foundation models especially Vision-Language-Action models (VLAs) have shown a substantial potential to solve language-conditioned manipulation (LCM) tasks well. However, existing benchmarks do not adequately meet the needs of VLAs and relative algorithms. To better define such general-purpose tasks in the context of LLMs and advance the research in VLAs, we present VLABench, an open-source benchmark for evaluating universal LCM task learning. VLABench provides 100 carefully designed categories of tasks, with strong randomization in each category of task and a total of 2000+ objects. VLABench stands out from previous benchmarks in four key aspects: 1) tasks requiring world knowledge and common sense transfer, 2) natural language instructions with implicit human intentions rather than templates, 3) long-horizon tasks demanding multi-step reasoning, and 4) evaluation of both action policies and language model capabilities. The benchmark assesses multiple competencies including understanding of mesh\&amp;texture, spatial relationship, semantic instruction, physical laws, knowledge transfer and reasoning, etc. To support the downstream finetuning, we provide high-quality training data collected via an automated framework incorporating heuristic skills and prior information. The experimental results indicate that both the current state-of-the-art pretrained VLAs and the workflow based on VLMs face challenges in our tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18194v1-abstract-full').style.display = 'none'; document.getElementById('2412.18194v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
arXiv:2412.18112 (https://arxiv.org/abs/2412.18112) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Spectrum-oriented Point-supervised Saliency Detector for Hyperspectral Images
Authors: Peifu Liu, Tingfa Xu, Guokai Shi, Jingxuan Xu, Huan Chen, Jianan Li
Abstract: Hyperspectral salient object detection (HSOD) aims to extract targets or regions with significantly different spectra from hyperspectral images. While existing deep learning-based methods can achieve good detection results, they generally require pixel-level annotations, which are notably challenging to acquire for hyperspectral images. To address this issue, we introduce point supervision into HSOD and incorporate Spectral Saliency, derived from conventional HSOD methods, as a pivotal spectral representation within the framework. This integration leads to the development of a novel Spectrum-oriented Point-supervised Saliency Detector (SPSD). Specifically, we propose a novel pipeline, designed specifically for HSIs, to generate pseudo-labels, effectively mitigating the performance decline associated with the point supervision strategy. Additionally, Spectral Saliency is employed to counteract information loss during model supervision and saliency refinement, thereby maintaining the structural integrity and edge accuracy of the detected objects. Furthermore, we introduce a Spectrum-transformed Spatial Gate to focus more precisely on salient regions while reducing feature redundancy. We have carried out comprehensive experiments on both the HSOD-BIT and HS-SOD datasets to validate the efficacy of our proposed method, using mean absolute error (MAE), E-measure, F-measure, Area Under Curve, and Cross Correlation as evaluation metrics.
For instance, on the HSOD-BIT dataset, our SPSD achieves a MAE of 0.031 and an F-measure of 0.878. Thorough ablation studies have substantiated the effectiveness of each individual module and provided insights into the model&#39;s working mechanism. Further evaluations on RGB-thermal salient object detection datasets highlight the versatility of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18112v1-abstract-full').style.display = 'none'; document.getElementById('2412.18112v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TIM. Code: https://github.com/laprf/SPSD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17589">arXiv:2412.17589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17589">pdf</a>, <a href="https://arxiv.org/format/2412.17589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PC Agent: While You Sleep, AI Works -- A Cognitive Journey into Digital World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yanheng He</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+J">Jiahe Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shijie Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jiadi Su</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+R">Runze Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+H">Haoyang Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiangkun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pengfei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17589v1-abstract-short" style="display: inline;"> Imagine a world where AI can handle your work while you sleep - organizing your research materials, drafting a report, or creating a presentation you need for tomorrow. However, while current digital agents can perform simple tasks, they are far from capable of handling the complex real-world work that humans routinely perform. We present PC Agent, an AI system that demonstrates a crucial step tow&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17589v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17589v1-abstract-full" style="display: none;"> Imagine a world where AI can handle your work while you sleep - organizing your research materials, drafting a report, or creating a presentation you need for tomorrow. 
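
The two headline metrics quoted above are standard in salient object detection and easy to state precisely. A minimal NumPy sketch, assuming saliency maps normalized to [0, 1]; the fixed threshold is our simplification, since SOD papers often use adaptive thresholds.

```python
import numpy as np

def saliency_mae(pred, gt):
    """Mean absolute error between predicted and ground-truth saliency maps,
    both in [0, 1]. Lower is better (SPSD reports 0.031 on HSOD-BIT)."""
    return np.abs(pred - gt).mean()

def f_measure(pred, gt, beta2=0.3, thresh=0.5):
    """F-measure at a fixed threshold; beta^2 = 0.3 is the usual choice in
    the SOD literature (the paper may threshold adaptively)."""
    binary = pred >= thresh
    tp = np.logical_and(binary, gt > 0.5).sum()
    precision = tp / max(binary.sum(), 1)
    recall = tp / max((gt > 0.5).sum(), 1)
    if precision + recall == 0:
        return 0.0
    return (1 + beta2) * precision * recall / (beta2 * precision + recall)
```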

arXiv:2412.17589 (https://arxiv.org/abs/2412.17589) [cs.AI, cs.LG]
PC Agent: While You Sleep, AI Works -- A Cognitive Journey into Digital World
Authors: Yanheng He, Jiahe Jin, Shijie Xia, Jiadi Su, Runze Fan, Haoyang Zou, Xiangkun Hu, Pengfei Liu
Abstract: Imagine a world where AI can handle your work while you sleep - organizing your research materials, drafting a report, or creating a presentation you need for tomorrow. However, while current digital agents can perform simple tasks, they are far from capable of handling the complex real-world work that humans routinely perform. We present PC Agent, an AI system that demonstrates a crucial step toward this vision through human cognition transfer. Our key insight is that the path from executing simple "tasks" to handling complex "work" lies in efficiently capturing and learning from human cognitive processes during computer use. To validate this hypothesis, we introduce three key innovations: (1) PC Tracker, a lightweight infrastructure that efficiently collects high-quality human-computer interaction trajectories with complete cognitive context; (2) a two-stage cognition completion pipeline that transforms raw interaction data into rich cognitive trajectories by completing action semantics and thought processes; and (3) a multi-agent system combining a planning agent for decision-making with a grounding agent for robust visual grounding. Our preliminary experiments in PowerPoint presentation creation reveal that complex digital work capabilities can be achieved with a small amount of high-quality cognitive data - PC Agent, trained on just 133 cognitive trajectories, can handle sophisticated work scenarios involving up to 50 steps across multiple applications. This demonstrates the data efficiency of our approach, highlighting that the key to training capable digital agents lies in collecting human cognitive data. By open-sourcing our complete framework, including the data collection infrastructure and cognition completion methods, we aim to lower the barriers for the research community to develop truly capable digital agents.
Submitted 23 December, 2024; originally announced December 2024.
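
The "cognitive trajectory" idea (raw interaction events completed with action semantics and thought processes) suggests a record layout like the following. The field names are guesses for illustration only, not the released PC Tracker schema.

```python
from dataclasses import dataclass, field

# Hypothetical layout for a "cognitive trajectory": raw human-computer
# interaction events augmented with completed action semantics and thoughts.

@dataclass
class CognitiveEvent:
    screenshot: bytes        # screen state when the action was taken
    raw_action: str          # e.g. "click(512, 304)" as captured by the tracker
    action_semantics: str    # completed description, e.g. "click the Insert tab"
    thought: str             # reconstructed reasoning behind the action

@dataclass
class CognitiveTrajectory:
    task_description: str
    events: list[CognitiveEvent] = field(default_factory=list)
```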

arXiv:2412.16555 (https://arxiv.org/abs/2412.16555) [cs.CL]
Divide and Conquer: A Hybrid Strategy Defeats Multimodal Large Language Models
Authors: Yanxu Mao, Peipei Liu, Tiehan Cui, Congying Liu, Datao You
Abstract: Large language models (LLMs) are widely applied in various fields of society due to their powerful reasoning, understanding, and generation capabilities. However, the security issues associated with these models are becoming increasingly severe. Jailbreaking attacks, an important method for detecting vulnerabilities in LLMs, have been explored by researchers who attempt to induce these models to generate harmful content through various attack methods. Nevertheless, existing jailbreaking methods face numerous limitations, such as excessive query counts, limited coverage of jailbreak modalities, low attack success rates, and simplistic evaluation methods. To overcome these constraints, this paper proposes a multimodal jailbreaking method, JMLLM, which integrates multiple strategies to perform comprehensive jailbreak attacks across text, visual, and auditory modalities. Additionally, we contribute a new and comprehensive dataset for multimodal jailbreaking research, TriJail, which includes jailbreak prompts for all three modalities. Experiments on the TriJail dataset and the benchmark dataset AdvBench, conducted on 13 popular LLMs, demonstrate advanced attack success rates and a significant reduction in time overhead.
Submitted 21 December, 2024; originally announced December 2024.

arXiv:2412.13664 (https://arxiv.org/abs/2412.13664) [cs.RO]
A Skeleton-Based Topological Planner for Exploration in Complex Unknown Environments
Authors: Haochen Niu, Xingwu Ji, Lantao Zhang, Fei Wen, Rendong Ying, Peilin Liu
Abstract: The capability of autonomous exploration in complex, unknown environments is important in many robotic applications. While recent research on autonomous exploration has achieved much progress, limitations remain: existing methods relying on greedy heuristics or optimal path planning are often hindered by repetitive paths and high computational demands. To address these limitations, we propose a novel exploration framework that utilizes the global topology information of the observed environment to improve exploration efficiency while reducing computational overhead. Specifically, global information is utilized based on a skeletal topological graph representation of the environment geometry. We first propose an incremental skeleton extraction method based on wavefront propagation, upon which we design an approach to generate a lightweight topological graph that can effectively capture the environment's structural characteristics. Building upon this, we introduce a finite state machine that leverages the topological structure to efficiently plan coverage paths, substantially mitigating the back-and-forth maneuvers (BFMs) problem. Experimental results demonstrate the superiority of our method in comparison with state-of-the-art methods. The source code will be made publicly available at https://github.com/Haochen-Niu/STGPlanner.
Submitted 18 December, 2024; originally announced December 2024.
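
Wavefront propagation, the primitive the skeleton extraction builds on, is a breadth-first sweep over free space. A minimal grid-based sketch follows; the paper's incremental extraction and graph construction are more involved.

```python
from collections import deque

def wavefront_distances(grid, seed):
    """BFS wavefront over the free cells of an occupancy grid
    (0 = free, 1 = occupied). Returns a dict cell -> hop distance from
    `seed`. Skeleton extraction methods typically also track where
    wavefronts meet; this sketch shows only the propagation itself."""
    rows, cols = len(grid), len(grid[0])
    dist = {seed: 0}
    queue = deque([seed])
    while queue:
        r, c = queue.popleft()
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if 0 <= nr < rows and 0 <= nc < cols \
                    and grid[nr][nc] == 0 and (nr, nc) not in dist:
                dist[(nr, nc)] = dist[(r, c)] + 1
                queue.append((nr, nc))
    return dist
```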

arXiv:2412.11609 (https://arxiv.org/abs/2412.11609) [cs.CV]
CLIP-SR: Collaborative Linguistic and Image Processing for Super-Resolution
Authors: Bingwen Hu, Heng Liu, Zhedong Zheng, Ping Liu
Abstract: Convolutional Neural Networks (CNNs) have advanced Image Super-Resolution (SR), but most CNN-based methods rely solely on pixel-based transformations, often leading to artifacts and blurring, particularly with severe downsampling (e.g., 8x or 16x). Recent text-guided SR methods attempt to leverage textual information for enhanced detail, but they frequently struggle with effective alignment, resulting in inconsistent semantic coherence. To address these limitations, we introduce a multi-modal semantic enhancement approach that combines textual semantics with visual features, effectively tackling semantic mismatches and detail loss in highly degraded LR images. Our proposed multi-modal collaborative framework enables the production of realistic, high-quality SR images at significant upscaling factors. The framework integrates text and image inputs, employing a prompt predictor, a Text-Image Fusion Block (TIFBlock), and an Iterative Refinement Module, alongside CLIP (Contrastive Language-Image Pretraining) features, to guide a progressive enhancement process with fine-grained alignment. This alignment produces high-resolution outputs with crisp details and semantic coherence, even at large scaling factors. Through extensive comparative experiments and ablation studies, we validate the effectiveness of our approach. Additionally, by incorporating textual semantic guidance, our technique enables a degree of super-resolution editability while maintaining semantic coherence.
Submitted 16 December, 2024; originally announced December 2024.
Comments: 11 pages, 10 figures
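
The abstract does not spell out the TIFBlock's internals; one plausible reading of "text-image fusion with fine-grained alignment" is cross-attention from image features to CLIP text tokens, sketched below in PyTorch purely as an illustration of the general mechanism, not as the paper's design.

```python
import torch
import torch.nn as nn

class TextImageFusion(nn.Module):
    """A guess at what a text-image fusion block might look like: image
    features attend to CLIP text tokens via cross-attention. Illustrative
    only; not the paper's TIFBlock."""
    def __init__(self, dim, text_dim, heads=4):
        super().__init__()
        self.proj_text = nn.Linear(text_dim, dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, img_tokens, text_tokens):
        # img_tokens:  (B, N, dim) flattened spatial features
        # text_tokens: (B, T, text_dim) CLIP text embeddings
        text = self.proj_text(text_tokens)
        fused, _ = self.attn(img_tokens, text, text)  # image queries text
        return self.norm(img_tokens + fused)          # residual fusion
```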

arXiv:2412.11466 (https://arxiv.org/abs/2412.11466) [cs.LG, stat.ML]
Mining In-distribution Attributes in Outliers for Out-of-distribution Detection
Authors: Yutian Lei, Luping Ji, Pei Liu
Abstract: Out-of-distribution (OOD) detection is indispensable for deploying reliable machine learning systems in real-world scenarios. Recent works using auxiliary outliers in training have shown good potential. However, they seldom consider the intrinsic correlations between in-distribution (ID) and OOD data. In this work, we observe a clear correlation: OOD data usually possesses significant ID attributes. These attributes should be factored into the training process, rather than blindly suppressed as in previous approaches. Based on this insight, we propose a structured multi-view-based out-of-distribution detection learning (MVOL) framework, which facilitates rational handling of the intrinsic ID attributes in outliers. We provide theoretical insights into the effectiveness of MVOL for OOD detection. Extensive experiments demonstrate the superiority of our framework over others. MVOL effectively utilizes both auxiliary OOD datasets and even wild datasets with noisy in-distribution data. Code is available at https://github.com/UESTC-nnLab/MVOL.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
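
For contrast with "blindly suppressed as in previous approaches": the classic outlier-exposure objective (Hendrycks et al.) pushes auxiliary outliers toward a uniform posterior regardless of any ID attributes they carry. A sketch of that baseline, not of MVOL itself.

```python
import torch
import torch.nn.functional as F

def outlier_exposure_loss(logits_id, labels_id, logits_ood, lam=0.5):
    """Classic outlier-exposure objective: standard cross-entropy on
    in-distribution data plus a term pushing auxiliary outliers toward a
    uniform posterior. MVOL argues against suppressing outliers this
    blindly when they carry ID attributes; this baseline only makes the
    contrast concrete."""
    ce = F.cross_entropy(logits_id, labels_id)
    log_probs = F.log_softmax(logits_ood, dim=1)
    uniform_ce = -log_probs.mean()  # cross-entropy against the uniform target
    return ce + lam * uniform_ce
```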

arXiv:2412.10644 (https://arxiv.org/abs/2412.10644) [eess.SP, cs.AI]
Model-driven deep neural network for enhanced direction finding with commodity 5G gNodeB
Authors: Shengheng Liu, Zihuan Mao, Xingkang Li, Mengguan Pan, Peng Liu, Yongming Huang, Xiaohu You
Abstract: Pervasive and high-accuracy positioning has become increasingly important as a fundamental enabler for intelligent connected devices in mobile networks. Nevertheless, current wireless networks rely heavily on pure model-driven techniques to achieve positioning functionality, often succumbing to performance deterioration due to hardware impairments in practical scenarios. Here we reformulate the direction finding, or angle-of-arrival (AoA) estimation, problem as an image recovery task over the spatial spectrum and propose a new model-driven deep neural network (MoD-DNN) framework. The proposed MoD-DNN scheme comprises three modules: a multi-task autoencoder-based beamformer, a coarray spectrum generation module, and a model-driven deep learning-based spatial spectrum reconstruction module. Our technique enables automatic calibration of angular-dependent phase error, thereby enhancing the resilience of direction-finding precision against realistic system non-idealities. We validate the proposed scheme using both numerical simulations and field tests. The results show that the proposed MoD-DNN framework enables effective spectrum calibration and accurate AoA estimation. To the best of our knowledge, this study marks the first successful demonstration of hybrid data-and-model-driven direction finding utilizing readily available commodity 5G gNodeB.
Submitted 13 December, 2024; originally announced December 2024.
Comments: To appear in ACM TOSN. A preliminary version of this article was presented at the AAAI'2024 Main Technical Track.
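
The spatial spectrum that MoD-DNN treats as an image to be recovered is, in the purely model-driven baseline, a beamformer scan. As a reference point, here is a conventional (Bartlett) spectrum for a uniform linear array; the paper's pipeline refines such spectra with learned modules.

```python
import numpy as np

def bartlett_spectrum(snapshots, n_grid=181, d_over_lambda=0.5):
    """Conventional (Bartlett) spatial spectrum for a uniform linear array.
    `snapshots` is an (M antennas, N snapshots) complex array; the peak of
    the returned spectrum indicates the angle of arrival."""
    m = snapshots.shape[0]
    r = snapshots @ snapshots.conj().T / snapshots.shape[1]  # sample covariance
    thetas = np.linspace(-90, 90, n_grid)
    spectrum = np.empty(n_grid)
    for i, theta in enumerate(thetas):
        phase = -2j * np.pi * d_over_lambda * np.arange(m) * np.sin(np.radians(theta))
        a = np.exp(phase)                                    # steering vector
        spectrum[i] = np.real(a.conj() @ r @ a)              # a^H R a
    return thetas, spectrum
```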

arXiv:2412.09922 (https://arxiv.org/abs/2412.09922) [cs.CL]
Low-Resource Fast Text Classification Based on Intra-Class and Inter-Class Distance Calculation
Authors: Yanxu Mao, Peipei Liu, Tiehan Cui, Congying Liu, Datao You
Abstract: In recent years, text classification methods based on neural networks and pre-trained models have gained increasing attention and demonstrated excellent performance. However, these methods still have some limitations in practical applications: (1) They typically focus only on the matching similarity between sentences, yet implicit high-value information exists both within sentences of the same class and across different classes, which is crucial for classification tasks. (2) Existing methods such as pre-trained language models and graph-based approaches often consume substantial memory for training and text-graph construction. (3) Although some low-resource methods can achieve good performance, they often suffer from excessively long processing times. To address these challenges, we propose a low-resource and fast text classification model called LFTC. Our approach begins by constructing a compressor list for each class to fully mine the regularity information within intra-class data. We then remove redundant information irrelevant to the target classification to reduce processing time. Finally, we compute the similarity distance between text pairs for classification. We evaluate LFTC on 9 publicly available benchmark datasets, and the results demonstrate significant improvements in performance and processing time, especially under limited computational and data resources, highlighting its superior advantages.
Submitted 13 December, 2024; originally announced December 2024.
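
Compressor-based classification is easiest to see in its simplest form: nearest class by normalized compression distance with gzip. LFTC's per-class compressor lists and redundancy pruning sit on top of this idea; the sketch below shows only the core mechanism, not LFTC itself.

```python
import gzip

def c(x: bytes) -> int:
    """Compressed length, used as a computable proxy for Kolmogorov complexity."""
    return len(gzip.compress(x))

def ncd(a: bytes, b: bytes) -> float:
    """Normalized compression distance between two byte strings."""
    ca, cb, cab = c(a), c(b), c(a + b)
    return (cab - min(ca, cb)) / max(ca, cb)

def classify(text: str, class_corpora: dict[str, str]) -> str:
    """Assign `text` to the class whose corpus it compresses best against."""
    sample = text.encode()
    return min(class_corpora,
               key=lambda label: ncd(sample, class_corpora[label].encode()))
```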

arXiv:2412.09460 (https://arxiv.org/abs/2412.09460) [cs.CL]
The Impact of Copyrighted Material on Large Language Models: A Norwegian Perspective
Authors: Javier de la Rosa, Vladislav Mikhailov, Lemei Zhang, Freddy Wetjen, David Samuel, Peng Liu, Rolv-Arild Braaten, Petter Mæhlum, Magnus Breder Birkenes, Andrey Kutuzov, Tita Enstad, Hans Christian Farsethås, Svein Arne Brygfjeld, Jon Atle Gulla, Stephan Oepen, Erik Velldal, Wilfred Østgulen, Liljia Øvrelid, Aslak Sira Myhre
Abstract: The use of copyrighted materials in training language models raises critical legal and ethical questions. This paper presents a framework for, and the results of, empirically assessing the impact of publisher-controlled copyrighted corpora on the performance of generative large language models (LLMs) for Norwegian. When evaluated on a diverse set of tasks, we found that adding both books and newspapers to the data mixture of LLMs tends to improve their performance, while the addition of fiction works seems to be detrimental. Our experiments could inform the creation of a compensation scheme for authors whose works contribute to AI development.
Submitted 24 January, 2025; v1 submitted 12 December, 2024; originally announced December 2024.
Comments: 17 pages, 5 figures, 8 tables. Accepted at NoDaLiDa/Baltic-HLT 2025

arXiv:2412.08901 (https://arxiv.org/abs/2412.08901) [cs.LG, cs.AI]
Radiology Report Generation via Multi-objective Preference Optimization
Authors: Ting Xiao, Lei Shi, Peng Liu, Zhe Wang, Chenjia Bai
Abstract: Automatic Radiology Report Generation (RRG) is an important topic for alleviating the substantial workload of radiologists. Existing RRG approaches rely on supervised regression based on different architectures or additional knowledge injection, while the generated report may not align optimally with radiologists' preferences. In particular, the preferences of radiologists are inherently heterogeneous and multidimensional: some may prioritize report fluency, while others emphasize clinical accuracy. To address this problem, we propose a new RRG method via Multi-objective Preference Optimization (MPO) that aligns the pre-trained RRG model with multiple human preferences, which can be formulated by multi-dimensional reward functions and optimized via multi-objective reinforcement learning (RL). Specifically, we use a preference vector to represent the weight of preferences and use it as a condition for the RRG model. Then, a linearly weighted reward is obtained via a dot product between the preference vector and the multi-dimensional reward. Next, the RRG model is optimized to align with the preference vector by optimizing such a reward via RL. In the training stage, we randomly sample diverse preference vectors from the preference space and align the model by optimizing the weighted multi-objective rewards, which leads to an optimal policy on the entire preference space. At inference, our model can generate reports aligned with specific preferences without further fine-tuning. Extensive experiments on two public datasets show that the proposed method can generate reports catering to different preferences in a single model and achieve state-of-the-art performance.
Submitted 12 December, 2024; v1 submitted 11 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025

arXiv:2412.08210 (https://arxiv.org/abs/2412.08210) [cs.CV, eess.IV]
Unicorn: Unified Neural Image Compression with One Number Reconstruction
Authors: Qi Zheng, Haozhi Wang, Zihao Liu, Jiaming Liu, Peiye Liu, Zhijian Hao, Yanheng Lu, Dimin Niu, Jinjia Zhou, Minge Jing, Yibo Fan
Abstract: Prevalent lossy image compression schemes can be divided into: 1) explicit image compression (EIC), including traditional standards and neural end-to-end algorithms; and 2) implicit image compression (IIC) based on implicit neural representations (INR). The former is encountering an impasse, with bitrate reduction leveling off at the cost of tremendous complexity, while the latter suffers from excessive smoothing as well as lengthy decoder models. In this paper, we propose an innovative paradigm, which we dub Unicorn (Unified Neural Image Compression with One Number Reconstruction). By conceptualizing images as index-image pairs and learning the inherent distribution of these pairs in a subtle neural network model, Unicorn can reconstruct a visually pleasing image from randomly generated noise with only one index number. The neural model serves as the unified decoder of images, while the noises and indexes correspond to explicit representations. As a proof of concept, we propose an effective and efficient prototype of Unicorn based on latent diffusion models with tailored model designs. Quantitative and qualitative experimental results demonstrate that our prototype achieves significant bitrate reduction compared with EIC and IIC algorithms. More impressively, benefiting from the unified decoder, our compression ratio escalates as the quantity of images increases. We envision that more advanced model designs will endow Unicorn with greater potential in image compression. We will release our code at https://github.com/uniqzheng/Unicorn-Laduree.
Submitted 11 December, 2024; originally announced December 2024.

arXiv:2412.06838 (https://arxiv.org/abs/2412.06838) [cs.LG, cs.AR]
Hardware implementation of timely reliable Bayesian decision-making using memristors
Authors: Lekai Song, Pengyu Liu, Yang Liu, Jingfang Pei, Wenyu Cui, Songwei Liu, Yingyi Wen, Teng Ma, Kong-Pang Pun, Leonard W. T. Ng, Guohua Hu
Abstract: Brains perform decision-making according to Bayes' theorem. The theorem quantifies events as probabilities and, based on probability rules, renders decisions. Learning from this, Bayes' theorem can be applied to enable efficient user-scene interactions. However, given its probabilistic nature, implementing Bayes' theorem in hardware using conventional deterministic computing can incur excessive computational cost and decision latency. Though challenging, here we present a probabilistic computing approach based on memristors to implement Bayes' theorem. We integrate memristors with Boolean logic and, by exploiting the volatile stochastic switching of the memristors, realise probabilistic logic operations, key for hardware implementation of Bayes' theorem. To empirically validate the efficacy of the hardware Bayes' theorem in user-scene interactions, we develop lightweight Bayesian inference and fusion hardware operators using the probabilistic logic operations and apply the operators to road scene parsing for self-driving, including route planning and obstacle detection. The results show our operators can achieve reliable decisions in less than 0.4 ms (equivalently, 2,500 fps), outperforming human decision-making and existing driving assistance systems.
Submitted 12 January, 2025; v1 submitted 7 December, 2024; originally announced December 2024.
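
The probabilistic-logic primitive can be emulated in software with Bernoulli bitstreams, in the style of stochastic computing: an AND of two streams multiplies the probabilities they encode, which is the kind of product a Bayes operator needs. In the paper this role is played by the memristors' volatile stochastic switching; the snippet below is only a numerical illustration of the principle.

```python
import numpy as np

rng = np.random.default_rng(0)

def bitstream(p, n=10_000):
    """Emulate a stochastic bitstream whose density of 1s encodes the
    probability p. In hardware, memristor switching generates the
    randomness; here it is just a Bernoulli sample."""
    return rng.random(n) < p

# A probabilistic AND multiplies the encoded probabilities, the primitive
# needed for Bayes-rule products such as P(evidence | class) * P(class).
prior, likelihood = 0.3, 0.8
joint = (bitstream(prior) & bitstream(likelihood)).mean()
print(f"{joint:.3f}  ~=  {prior * likelihood:.3f}")
```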

arXiv:2412.06273 (https://arxiv.org/abs/2412.06273) [cs.CV, cs.GR]
Omni-Scene: Omni-Gaussian Representation for Ego-Centric Sparse-View Scene Reconstruction
Authors: Dongxu Wei, Zhiqi Li, Peidong Liu
Abstract: Prior works employing pixel-based Gaussian representation have demonstrated efficacy in feed-forward sparse-view reconstruction. However, such representation necessitates cross-view overlap for accurate depth estimation, and is challenged by object occlusions and frustum truncations. As a result, these methods require scene-centric data acquisition to maintain cross-view overlap and complete scene visibility to circumvent occlusions and truncations, which limits their applicability to scene-centric reconstruction. In contrast, in autonomous driving scenarios, a more practical paradigm is ego-centric reconstruction, which is characterized by minimal cross-view overlap and frequent occlusions and truncations. The limitations of pixel-based representation thus hinder the utility of prior works in this task. In light of this, this paper conducts an in-depth analysis of different representations and introduces the Omni-Gaussian representation, with tailored network designs, to complement their strengths and mitigate their drawbacks. Experiments show that our method significantly surpasses state-of-the-art methods, pixelSplat and MVSplat, in ego-centric reconstruction, and achieves comparable performance to prior works in scene-centric reconstruction. Furthermore, we extend our method with diffusion models, pioneering feed-forward multi-modal generation of 3D driving scenes.
Submitted 9 December, 2024; originally announced December 2024.
Comments: Under Review
3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
