Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 5,745 results for author: <span class="mathjax">Zhang, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+Z&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10391">arXiv:2502.10391</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10391">pdf</a>, <a href="https://arxiv.org/format/2502.10391">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MM-RLHF: The Next Step Forward in Multimodal LLM Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yi-Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+H">Haochen Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+C">Chaoyou Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peiyan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+J">Jianshu Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+W">Wulin Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huanyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junkang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yibo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+B">Bin Wen</a>, <a 
href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+T">Tingting Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+R">Rong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10391v1-abstract-short" style="display: inline;"> Despite notable advancements in Multimodal Large Language Models (MLLMs), most state-of-the-art models have not undergone thorough alignment with human preferences. This gap exists because current alignment research has primarily achieved progress in specific areas (e.g., hallucination reduction), while the broader question of whether aligning models with human preferences can systematically enhan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10391v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10391v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10391v1-abstract-full" style="display: none;"> Despite notable advancements in Multimodal Large Language Models (MLLMs), most state-of-the-art models have not undergone thorough alignment with human preferences. This gap exists because current alignment research has primarily achieved progress in specific areas (e.g., hallucination reduction), while the broader question of whether aligning models with human preferences can systematically enhance MLLM capability remains largely unexplored. To this end, we introduce MM-RLHF, a dataset containing $\mathbf{120k}$ fine-grained, human-annotated preference comparison pairs. This dataset represents a substantial advancement over existing resources, offering superior size, diversity, annotation granularity, and quality. Leveraging this dataset, we propose several key innovations to improve both the quality of reward models and the efficiency of alignment algorithms. Notably, we introduce a Critique-Based Reward Model, which generates critiques of model outputs before assigning scores, offering enhanced interpretability and more informative feedback compared to traditional scalar reward mechanisms. Additionally, we propose Dynamic Reward Scaling, a method that adjusts the loss weight of each sample according to the reward signal, thereby optimizing the use of high-quality comparison pairs. Our approach is rigorously evaluated across $\mathbf{10}$ distinct dimensions and $\mathbf{27}$ benchmarks, with results demonstrating significant and consistent improvements in model performance. Specifically, fine-tuning LLaVA-ov-7B with MM-RLHF and our alignment algorithm leads to a $\mathbf{19.5}$% increase in conversational abilities and a $\mathbf{60}$% improvement in safety. We have open-sourced the preference dataset, reward model, training and evaluation code, as well as reward modeling and safety benchmarks. For more details, please visit our project page: https://mm-rlhf.github.io. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10391v1-abstract-full').style.display = 'none'; document.getElementById('2502.10391v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://mm-rlhf.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bin Wang</a> , et al. 
2. arXiv:2502.10248 [pdf, other] - cs.CV, cs.CL

Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model

Authors: Guoqing Ma, Haoyang Huang, Kun Yan, Liangyu Chen, Nan Duan, Shengming Yin, Changyi Wan, Ranchen Ming, Xiaoniu Song, Xing Chen, Yu Zhou, Deshan Sun, Deyu Zhou, Jian Zhou, Kaijun Tan, Kang An, Mei Chen, Wei Ji, Qiling Wu, Wen Sun, Xin Han, Yanan Wei, Zheng Ge, Aojie Li, Bin Wang, et al. (90 additional authors not shown)

Abstract: We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of the current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators.

Submitted 14 February, 2025; originally announced February 2025.
Comments: 35 pages, 14 figures

3. arXiv:2502.10038 [pdf, other] - cs.AI

POI-Enhancer: An LLM-based Semantic Enhancement Framework for POI Representation Learning

Authors: Jiawei Cheng, Jingyuan Wang, Yichuan Zhang, Jiahao Ji, Yuanshao Zhu, Zhibo Zhang, Xiangyu Zhao

Abstract: POI representation learning plays a crucial role in handling tasks related to user mobility data. Recent studies have shown that enriching POI representations with multimodal information can significantly enhance their task performance. Previously, the textual information incorporated into POI representations typically involved only POI categories or check-in content, leading to relatively weak textual features in existing methods. In contrast, large language models (LLMs) trained on extensive text data have been found to possess rich textual knowledge. However, leveraging such knowledge to enhance POI representation learning presents two key challenges: first, how to extract POI-related knowledge from LLMs effectively, and second, how to integrate the extracted information to enhance POI representations. To address these challenges, we propose POI-Enhancer, a portable framework that leverages LLMs to improve POI representations produced by classic POI learning models. We first design three specialized prompts to extract semantic information from LLMs efficiently. Then, the Dual Feature Alignment module enhances the quality of the extracted information, while the Semantic Feature Fusion module preserves its integrity. The Cross Attention Fusion module then adaptively integrates such high-quality information into POI representations, and Multi-View Contrastive Learning further injects human-understandable semantic information into these representations. Extensive experiments on three real-world datasets demonstrate the effectiveness of our framework, showing significant improvements across all baseline representations.

Submitted 14 February, 2025; originally announced February 2025.

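The abstract names a Cross Attention Fusion module but gives no architecture. A minimal sketch, with dimensions, head count, and residual wiring all assumed, could be:

```python
import torch
import torch.nn as nn

class CrossAttentionFusion(nn.Module):
    """Sketch of a Cross Attention Fusion step: POI embeddings attend to
    LLM-derived semantic features. All dimensions and the residual
    connection are assumptions; the paper's actual wiring may differ."""

    def __init__(self, poi_dim=128, llm_dim=768):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=poi_dim, num_heads=4, batch_first=True)
        self.proj = nn.Linear(llm_dim, poi_dim)  # map LLM features into POI space

    def forward(self, poi_emb, llm_feats):
        # poi_emb: (B, N, poi_dim); llm_feats: (B, M, llm_dim)
        kv = self.proj(llm_feats)
        fused, _ = self.attn(query=poi_emb, key=kv, value=kv)
        return poi_emb + fused  # residual keeps the original representation intact
```
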
4. arXiv:2502.09963 [pdf, other] - cs.CV

Generating on Generated: An Approach Towards Self-Evolving Diffusion Models

Authors: Xulu Zhang, Xiaoyong Wei, Jinlin Wu, Jiaxin Wu, Zhaoxiang Zhang, Zhen Lei, Qing Li

Abstract: Recursive Self-Improvement (RSI) enables intelligence systems to autonomously refine their capabilities. This paper explores the application of RSI in text-to-image diffusion models, addressing the challenge of training collapse caused by synthetic data. We identify two key factors contributing to this collapse: the lack of perceptual alignment and the accumulation of generative hallucinations. To mitigate these issues, we propose three strategies: (1) a prompt construction and filtering pipeline designed to facilitate the generation of perceptually aligned data, (2) a preference sampling method to identify human-preferred samples and filter out generative hallucinations, and (3) a distribution-based weighting scheme to penalize selected samples with hallucinatory errors. Our extensive experiments validate the effectiveness of these approaches.

Submitted 14 February, 2025; originally announced February 2025.

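The distribution-based weighting scheme is described in a single clause, so any concrete form is speculative. A heavily hedged sketch, assuming a softmax over per-sample typicality scores under the real-data distribution:

```python
import torch

def distribution_weights(scores, tau=1.0):
    """Down-weight synthetic samples that look like generative hallucinations.

    `scores` is any per-sample density/typicality estimate under the
    real-data distribution (higher = more typical). The softmax form and
    temperature are assumptions; the abstract only says the weighting is
    distribution-based.
    """
    return torch.softmax(scores / tau, dim=0) * scores.numel()  # mean weight ~ 1

# usage sketch: loss = (distribution_weights(scores) * per_sample_loss).mean()
```
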
5. arXiv:2502.09872 [pdf, other] - cs.CV, cs.LG

Learning to Calibrate for Reliable Visual Fire Detection

Authors: Ziqi Zhang, Xiuzhuang Zhou, Xiangyang Gong

Abstract: Fire is characterized by its sudden onset and destructive power, making early fire detection crucial for ensuring human safety and protecting property. With the advancement of deep learning, the application of computer vision in fire detection has significantly improved. However, deep learning models often exhibit a tendency toward overconfidence, and most existing works focus primarily on enhancing classification performance, with limited attention given to uncertainty modeling. To address this issue, we propose transforming the Expected Calibration Error (ECE), a metric for measuring uncertainty, into a differentiable ECE loss function. This loss is then combined with the cross-entropy loss to guide the training process of multi-class fire detection models. Additionally, to achieve a good balance between classification accuracy and reliable decision-making, we introduce a curriculum learning-based approach that dynamically adjusts the weight of the ECE loss during training. Extensive experiments are conducted on two widely used multi-class fire detection datasets, DFAN and EdgeFireSmoke, validating the effectiveness of our uncertainty modeling method.

Submitted 13 February, 2025; originally announced February 2025.

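Hard-binned ECE is not differentiable, so some relaxation is needed; the abstract does not say which one the authors use. A sketch with a soft bin assignment (our assumption) and the curriculum-weighted combination with cross-entropy:

```python
import torch
import torch.nn.functional as F

def soft_ece_loss(logits, labels, n_bins=10, temp=0.01):
    """Differentiable surrogate for Expected Calibration Error.

    Bin membership is relaxed with a soft (RBF-style) assignment over bin
    centers; this relaxation is an assumption, since the abstract does not
    spell out how the ECE loss is made differentiable.
    """
    probs = logits.softmax(dim=-1)
    conf, pred = probs.max(dim=-1)            # per-sample confidence and prediction
    acc = (pred == labels).float()
    centers = torch.linspace(0, 1, n_bins, device=logits.device)
    # Soft assignment of each sample's confidence to every bin center: (B, n_bins)
    w = torch.softmax(-(conf.unsqueeze(1) - centers).pow(2) / temp, dim=1)
    mass = w.sum(0) + 1e-8
    bin_conf = (w * conf.unsqueeze(1)).sum(0) / mass
    bin_acc = (w * acc.unsqueeze(1)).sum(0) / mass
    return ((mass / w.sum()) * (bin_conf - bin_acc).abs()).sum()

# Combined objective from the abstract: cross-entropy plus weighted ECE,
# with `alpha` scheduled by curriculum learning during training.
# loss = F.cross_entropy(logits, labels) + alpha * soft_ece_loss(logits, labels)
```
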
6. arXiv:2502.09819 [pdf, other] - cs.CV, cs.AI, cs.GR, cs.LG, cs.PL

A Solver-Aided Hierarchical Language for LLM-Driven CAD Design

Authors: Benjamin T. Jones, Felix Hähnlein, Zihan Zhang, Maaz Ahmad, Vladimir Kim, Adriana Schulz

Abstract: Large language models (LLMs) have been enormously successful in solving a wide variety of structured and unstructured generative tasks, but they struggle to generate procedural geometry in Computer Aided Design (CAD). These difficulties arise from an inability to do spatial reasoning and the necessity to guide a model through complex, long range planning to generate complex geometry. We enable generative CAD design with LLMs through the introduction of a solver-aided, hierarchical domain specific language (DSL) called AIDL, which offloads the spatial reasoning requirements to a geometric constraint solver. Additionally, we show that in the few-shot regime, AIDL outperforms even a language with in-training data (OpenSCAD), both in terms of generating visual results closer to the prompt and creating objects that are easier to post-process and reason about.

Submitted 13 February, 2025; originally announced February 2025.

7. arXiv:2502.09766 [pdf, other] - cs.SE

LLM-Generated Microservice Implementations from RESTful API Definitions

Authors: Saurabh Chauhan, Zeeshan Rasheed, Abdul Malik Sami, Zheying Zhang, Jussi Rasku, Kai-Kristian Kemell, Pekka Abrahamsson

Abstract: The growing need for scalable, maintainable, and fast-deploying systems has made microservice architecture widely popular in software development. This paper presents a system that uses Large Language Models (LLMs) to automate the API-first development of RESTful microservices. The system assists in creating an OpenAPI specification, generating server code from it, and refining the code through a feedback loop that analyzes execution logs and error messages. By focusing on the API-first methodology, this system ensures that microservices are designed with well-defined interfaces, promoting consistency and reliability across the development life-cycle. The integration of log analysis enables the LLM to detect and address issues efficiently, reducing the number of iterations required to produce functional and robust services. This process automates the generation of microservices and also simplifies the debugging and refinement phases, allowing developers to focus on higher-level design and integration tasks. The system has the potential to help software developers, architects, and organizations speed up software development cycles and reduce manual effort. To assess this potential, we conducted surveys with six industry practitioners. The system demonstrated notable advantages in enhancing development speed, automating repetitive tasks, and simplifying the prototyping process. While experienced developers appreciated its efficiency for specific tasks, some expressed concerns about its limitations in handling advanced customizations and larger-scale projects. The code is publicly available at https://github.com/sirbh/code-gen

Submitted 13 February, 2025; originally announced February 2025.

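The described feedback loop (generate server code from the OpenAPI spec, run it, feed logs and errors back) can be sketched as follows; the `llm` callable and the `--self-test` harness are placeholders, not APIs from the paper:

```python
import subprocess

def refine_service(llm, openapi_spec, max_iters=5):
    """Sketch of the paper's feedback loop: generate server code from an
    OpenAPI spec, execute it, and feed execution logs back to the LLM
    until the service runs cleanly. `llm` is a placeholder callable."""
    code = llm(f"Generate a runnable server implementing this OpenAPI spec:\n{openapi_spec}")
    for _ in range(max_iters):
        with open("server.py", "w") as f:
            f.write(code)
        run = subprocess.run(["python", "server.py", "--self-test"],
                             capture_output=True, text=True, timeout=60)
        if run.returncode == 0:
            return code  # service starts and passes its checks
        # Feed stderr/logs back so the model can repair its own output.
        code = llm(f"The server failed with:\n{run.stderr}\nFix the code:\n{code}")
    raise RuntimeError("service did not converge within the iteration budget")
```
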
8. arXiv:2502.09528 [pdf, other] - cs.CV, cs.AR

SteROI-D: System Design and Mapping for Stereo Depth Inference on Regions of Interest

Authors: Jack Erhardt, Ziang Li, Reid Pinkham, Andrew Berkovich, Zhengya Zhang

Abstract: Machine learning algorithms have enabled high quality stereo depth estimation to run on Augmented and Virtual Reality (AR/VR) devices. However, high energy consumption across the full image processing stack prevents stereo depth algorithms from running effectively on battery-limited devices. This paper introduces SteROI-D, a full stereo depth system paired with a mapping methodology. SteROI-D exploits Region-of-Interest (ROI) and temporal sparsity at the system level to save energy. SteROI-D's flexible and heterogeneous compute fabric supports diverse ROIs. Importantly, we introduce a systematic mapping methodology to effectively handle dynamic ROIs, thereby maximizing energy savings. Using these techniques, our 28nm prototype SteROI-D design achieves up to a 4.35x reduction in total system energy compared to a baseline ASIC.

Submitted 13 February, 2025; originally announced February 2025.
Comments: Accepted as a full paper by the 2025 EDGE AI FOUNDATION Austin

9. arXiv:2502.09412 [pdf, other] - cs.DS, math.CO

A LP-rounding based algorithm for soft capacitated facility location problem with submodular penalties

Authors: Hanyin Xiao, Jiaming Zhang, Zhikang Zhang, Weidong Li

Abstract: The soft capacitated facility location problem (SCFLP) is a classic combinatorial optimization problem, with its variants widely applied in the fields of operations research and computer science. In the SCFLP, given a set $\mathcal{F}$ of facilities and a set $\mathcal{D}$ of clients, each facility has a capacity and an opening cost and may be opened multiple times, and each client has a demand. The problem is to find a subset of facilities in $\mathcal{F}$ and connect each client to the opened facilities, such that the total cost, including opening and connection costs, is minimized. SCFLP is NP-hard, which has led to a focus on approximation algorithms. Based on this, we consider a variant, the soft capacitated facility location problem with submodular penalties (SCFLPSP), which allows some clients not to be served by accepting a penalty cost. We further consider the integer-splittable case of demand, in which each client's demand may be served by multiple facilities, each providing an integer amount of service. Based on LP-rounding, we propose a $(\lambda R+4)$-approximation algorithm, where $R=\frac{\max_{i \in \mathcal{F}} f_i}{\min_{i \in \mathcal{F}} f_i}$ and $\lambda=\frac{R+\sqrt{R^2+8R}}{2R}$. In particular, when the opening cost is uniform, the approximation ratio is 6.

Submitted 13 February, 2025; originally announced February 2025.

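As a sanity check of the stated ratio, uniform opening costs give $R = 1$, and the formula indeed reduces to 6:

```latex
% Uniform opening costs: all f_i equal, so
\[
  R = \frac{\max_{i\in\mathcal{F}} f_i}{\min_{i\in\mathcal{F}} f_i} = 1,
  \qquad
  \lambda = \frac{R + \sqrt{R^2 + 8R}}{2R} = \frac{1 + \sqrt{9}}{2} = 2,
  \qquad
  \lambda R + 4 = 6 .
\]
```
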
10. arXiv:2502.08918 [pdf, other] - cs.LG

CLEAR: Cluster-based Prompt Learning on Heterogeneous Graphs

Authors: Feiyang Wang, Zhongbao Zhang, Junda Ye, Li Sun, Jianzhong Qi

Abstract: Prompt learning has attracted increasing attention in the graph domain as a means to bridge the gap between pretext and downstream tasks. Existing studies on heterogeneous graph prompting typically use feature prompts to modify node features for specific downstream tasks, which do not concern the structure of heterogeneous graphs. Such a design also overlooks information from the meta-paths, which are core to learning the high-order semantics of the heterogeneous graphs. To address these issues, we propose CLEAR, a Cluster-based prompt LEARNING model on heterogeneous graphs. We present cluster prompts that reformulate downstream tasks as heterogeneous graph reconstruction. In this way, we align the pretext and downstream tasks to share the same training objective. Additionally, our cluster prompts are also injected into the meta-paths such that the prompt learning process incorporates high-order semantic information entailed by the meta-paths. Extensive experiments on downstream tasks confirm the superiority of CLEAR. It consistently outperforms state-of-the-art models, achieving up to 5% improvement on the F1 metric for node classification.

Submitted 12 February, 2025; originally announced February 2025.
Comments: accepted by PAKDD 2025

Despite its importance, this topic receives limited attention, and there is a lack of comprehensive benchmarks for evaluating models&#39; ability to follow the instruction hierarchy.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08745v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08745v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08745v1-abstract-full" style="display: none;"> The instruction hierarchy, which establishes a priority order from system messages to user messages, conversation history, and tool outputs, is essential for ensuring consistent and safe behavior in language models (LMs). Despite its importance, this topic receives limited attention, and there is a lack of comprehensive benchmarks for evaluating models&#39; ability to follow the instruction hierarchy. We bridge this gap by introducing IHEval, a novel benchmark comprising 3,538 examples across nine tasks, covering cases where instructions in different priorities either align or conflict. Our evaluation of popular LMs highlights their struggle to recognize instruction priorities. All evaluated models experience a sharp performance decline when facing conflicting instructions, compared to their original instruction-following performance. Moreover, the most competitive open-source model only achieves 48% accuracy in resolving such conflicts. Our results underscore the need for targeted optimization in the future development of LMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08745v1-abstract-full').style.display = 'none'; document.getElementById('2502.08745v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08353">arXiv:2502.08353</a> <span>[<a href="https://arxiv.org/pdf/2502.08353">pdf</a>, <a href="https://arxiv.org/format/2502.08353">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Trustworthy GNNs with LLMs: A Systematic Review and Taxonomy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+R">Ruizhan Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+H">Huimin Deng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+F">Fang He</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Maojun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> With the extensive application of Graph Neural Networks (GNNs) across various domains, their trustworthiness has emerged as a focal point of research. Some existing studies have shown that the integration of large language models (LLMs) can improve the semantic understanding and generation capabilities of GNNs, which in turn improves the trustworthiness of GNNs from various aspects. Our review introduces a taxonomy that offers researchers a clear framework for comprehending the principles and applications of different methods and helps clarify the connections and differences among various approaches. Then we systematically survey representative approaches along the four categories of our taxonomy. Through our taxonomy, researchers can understand the applicable scenarios, potential advantages, and limitations of each approach for the trusted integration of GNNs with LLMs. Finally, we present some promising directions of work and future trends for the integration of LLMs and GNNs to improve model trustworthiness. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IJCAI 2025</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08234">arXiv:2502.08234</a> <span>[<a href="https://arxiv.org/pdf/2502.08234">pdf</a>, <a href="https://arxiv.org/format/2502.08234">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Human Skill Generators at Key-Step Levels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yilu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+C">Chenhui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hanlin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhaoxiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Limin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We are committed to learning human skill generators at key-step levels. The generation of skills is a challenging endeavor, but its successful implementation could greatly facilitate human skill learning and provide more experience for embodied intelligence. Although current video generation models can synthesize simple and atomic human operations, they struggle with human skills due to their complex procedural structure. Human skills involve multi-step, long-duration actions and complex scene transitions, so existing naive auto-regressive methods for synthesizing long videos cannot generate human skills. To address this, we propose a novel task, Key-step Skill Generation (KS-Gen), aimed at reducing the complexity of generating human skill videos. Given the initial state and a skill description, the task is to generate video clips of key steps to complete the skill, rather than a full-length video. To support this task, we introduce a carefully curated dataset and define multiple evaluation metrics to assess performance. Considering the complexity of KS-Gen, we propose a new framework for this task. First, a multimodal large language model (MLLM) generates descriptions for key steps using retrieval augmentation. Subsequently, we use a Key-step Image Generator (KIG) to address the discontinuity between key steps in skill videos. Finally, a video generation model uses these descriptions and key-step images to generate video clips of the key steps with high temporal consistency. We offer a detailed analysis of the results, hoping to provide more insights on human skill generation. All models and data are available at https://github.com/MCG-NJU/KS-Gen. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08161">arXiv:2502.08161</a> <span>[<a href="https://arxiv.org/pdf/2502.08161">pdf</a>, <a href="https://arxiv.org/format/2502.08161">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/ICDM54844.2022.00070">10.1109/ICDM54844.2022.00070</a></span> </div> </div> </div> <p class="title is-5 mathjax"> MixDec Sampling: A Soft Link-based Sampling Method of Graph Neural Network for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xiangjin Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruipeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+K">Kai Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zihan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hai-Tao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+B">Buyue Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hansen Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+B">Bo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuo%2C+C">Chengxiang Zhuo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Graph neural networks have been widely used in recent recommender systems, where negative sampling plays an important role. Existing negative sampling methods restrict the relationship between nodes to either hard positive pairs or hard negative pairs. This leads to the loss of structural information, and lacks a mechanism to generate positive pairs for nodes with few neighbors. To overcome these limitations, we propose a novel soft link-based sampling method, namely MixDec Sampling, which consists of a Mixup Sampling module and a Decay Sampling module. The Mixup Sampling module augments node features by synthesizing new nodes and soft links, which provides a sufficient number of samples for nodes with few neighbors. The Decay Sampling module strengthens the digestion of graph structure information by generating soft links for node embedding learning. To the best of our knowledge, we are the first to model sampling relationships between nodes by soft links in GNN-based recommender systems. Extensive experiments demonstrate that the proposed MixDec Sampling can significantly and consistently improve the recommendation performance of several representative GNN-based models on various recommendation benchmarks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li>
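<p class="is-size-7">A toy reading of the Mixup Sampling module described above: synthesize a node by interpolating two parents and attach it to each with a soft (weighted) link. The Beta mixing prior and the use of the mixing coefficient as the link weight are assumptions, since the abstract does not give the exact formulation.</p>
<pre><code class="language-python">import numpy as np

rng = np.random.default_rng(0)

def mixup_sample(z_u, z_v, alpha=1.0):
    """Toy Mixup Sampling step: build a synthetic node embedding as a convex
    combination of two parent embeddings, with soft links back to the
    parents weighted by the mixing coefficient (assumed details)."""
    lam = rng.beta(alpha, alpha)
    z_new = lam * z_u + (1.0 - lam) * z_v    # synthetic node feature
    soft_links = {"u": lam, "v": 1.0 - lam}  # soft links to the two parents
    return z_new, soft_links

# Example: synthesize one node from two random 64-d embeddings.
z_new, links = mixup_sample(rng.normal(size=64), rng.normal(size=64))
</code></pre>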
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07490">arXiv:2502.07490</a> <span>[<a href="https://arxiv.org/pdf/2502.07490">pdf</a>, <a href="https://arxiv.org/format/2502.07490">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mask-Enhanced Autoregressive Prediction: Pay Less Attention to Learn More </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+X">Xialie Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Z">Zhikai Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianjin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Z">Zheng Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shiwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large Language Models (LLMs) have been found to struggle with accurately retrieving key information. To address this, we propose Mask-Enhanced Autoregressive Prediction (MEAP), a simple yet effective training paradigm that seamlessly integrates Masked Language Modeling (MLM) into Next-Token Prediction (NTP) to enhance the latter's in-context retrieval capabilities. Specifically, MEAP first randomly masks a small fraction of input tokens and then directly performs standard autoregressive next-token prediction using a decoder-only Transformer. MEAP eliminates the need for bidirectional attention or encoder-decoder architectures for MLM, incurring no additional computational overhead during pre-training or inference. Extensive experiments demonstrate that MEAP substantially outperforms NTP on key information retrieval and long-context reasoning tasks, while performing on par or better on commonsense reasoning tasks. The benefits of MEAP also extend to supervised fine-tuning, where it shows remarkable advantages in lost-in-the-middle scenarios, outperforming NTP by 11.77 percentage points. Our analysis indicates that MEAP's effectiveness arises from its ability to promote more distinguishable attention scores by concentrating on a reduced set of non-masked tokens. This mechanism improves the model's focus on task-relevant signals while mitigating the influence of peripheral context. These findings position MEAP as a promising training paradigm for large language models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 7 figures</span> </p> </li>
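<p class="is-size-7">The training recipe in the abstract is compact enough to sketch: mask a few input tokens, keep the next-token targets unchanged, and train a decoder-only model as usual. A minimal sketch, assuming a 15% mask ratio (the paper only says "a small fraction"):</p>
<pre><code class="language-python">import torch

def meap_inputs(input_ids: torch.Tensor, mask_id: int, mask_ratio: float = 0.15):
    """Data preparation in the spirit of MEAP: randomly mask a small fraction
    of the input tokens, then train with the standard autoregressive
    next-token objective on the original sequence, using a plain
    decoder-only Transformer (no bidirectional attention, no
    encoder-decoder). Ratio and loss details here are assumptions."""
    corrupted = input_ids.clone()
    mask = torch.rand(input_ids.shape).lt(mask_ratio)  # Bernoulli(mask_ratio)
    corrupted[mask] = mask_id
    inputs = corrupted[:, :-1]   # model reads the partially masked context
    targets = input_ids[:, 1:]   # targets stay the original next tokens
    return inputs, targets

# Usage with any decoder-only LM producing [batch, seq-1, vocab] logits:
#   logits = model(inputs)
#   loss = torch.nn.functional.cross_entropy(
#       logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
</code></pre>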
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06876">arXiv:2502.06876</a> <span>[<a href="https://arxiv.org/pdf/2502.06876">pdf</a>, <a href="https://arxiv.org/format/2502.06876">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mix Data or Merge Models? Balancing the Helpfulness, Honesty, and Harmlessness of Large Language Model via Model Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinluan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+D">Dingnan Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+A">Anke Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+D">Didi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhengyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Daixin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Q">Qing Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+K">Kun Kuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Achieving balanced alignment of large language models (LLMs) in terms of Helpfulness, Honesty, and Harmlessness (3H optimization) constitutes a cornerstone of responsible AI, with existing methods like data mixture strategies facing limitations including reliance on expert knowledge and conflicting optimization signals. While model merging offers a promising alternative by integrating specialized models, its potential for 3H optimization remains underexplored. This paper establishes the first comprehensive benchmark for model merging in 3H-aligned LLMs, systematically evaluating 15 methods (12 training-free merging and 3 data mixture techniques) across 10 datasets associated with 5 annotation dimensions, 2 LLM families, and 2 training paradigms. Our analysis reveals three pivotal insights: (i) previously overlooked collaborative/conflicting relationships among 3H dimensions, (ii) the consistent superiority of model merging over data mixture approaches in balancing alignment trade-offs, and (iii) the critical role of parameter-level conflict resolution through redundant component pruning and outlier mitigation. Building on these findings, we propose R-TSVM, a Reweighting-enhanced Task Singular Vector Merging method that incorporates outlier-aware parameter weighting and sparsity-adaptive rank selection strategies adapted to the heavy-tailed parameter distribution and sparsity of LLMs, further improving LLM alignment across multiple evaluations. We release our trained models for further exploration. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
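<p class="is-size-7">For orientation, a minimal training-free merging baseline of the kind this benchmark evaluates is sketched below (task-arithmetic-style weighted merging of 3H experts). R-TSVM's outlier-aware reweighting and sparsity-adaptive rank selection are not implemented here; this only illustrates the parameter-space view of 3H merging, and the model names are hypothetical.</p>
<pre><code class="language-python">import torch

def merge_task_vectors(base_sd, expert_sds, weights):
    """Weighted task-vector merging: add a weighted sum of (expert - base)
    parameter deltas for helpfulness-, honesty-, and harmlessness-tuned
    experts onto the base model's parameters."""
    merged = {}
    for name, base in base_sd.items():
        delta = sum(w * (sd[name] - base) for w, sd in zip(weights, expert_sds))
        merged[name] = base + delta
    return merged

# Hypothetical usage with three 3H experts sharing the base architecture:
#   merged = merge_task_vectors(
#       base.state_dict(),
#       [helpful.state_dict(), honest.state_dict(), harmless.state_dict()],
#       weights=(0.4, 0.3, 0.3))
</code></pre>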
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06852">arXiv:2502.06852</a> <span>[<a href="https://arxiv.org/pdf/2502.06852">pdf</a>, <a href="https://arxiv.org/format/2502.06852">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EAP-GP: Mitigating Saturation Effect in Gradient-based Automated Circuit Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+W">Wenshuo Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuoran Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Lijie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Ninghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+P">Pan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Di Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Understanding the internal mechanisms of transformer-based language models remains challenging. Mechanistic interpretability based on circuit discovery aims to reverse engineer neural networks by analyzing their internal processes at the level of computational subgraphs. In this paper, we revisit existing gradient-based circuit identification methods and find that their performance is affected by either the zero-gradient problem or saturation effects, where edge attribution scores become insensitive to input changes, resulting in noisy and unreliable attribution evaluations for circuit components. To address the saturation effect, we propose Edge Attribution Patching with GradPath (EAP-GP). EAP-GP introduces an integration path that starts from the input and adaptively follows the direction of the difference between the gradients of corrupted and clean inputs, thereby avoiding saturated regions. This approach enhances attribution reliability and improves the faithfulness of circuit identification. We evaluate EAP-GP on 6 datasets using GPT-2 Small, GPT-2 Medium, and GPT-2 XL. Experimental results demonstrate that EAP-GP outperforms existing methods in circuit faithfulness, achieving improvements of up to 17.7%. Comparisons with manually annotated ground-truth circuits demonstrate that EAP-GP achieves precision and recall comparable to or better than previous approaches, highlighting its effectiveness in identifying accurate circuits. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
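<p class="is-size-7">The GradPath idea can be pictured as integrated-gradients-style accumulation along an adaptive, non-straight path. The sketch below is a loose reading of the abstract; the step length, normalization, and stopping rule are all assumptions, and real EAP works on edge activations rather than raw inputs.</p>
<pre><code class="language-python">import torch

def gradpath_attribution(model, x_clean, x_corrupt, steps=32):
    """Accumulate grad-times-step contributions from the corrupted input
    toward the clean input, with each step following the direction of the
    difference between the clean-point gradient and the current gradient
    (so the path can steer around saturated regions)."""
    xc = x_clean.clone().requires_grad_(True)
    g_clean = torch.autograd.grad(model(xc).sum(), xc)[0].detach()
    x = x_corrupt.clone().requires_grad_(True)
    total = torch.zeros_like(x_corrupt)
    seg = (x_clean - x_corrupt).norm() / steps      # fixed step length (assumed)
    for _ in range(steps):
        g_here = torch.autograd.grad(model(x).sum(), x)[0]
        direction = g_clean - g_here.detach()        # adaptive direction
        direction = direction / (direction.norm() + 1e-8)
        step = seg * direction
        total = total + g_here.detach() * step       # accumulate grad * dx
        x = (x + step).detach().requires_grad_(True)
    return total
</code></pre>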
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06827">arXiv:2502.06827</a> <span>[<a href="https://arxiv.org/pdf/2502.06827">pdf</a>, <a href="https://arxiv.org/format/2502.06827">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/TNNLS.2022.3202842">10.1109/TNNLS.2022.3202842</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning to Synthesize Compatible Fashion Items Using Semantic Alignment and Collocation Classification: An Outfit Generation Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Dongliang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haijun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+K">Kai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Linlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Han Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xiaofei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shuicheng Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The field of fashion compatibility learning has attracted great attention from both the academic and industrial communities in recent years. Many studies have been carried out for fashion compatibility prediction, collocated outfit recommendation, artificial intelligence (AI)-enabled compatible fashion design, and related topics. In particular, AI-enabled compatible fashion design can be used to synthesize compatible fashion items or outfits in order to improve the design experience for designers or the efficacy of recommendations for customers. However, previous generative models for collocated fashion synthesis have generally focused on the image-to-image translation between fashion items of upper and lower clothing. In this paper, we propose a novel outfit generation framework, i.e., OutfitGAN, with the aim of synthesizing a set of complementary items to compose an entire outfit, given one extant fashion item and reference masks of target synthesized items. OutfitGAN includes a semantic alignment module, which is responsible for characterizing the mapping correspondence between the existing fashion items and the synthesized ones, to improve the quality of the synthesized images, and a collocation classification module, which is used to improve the compatibility of a synthesized outfit. In order to evaluate the performance of our proposed models, we built a large-scale dataset consisting of 20,000 fashion outfits. Extensive experimental results on this dataset show that our OutfitGAN can synthesize photo-realistic outfits and outperform state-of-the-art methods in terms of similarity, authenticity and compatibility measurements. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TNNLS</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06823">arXiv:2502.06823</a> <span>[<a href="https://arxiv.org/pdf/2502.06823">pdf</a>, <a href="https://arxiv.org/format/2502.06823">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> CTR-Driven Advertising Image Generation with Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingye Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+W">Wei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Z">Zhenbang Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weizhen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanyin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haohan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Linkai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jinyuan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+J">Jingjing Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Junjie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhangang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+J">Jingping Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Y">Yuanjie Shao</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+X">Xinge You</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Changxin Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Sang%2C+N">Nong Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In web data, advertising images are crucial for capturing user attention and improving advertising effectiveness. Most existing methods for generating product backgrounds focus primarily on aesthetic quality and may fail to achieve satisfactory online performance. To address this limitation, we explore the use of Multimodal Large Language Models (MLLMs) for generating advertising images by optimizing for Click-Through Rate (CTR) as the primary objective. Firstly, we build targeted pre-training tasks and leverage a large-scale e-commerce multimodal dataset to equip MLLMs with initial capabilities for advertising image generation tasks. To further improve the CTR of generated images, we propose a novel reward model to fine-tune pre-trained MLLMs through Reinforcement Learning (RL), which can jointly utilize multimodal features and accurately reflect user click preferences. Meanwhile, a product-centric preference optimization strategy is developed to ensure that the generated background content aligns with the product characteristics after fine-tuning, enhancing the overall relevance and effectiveness of the advertising images. Extensive experiments have demonstrated that our method achieves state-of-the-art performance in both online and offline metrics. Our code and pre-trained models are publicly available at: https://github.com/Chenguoz/CAIG. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WWW 2025</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06719">arXiv:2502.06719</a> <span>[<a href="https://arxiv.org/pdf/2502.06719">pdf</a>, <a href="https://arxiv.org/ps/2502.06719">ps</a>, <a href="https://arxiv.org/format/2502.06719">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> Gaussian Approximation and Multiplier Bootstrap for Stochastic Gradient Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sheshukova%2C+M">Marina Sheshukova</a>, <a href="/search/cs?searchtype=author&amp;query=Samsonov%2C+S">Sergey Samsonov</a>, <a href="/search/cs?searchtype=author&amp;query=Belomestny%2C+D">Denis Belomestny</a>, <a href="/search/cs?searchtype=author&amp;query=Moulines%2C+E">Eric Moulines</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Q">Qi-Man Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuo-Song Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Naumov%2C+A">Alexey Naumov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we establish non-asymptotic convergence rates in the central limit theorem for Polyak-Ruppert-averaged iterates of stochastic gradient descent (SGD). Our analysis builds on the result of the Gaussian approximation for nonlinear statistics of independent random variables of Shao and Zhang (2022). Using this result, we prove the non-asymptotic validity of the multiplier bootstrap for constructing the confidence sets for the optimal solution of an optimization problem. In particular, our approach avoids the need to approximate the limiting covariance of Polyak-Ruppert SGD iterates, which allows us to derive approximation rates in convex distance of order up to $1/\sqrt{n}$. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 60F05; 62L20; 93E35 </p> </li>
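<p class="is-size-7">The multiplier bootstrap itself is easy to picture: rerun averaged SGD with i.i.d. positive random weights on each stochastic gradient and read confidence bounds off the replicates. A toy sketch on a one-dimensional quadratic objective; the model, the step-size schedule, and the Exp(1) multiplier weights are illustrative assumptions, not the paper's exact setting.</p>
<pre><code class="language-python">import numpy as np

rng = np.random.default_rng(1)

def sgd_polyak(data, x0=0.0, weights=None):
    """Polyak-Ruppert-averaged SGD on f(x) = E[(x - Z)^2] / 2, whose optimum
    is E[Z]. If `weights` is given, each stochastic gradient is multiplied
    by a positive i.i.d. weight: the multiplier-bootstrap perturbation."""
    x, avg = x0, 0.0
    for t, z in enumerate(data, start=1):
        w = 1.0 if weights is None else weights[t - 1]
        x -= (0.5 / t**0.6) * w * (x - z)   # SGD step, decaying step size
        avg += (x - avg) / t                # running Polyak-Ruppert average
    return avg

n, B = 2000, 500
data = rng.normal(loc=1.0, size=n)          # true optimum is x* = 1
estimate = sgd_polyak(data)
boot = np.array([sgd_polyak(data, weights=rng.exponential(size=n))
                 for _ in range(B)])
lo, hi = np.quantile(boot, [0.025, 0.975])  # toy bootstrap confidence set
</code></pre>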
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06635">arXiv:2502.06635</a> <span>[<a href="https://arxiv.org/pdf/2502.06635">pdf</a>, <a href="https://arxiv.org/format/2502.06635">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Steel-LLM: From Scratch to Open Source -- A Personal Journey in Building a Chinese-Centric LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Q">Qingshui Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+T">Tianyu Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhaoxiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Steel-LLM is a Chinese-centric language model developed from scratch with the goal of creating a high-quality, open-source model despite limited computational resources. Launched in March 2024, the project aimed to train a 1-billion-parameter model on a large-scale dataset, prioritizing transparency and the sharing of practical insights to assist others in the community. The training process primarily focused on Chinese data, with a small proportion of English data included, addressing gaps in existing open-source LLMs by providing a more detailed and practical account of the model-building journey. Steel-LLM has demonstrated competitive performance on benchmarks such as CEVAL and CMMLU, outperforming early models from larger institutions. This paper provides a comprehensive summary of the project's key contributions, including data collection, model design, training methodologies, and the challenges encountered along the way, offering a valuable resource for researchers and practitioners looking to develop their own LLMs. The model checkpoints and training script are available at https://github.com/zhanshijinwat/Steel-LLM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06586">arXiv:2502.06586</a> <span>[<a href="https://arxiv.org/pdf/2502.06586">pdf</a>, <a href="https://arxiv.org/ps/2502.06586">ps</a>, <a href="https://arxiv.org/format/2502.06586">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> Decay of correlation for edge colorings when $q&gt;3\Delta$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zejia Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yulin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chihao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zihan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We examine various perspectives on the decay of correlation for the uniform distribution over proper $q$-edge colorings of graphs with maximum degree $\Delta$. First, we establish the coupling independence property when $q\ge 3\Delta$ for general graphs. Together with the work of Chen et al. (2024), this result implies a fully polynomial-time approximation scheme (FPTAS) for counting the number of proper $q$-edge colorings. Next, we prove the strong spatial mixing property on trees, provided that $q&gt; (3+o(1))\Delta$. The strong spatial mixing property is derived from the spectral independence property of a version of the weighted edge coloring distribution, which is established using the matrix trickle-down method developed in Abdolazimi, Liu and Oveis Gharan (FOCS, 2021) and Wang, Zhang and Zhang (STOC, 2024). Finally, we show that the weak spatial mixing property holds on trees with maximum degree $\Delta$ if and only if $q\ge 2\Delta-1$. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
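<p class="is-size-7">Collected from the abstract, the three regimes read side by side (with $q$ colors and maximum degree $\Delta$): \[ \begin{aligned} q \ge 3\Delta &amp;\implies \text{coupling independence on general graphs (with Chen et al. 2024, an FPTAS)} \\ q &gt; (3+o(1))\Delta &amp;\implies \text{strong spatial mixing on trees} \\ q \ge 2\Delta - 1 &amp;\iff \text{weak spatial mixing on trees} \end{aligned} \]</p>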
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06494">arXiv:2502.06494</a> <span>[<a href="https://arxiv.org/pdf/2502.06494">pdf</a>, <a href="https://arxiv.org/format/2502.06494">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GuideLLM: Exploring LLM-Guided Conversation with Applications in Autobiography Interviewing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+J">Jinhao Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xinyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuoxuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ko%2C+E">Eunhye Ko</a>, <a href="/search/cs?searchtype=author&amp;query=Boddy%2C+L">Lily Boddy</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Rasgon%2C+A">Alexander Rasgon</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+J">Junyuan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+M+K">Min Kyung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chenxi Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+Q">Qi Long</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Ying Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianlong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kaidi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Although Large Language Models (LLMs) succeed in human-guided conversations such as instruction following and question answering, the potential of LLM-guided conversations, where LLMs direct the discourse and steer the conversation's objectives, remains under-explored. In this study, we first characterize LLM-guided conversation in terms of three fundamental components: (i) Goal Navigation; (ii) Context Management; (iii) Empathetic Engagement, and propose GuideLLM as an instantiation. We then implement an interviewing environment for the evaluation of LLM-guided conversation. Specifically, various topics are involved in this environment for comprehensive interviewing evaluation, resulting in around 1.4k turns of utterances, 184k tokens, and over 200 events mentioned during the interviewing for each chatbot evaluation. We compare GuideLLM with 6 state-of-the-art LLMs such as GPT-4o and Llama-3-70b-Instruct, from the perspectives of interviewing quality and autobiography generation quality. For automatic evaluation, we derive user proxies from multiple autobiographies and employ LLM-as-a-judge to score LLM behaviors. We further conduct a human-involved experiment by employing 45 human participants to chat with GuideLLM and baselines. We then collect human feedback, preferences, and ratings regarding the qualities of conversation and autobiography. Experimental results indicate that GuideLLM significantly outperforms baseline LLMs in automatic evaluation and achieves consistently leading performance in human ratings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages; the first three authors contributed equally</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06280">arXiv:2502.06280</a> <span>[<a href="https://arxiv.org/pdf/2502.06280">pdf</a>, <a href="https://arxiv.org/format/2502.06280">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> IceBerg: Debiased Self-Training for Class-Imbalanced Node Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhixun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dingshuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+T">Tong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Daixin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongrui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J+X">Jeffrey Xu Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Graph Neural Networks (GNNs) have achieved great success in dealing with non-Euclidean graph-structured data and have been widely deployed in many real-world applications. However, their effectiveness is often jeopardized under class-imbalanced training sets. Most existing studies have analyzed class-imbalanced node classification from a supervised learning perspective, but they do not fully utilize the large number of unlabeled nodes in semi-supervised scenarios. We claim that the supervised signal is just the tip of the iceberg and a large number of unlabeled nodes have not yet been effectively utilized. In this work, we propose IceBerg, a debiased self-training framework to address the class-imbalanced and few-shot challenges for GNNs at the same time. Specifically, to mitigate the Matthew effect and label distribution shift in self-training, we propose Double Balancing, which, as a simple plug-and-play module of just a few lines of code, can largely improve the performance of existing baselines. Secondly, to enhance the long-range propagation capability of GNNs, we disentangle the propagation and transformation operations of GNNs. Therefore, the weak supervision signals can propagate more effectively to address the few-shot issue. In summary, we find that leveraging unlabeled nodes can significantly enhance the performance of GNNs in class-imbalanced and few-shot scenarios, and even small, surgical modifications can lead to substantial performance improvements. Systematic experiments on benchmark datasets show that our method can deliver considerable performance gains over existing class-imbalanced node classification baselines. Additionally, due to IceBerg's outstanding ability to leverage unsupervised signals, it also achieves state-of-the-art results in few-shot node classification scenarios. The code of IceBerg is available at: https://github.com/ZhixunLEE/IceBerg. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TheWebConf (WWW) 2025</span> </p> </li>
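<p class="is-size-7">A minimal sketch of the kind of class-balanced pseudo-label selection that Double Balancing gestures at: keep the most confident unlabeled nodes per predicted class rather than globally. IceBerg's actual module may differ; see the repository linked above.</p>
<pre><code class="language-python">import numpy as np

def balanced_pseudo_labels(pred_class, confidence, num_classes, per_class):
    """Class-balanced pseudo-label selection: taking the globally most
    confident unlabeled nodes lets head classes snowball (the Matthew
    effect), so instead keep the top-`per_class` most confident nodes
    within each predicted class."""
    keep = []
    for c in range(num_classes):
        idx = np.where(pred_class == c)[0]               # nodes predicted c
        idx = idx[np.argsort(-confidence[idx])][:per_class]  # most confident
        keep.extend(idx.tolist())
    return np.asarray(keep, dtype=int)
</code></pre>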
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06257">arXiv:2502.06257</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06257">pdf</a>, <a href="https://arxiv.org/format/2502.06257">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> K-ON: Stacking Knowledge On the Head Layer of Large Language Model </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Lingbing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+Z">Zhongpu Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huajun Chen</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">Recent advancements in large language models (LLMs) have significantly improved various natural language processing (NLP) tasks. Typically, LLMs are trained to predict the next token, aligning well with many NLP tasks. However, in knowledge graph (KG) scenarios, entities are the fundamental units and identifying an entity requires at least several tokens. This leads to a granularity mismatch between KGs and natural languages. To address this issue, we propose K-ON, which integrates KG knowledge into the LLM by employing multiple head layers for next k-step prediction. K-ON not only generates entity-level results in one step, but also enables contrastive loss against entities, which is the most powerful tool in KG representation learning. Experimental results show that K-ON outperforms state-of-the-art methods that incorporate text and even other modalities. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025 (Oral)</span> </p>
</li>
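<p class="is-size-7">A schematic of the multi-head next-k prediction idea in the K-ON abstract: k extra head layers score the next k tokens from a single hidden state, so a multi-token entity receives a one-step score usable in an entity-level contrastive loss. The class and argument names here are hypothetical, not the paper&#39;s API.</p>
<pre><code>import torch
import torch.nn as nn

class KStepHeads(nn.Module):
    # k output heads over one hidden state; head i predicts token i of the
    # next k tokens, so an entity spanning k tokens is scored in one step.
    def __init__(self, hidden_dim, vocab_size, k):
        super().__init__()
        self.heads = nn.ModuleList(
            [nn.Linear(hidden_dim, vocab_size) for _ in range(k)]
        )

    def entity_logprob(self, h, entity_tokens):
        # h: (batch, hidden_dim); entity_tokens: (batch, k) token ids.
        score = 0.0
        for i, head in enumerate(self.heads):
            logp = head(h).log_softmax(dim=-1)          # (batch, vocab)
            score = score + logp.gather(1, entity_tokens[:, i:i+1]).squeeze(1)
        return score  # (batch,) entity-level score for a contrastive loss
</code></pre>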
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06171">arXiv:2502.06171</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06171">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> A Data-Efficient Pan-Tumor Foundation Model for Oncology CT Interpretation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lei%2C+W">Wenhui Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hanyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zitian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Q">Qiong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yannian Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yankai Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Ci Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guangtao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tongjia Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaofan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rajpurkar%2C+P">Pranav Rajpurkar</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shaoting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhenning Wang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">Artificial intelligence-assisted imaging analysis has made substantial strides in tumor diagnosis and management. Here we present PASTA, a pan-tumor CT foundation model that achieves state-of-the-art performance on 45 of 46 representative oncology tasks -- including lesion segmentation, tumor detection in plain CT, tumor staging, survival prediction, structured report generation, and cross-modality transfer learning, significantly outperforming the second-best models on 35 tasks. This remarkable advancement is driven by our development of PASTA-Gen, an innovative synthetic tumor generation framework that produces a comprehensive dataset of 30,000 CT scans with pixel-level annotated lesions and paired structured reports, encompassing malignancies across ten organs and five benign lesion types. By leveraging this rich, high-quality synthetic data, we overcome a longstanding bottleneck in the development of CT foundation models -- specifically, the scarcity of publicly available, high-quality annotated datasets due to privacy constraints and the substantial labor required for scaling precise data annotation. Encouragingly, PASTA demonstrates exceptional data efficiency with promising practical value, markedly improving performance on various tasks with only a small amount of real-world data. The open release of both the synthetic dataset and PASTA foundation model effectively addresses the challenge of data scarcity, thereby advancing oncological research and clinical translation. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">57 pages, 7 figures</span> </p>
</li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">57 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05947">arXiv:2502.05947</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05947">pdf</a>, <a href="https://arxiv.org/format/2502.05947">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Acceleration Multiple Heads Decoding for LLM via Dynamic Tree Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhendong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05947v1-abstract-short" style="display: inline;"> Multiple heads decoding accelerates the inference of Large Language Models (LLMs) by predicting next several tokens simultaneously. It generates and verifies multiple candidate sequences in parallel via tree attention with a fixed structure. In this paper, we replace the fixed tree attention with dynamic tree attention on multiple head decoding, specifically in the context of MEDUSA. We propose a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05947v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05947v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05947v1-abstract-full" style="display: none;"> Multiple heads decoding accelerates the inference of Large Language Models (LLMs) by predicting next several tokens simultaneously. It generates and verifies multiple candidate sequences in parallel via tree attention with a fixed structure. In this paper, we replace the fixed tree attention with dynamic tree attention on multiple head decoding, specifically in the context of MEDUSA. We propose a simple and low complexity strategy to generate candidates and construct the dynamic tree structure. Preliminary experiments show that the proposed method improves the decoding efficiency of multiple head decoding for LLMs while maintaining the generation quality. This result demonstrates the potential for improvement of multiple head decoding in candidate generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05947v1-abstract-full').style.display = 'none'; document.getElementById('2502.05947v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05924">arXiv:2502.05924</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05924">pdf</a>, <a href="https://arxiv.org/format/2502.05924">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div>
<p class="title is-5 mathjax"> Multi-Branch Collaborative Learning Network for Video Quality Assessment in Industrial Video Search </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+H">Hengzhu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zefeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiping Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xing Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+L">Li Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+S">Suqi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Dawei Yin</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">Video Quality Assessment (VQA) is vital for large-scale video retrieval systems, aimed at identifying quality issues to prioritize high-quality videos. In industrial systems, low-quality video characteristics fall into four categories: visual-related issues like mosaics and black boxes, textual issues from video titles and OCR content, and semantic issues like frame incoherence and frame-text mismatch from AI-generated videos. Despite their prevalence in industrial settings, these low-quality videos have been largely overlooked in academic research, posing a challenge for accurate identification. To address this, we introduce the Multi-Branch Collaborative Network (MBCN) tailored for industrial video retrieval systems. MBCN features four branches, each designed to tackle one of the aforementioned quality issues. After each branch independently scores videos, we aggregate these scores using a weighted approach and a squeeze-and-excitation mechanism to dynamically address quality issues across different scenarios. We implement point-wise and pair-wise optimization objectives to ensure score stability and reasonableness. Extensive offline and online experiments on a world-level video search engine demonstrate MBCN&#39;s effectiveness in identifying video quality issues, significantly enhancing the retrieval system&#39;s ranking performance. Detailed experimental analyses confirm the positive contribution of all four evaluation branches. Furthermore, MBCN significantly improves recognition accuracy for low-quality AI-generated videos compared to the baseline. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">KDD 2025 ADS</span> </p>
</li>
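<p class="is-size-7">A compact sketch of the score-aggregation step the MBCN abstract describes: four per-branch quality scores fused by a squeeze-and-excitation style gate. The layer sizes are illustrative, and the real model operates on richer branch features than scalar scores.</p>
<pre><code>import torch
import torch.nn as nn

class BranchAggregator(nn.Module):
    # Fuse per-branch quality scores (e.g. visual, title, OCR, semantic) with
    # a squeeze-and-excitation style gate that re-weights branches per video.
    def __init__(self, n_branches=4):
        super().__init__()
        self.gate = nn.Sequential(
            nn.Linear(n_branches, n_branches * 2),
            nn.ReLU(),
            nn.Linear(n_branches * 2, n_branches),
            nn.Sigmoid(),
        )

    def forward(self, scores):                 # scores: (batch, n_branches)
        return (self.gate(scores) * scores).sum(dim=-1)

scores = torch.rand(8, 4)                      # 8 videos, 4 branch scores
quality = BranchAggregator()(scores)           # (8,) fused quality scores
</code></pre>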
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05743">arXiv:2502.05743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05743">pdf</a>, <a href="https://arxiv.org/format/2502.05743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Understanding Representation Dynamics of Diffusion Models via Low-Dimensional Modeling </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zekai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Siyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhihui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Q">Qing Qu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">This work addresses the critical question of why and when diffusion models, despite being designed for generative tasks, can excel at learning high-quality representations in a self-supervised manner. To address this, we develop a mathematical framework based on a low-dimensional data model and posterior estimation, revealing a fundamental trade-off between generation and representation quality near the final stage of image generation. Our analysis explains the unimodal representation dynamics across noise scales, mainly driven by the interplay between data denoising and class specification. Building on these insights, we propose an ensemble method that aggregates features across noise levels, significantly improving both clean performance and robustness under label noise. Extensive experiments on both synthetic and real-world datasets validate our findings. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">First two authors contributed equally</span> </p>
</li>
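<p class="is-size-7">A sketch of the noise-level feature ensemble proposed above, assuming a <code>feature_fn(x, scale)</code> that returns the diffusion model&#39;s intermediate features for an input noised at a given scale; the scales and the plain averaging are illustrative choices.</p>
<pre><code>import numpy as np

def ensemble_features(feature_fn, x, scales=(0.1, 0.3, 0.5, 0.7), seed=0):
    # Average features extracted at several noise scales instead of picking
    # the single scale where representation quality happens to peak.
    rng = np.random.default_rng(seed)
    feats = [feature_fn(x + s * rng.standard_normal(x.shape), s) for s in scales]
    return np.mean(feats, axis=0)
</code></pre>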
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05534">arXiv:2502.05534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05534">pdf</a>, <a href="https://arxiv.org/format/2502.05534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Fg-T2M++: LLMs-Augmented Fine-Grained Text Driven Human Motion Generation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiapeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Leng%2C+Z">Zhiying Leng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F+W+B">Frederick W. B. Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziyao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaohui Liang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">We address the challenging problem of fine-grained text-driven human motion generation. Existing works generate imprecise motions that fail to accurately capture relationships specified in text due to: (1) the lack of effective text parsing for detailed semantic cues regarding body parts, and (2) not fully modeling linguistic structures between words to comprehend the text comprehensively. To tackle these limitations, we propose a novel fine-grained framework, Fg-T2M++, that consists of: (1) an LLM semantic parsing module to extract body part descriptions and semantics from text, (2) a hyperbolic text representation module to encode relational information between text units by embedding the syntactic dependency graph into hyperbolic space, and (3) a multi-modal fusion module to hierarchically fuse text and motion features. Extensive experiments on HumanML3D and KIT-ML datasets demonstrate that Fg-T2M++ outperforms SOTA methods, validating its ability to accurately generate motions adhering to comprehensive text semantics. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05472">arXiv:2502.05472</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05472">pdf</a>, <a href="https://arxiv.org/format/2502.05472">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696410.3714915">10.1145/3696410.3714915 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div>
<p class="title is-5 mathjax"> Robust Deep Signed Graph Clustering via Weak Balance Theory </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P">Peiyao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mingzhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xueying Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+L">Lejian Liao</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">Signed graph clustering is a critical technique for discovering community structures in graphs that exhibit both positive and negative relationships. We have identified two significant challenges in this domain: i) existing signed spectral methods are highly vulnerable to noise, which is prevalent in real-world scenarios; ii) the guiding principle ``an enemy of my enemy is my friend&#39;&#39;, rooted in \textit{Social Balance Theory}, often narrows or disrupts cluster boundaries in mainstream signed graph neural networks. Addressing these challenges, we propose the \underline{D}eep \underline{S}igned \underline{G}raph \underline{C}lustering framework (DSGC), which leverages \textit{Weak Balance Theory} to enhance preprocessing and encoding for robust representation learning. First, DSGC introduces Violation Sign-Refine to denoise the signed network by correcting noisy edges with high-order neighbor information. Subsequently, Density-based Augmentation enhances semantic structures by adding positive edges within clusters and negative edges across clusters, following \textit{Weak Balance} principles. The framework then utilizes \textit{Weak Balance} principles to develop clustering-oriented signed neural networks to broaden cluster boundaries by emphasizing distinctions between negatively linked nodes. Finally, DSGC optimizes clustering assignments by minimizing a regularized clustering loss. Comprehensive experiments on synthetic and real-world datasets demonstrate DSGC consistently outperforms all baselines, establishing a new benchmark in signed graph clustering. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by WWW25 conference</span> </p>
</li>
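<p class="is-size-7">An illustrative rendering of the Density-based Augmentation step above: under Weak Balance, extra positive edges go inside tentative clusters and extra negative edges go across them. The sampling scheme and argument names are assumptions, not the paper&#39;s procedure.</p>
<pre><code>import numpy as np

def weak_balance_augment(adj, clusters, n_add=100, seed=0):
    # adj: dense signed adjacency matrix with entries in {-1, 0, +1};
    # clusters: tentative cluster id per node.
    rng = np.random.default_rng(seed)
    adj = adj.copy()
    n = adj.shape[0]
    for _ in range(n_add):
        i, j = rng.integers(0, n, size=2)
        if i == j or adj[i, j] != 0:
            continue                                    # keep existing edges
        sign = 1 if clusters[i] == clusters[j] else -1  # +1 intra, -1 inter
        adj[i, j] = adj[j, i] = sign
    return adj
</code></pre>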
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05414">arXiv:2502.05414</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05414">pdf</a>, <a href="https://arxiv.org/format/2502.05414">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> Graph-based Molecular In-context Learning Grounded on Morgan Fingerprints </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Al-Lawati%2C+A">Ali Al-Lawati</a>, <a href="/search/cs?searchtype=author&amp;query=Lucas%2C+J">Jason Lucas</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+P">Prasenjit Mitra</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Suhang Wang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">In-context learning (ICL) effectively conditions large language models (LLMs) for molecular tasks, such as property prediction and molecule captioning, by embedding carefully selected demonstration examples into the input prompt. This approach avoids the computational overhead of extensive pretraining and fine-tuning. However, current prompt retrieval methods for molecular tasks have relied on molecule feature similarity, such as Morgan fingerprints, which do not adequately capture the global molecular and atom-binding relationships. As a result, these methods fail to represent the full complexity of molecular structures during inference. Moreover, small-to-medium-sized LLMs, which offer simpler deployment requirements in specialized systems, have remained largely unexplored in the molecular ICL literature. To address these gaps, we propose a self-supervised learning technique, GAMIC (Graph-Aligned Molecular In-Context learning), which aligns global molecular structures, represented by graph neural networks (GNNs), with textual captions (descriptions), while leveraging local feature similarity through Morgan fingerprints. In addition, we introduce a Maximum Marginal Relevance (MMR) based diversity heuristic during retrieval to optimize input prompt demonstration samples. Our experimental findings using diverse benchmark datasets show GAMIC outperforms simple Morgan-based ICL retrieval methods across all tasks by up to 45%. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
</li>
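<p class="is-size-7">A standard Maximum Marginal Relevance loop of the kind the GAMIC abstract mentions for picking diverse ICL demonstrations, assuming unit-normalized embedding rows in <code>cand_vecs</code>; <code>lam</code> trades relevance against redundancy. The function and argument names are placeholders.</p>
<pre><code>import numpy as np

def mmr_select(query_vec, cand_vecs, k=8, lam=0.7):
    # Greedily pick demonstrations that are similar to the query molecule
    # but dissimilar to demonstrations that were already chosen.
    sim_q = cand_vecs @ query_vec
    chosen, remaining = [], list(range(len(cand_vecs)))
    while remaining and len(chosen) != k:
        def mmr(i):
            red = max((cand_vecs[i] @ cand_vecs[j] for j in chosen), default=0.0)
            return lam * sim_q[i] - (1.0 - lam) * red
        best = max(remaining, key=mmr)
        chosen.append(best)
        remaining.remove(best)
    return chosen  # indices of the selected demonstration examples
</code></pre>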
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05210">arXiv:2502.05210</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05210">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Finance">q-fin.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> Regression and Forecasting of U.S. Stock Returns Based on LSTM </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shicheng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zizhou Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Y">Yuchen Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+C+H">Chia Hong Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Q">Qinyan Shen</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">This paper analyses the investment returns of three stock sectors, Manuf, Hitec, and Other, in the U.S. stock market, based on the Fama-French three-factor model, the Carhart four-factor model, and the Fama-French five-factor model, in order to test the validity of these three models for the three sectors of the market. The LSTM model is also used to explore additional factors affecting stock returns. The empirical results show that the Fama-French five-factor model has better validity for the three market segments under study, and that the LSTM model is able to capture the factors affecting the returns of certain industries and can better regress and predict the stock returns of the relevant industries. Keywords: Fama-French model; Carhart model; Factor model; LSTM model. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p>
</li>
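<p class="is-size-7">A minimal PyTorch version of the kind of LSTM regression the paper runs: a window of factor observations in, a next-period sector return out. The factor count, window length, and hidden size here are placeholders rather than the paper&#39;s settings.</p>
<pre><code>import torch
import torch.nn as nn

class ReturnLSTM(nn.Module):
    # Map a window of factor values (e.g. market, SMB, HML, RMW, CMA)
    # to the next-period return of a sector portfolio.
    def __init__(self, n_factors=5, hidden=32):
        super().__init__()
        self.lstm = nn.LSTM(n_factors, hidden, batch_first=True)
        self.out = nn.Linear(hidden, 1)

    def forward(self, x):                  # x: (batch, window, n_factors)
        h, _ = self.lstm(x)
        return self.out(h[:, -1]).squeeze(-1)

model = ReturnLSTM()
pred = model(torch.randn(16, 60, 5))       # 16 windows of 60 periods
loss = nn.functional.mse_loss(pred, torch.randn(16))
</code></pre>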
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05187">arXiv:2502.05187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05187">pdf</a>, <a href="https://arxiv.org/format/2502.05187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Adaptable Budget Planner for Enhancing Budget-Constrained Auto-Bidding in Online Advertising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Z">Zhijian Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yusen Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tianyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhilin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yeshu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chuan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+X">Xiaotie Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05187v1-abstract-short" style="display: inline;"> In online advertising, advertisers commonly utilize auto-bidding services to bid for impression opportunities. A typical objective of the auto-bidder is to optimize the advertiser&#39;s cumulative value of winning impressions within specified budget constraints. However, such a problem is challenging due to the complex bidding environment faced by diverse advertisers. To address this challenge, we int&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05187v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05187v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05187v1-abstract-full" style="display: none;"> In online advertising, advertisers commonly utilize auto-bidding services to bid for impression opportunities. A typical objective of the auto-bidder is to optimize the advertiser&#39;s cumulative value of winning impressions within specified budget constraints. However, such a problem is challenging due to the complex bidding environment faced by diverse advertisers. To address this challenge, we introduce ABPlanner, a few-shot adaptable budget planner designed to improve budget-constrained auto-bidding. ABPlanner is based on a hierarchical bidding framework that decomposes the bidding process into shorter, manageable stages. Within this framework, ABPlanner allocates the budget across all stages, allowing a low-level auto-bidder to bids based on the budget allocation plan. The adaptability of ABPlanner is achieved through a sequential decision-making approach, inspired by in-context reinforcement learning. 
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05165">arXiv:2502.05165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05165">pdf</a>, <a href="https://arxiv.org/format/2502.05165">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Multitwine: Multi-Object Compositing with Text and Layout Control </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tarr%C3%A9s%2C+G+C">Gemma Canet Tarrés</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gilbert%2C+A">Andrew Gilbert</a>, <a href="/search/cs?searchtype=author&amp;query=Collomosse%2C+J">John Collomosse</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S+Y">Soo Ye Kim</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">We introduce the first generative model capable of simultaneous multi-object compositing, guided by both text and layout. Our model allows for the addition of multiple objects within a scene, capturing a range of interactions from simple positional relations (e.g., next to, in front of) to complex actions requiring reposing (e.g., hugging, playing guitar). When an interaction implies additional props, like &#39;taking a selfie&#39;, our model autonomously generates these supporting objects. By jointly training for compositing and subject-driven generation, also known as customization, we achieve a more balanced integration of textual and visual inputs for text-driven object compositing. As a result, we obtain a versatile model with state-of-the-art performance in both tasks. We further present a data generation pipeline leveraging visual and language models to effortlessly synthesize multimodal, aligned training data. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04778">arXiv:2502.04778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04778">pdf</a>, <a href="https://arxiv.org/format/2502.04778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Behavior-Regularized Diffusion Policy Optimization for Offline Reinforcement Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen-Xiao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chenyang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+M">Mingjun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+C">Chenjun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zongzhang Zhang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">The primary focus of offline reinforcement learning (RL) is to manage the risk of hazardous exploitation of out-of-distribution actions. An effective approach to achieve this goal is through behavior regularization, which augments conventional RL objectives by incorporating constraints that enforce the policy to remain close to the behavior policy. Nevertheless, existing literature on behavior-regularized RL primarily focuses on explicit policy parameterizations, such as Gaussian policies. Consequently, it remains unclear how to extend this framework to more advanced policy parameterizations, such as diffusion models. In this paper, we introduce BDPO, a principled behavior-regularized RL framework tailored for diffusion-based policies, thereby combining the expressive power of diffusion policies and the robustness provided by regularization. The key ingredient of our method is to calculate the Kullback-Leibler (KL) regularization analytically as the accumulated discrepancies in reverse-time transition kernels along the diffusion trajectory. By integrating the regularization, we develop an efficient two-time-scale actor-critic RL algorithm that produces the optimal policy while respecting the behavior constraint. Comprehensive evaluations conducted on synthetic 2D tasks and continuous control tasks from the D4RL benchmark validate its effectiveness and superior performance. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p>
</li>
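<p class="is-size-7">For Gaussian reverse-time kernels with a shared per-step variance, the accumulated KL term the BDPO abstract mentions decomposes into a sum of per-step Gaussian KLs; a schematic version under that assumption, with hypothetical argument names:</p>
<pre><code>import torch

def pathwise_kl(policy_means, behavior_means, sigmas):
    # Accumulate KL(policy_t || behavior_t) over the reverse diffusion steps;
    # for equal-variance Gaussians each step contributes
    # norm(mu_p - mu_b)^2 / (2 sigma^2).
    kl = 0.0
    for mu_p, mu_b, sigma in zip(policy_means, behavior_means, sigmas):
        kl = kl + ((mu_p - mu_b) ** 2).sum(dim=-1) / (2.0 * sigma ** 2)
    return kl  # one KL value per batch element
</code></pre>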
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04670">arXiv:2502.04670</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04670">pdf</a>, <a href="https://arxiv.org/format/2502.04670">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CCS: Controllable and Constrained Sampling with Diffusion Models via Initial Noise Perturbation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+B">Bowen Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zecheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Z">Zhaoxu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jason Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Wei Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+J">Jing Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zhengxu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guanyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+L">Liyue Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04670v1-abstract-short" style="display: inline;"> Diffusion models have emerged as powerful tools for generative tasks, producing high-quality outputs across diverse domains. However, how the generated data responds to the initial noise perturbation in diffusion models remains under-explored, which hinders understanding the controllability of the sampling process. In this work, we first observe an interesting phenomenon: the relationship between&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04670v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04670v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04670v1-abstract-full" style="display: none;"> Diffusion models have emerged as powerful tools for generative tasks, producing high-quality outputs across diverse domains. However, how the generated data responds to the initial noise perturbation in diffusion models remains under-explored, which hinders understanding the controllability of the sampling process. In this work, we first observe an interesting phenomenon: the relationship between the change of generation outputs and the scale of initial noise perturbation is highly linear through the diffusion ODE sampling. Then we provide both theoretical and empirical study to justify this linearity property of this input-output (noise-generation data) relationship. Inspired by these new insights, we propose a novel Controllable and Constrained Sampling method (CCS) together with a new controller algorithm for diffusion models to sample with desired statistical properties while preserving good sample quality. 
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04543">arXiv:2502.04543</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04543">pdf</a>, <a href="https://arxiv.org/format/2502.04543">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> Sparsity-Based Interpolation of External, Internal and Swap Regret </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Z">Zhou Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y+J">Y. Jennifer Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiyu Zhang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">Focusing on the expert problem in online learning, this paper studies the interpolation of several performance metrics via $\phi$-regret minimization, which measures the performance of an algorithm by its regret with respect to an arbitrary action modification rule $\phi$. With $d$ experts and $T\gg d$ rounds in total, we present a single algorithm achieving the instance-adaptive $\phi$-regret bound \begin{equation*} \tilde O\left(\min\left\{\sqrt{d-d^{\mathrm{unif}}_\phi+1},\sqrt{d-d^{\mathrm{self}}_\phi}\right\}\cdot\sqrt{T}\right), \end{equation*} where $d^{\mathrm{unif}}_\phi$ is the maximum number of experts modified identically by $\phi$, and $d^{\mathrm{self}}_\phi$ is the number of experts that $\phi$ trivially modifies to themselves. By recovering the optimal $O(\sqrt{T\log d})$ external regret bound when $d^{\mathrm{unif}}_\phi=d$, the standard $\tilde O(\sqrt{T})$ internal regret bound when $d^{\mathrm{self}}_\phi=d-1$, and the optimal $\tilde O(\sqrt{dT})$ swap regret bound in the worst case, we improve existing results in the intermediate regimes. In addition, the same algorithm achieves the optimal quantile regret bound, which corresponds to even easier settings of $\phi$ than the external regret. Building on the classical reduction from $\phi$-regret minimization to external regret minimization on stochastic matrices, our main idea is to further convert the latter to online linear regression using Haar-wavelet-inspired matrix features. Then, we apply a particular $L_1$-version of comparator-adaptive online learning algorithms to exploit the sparsity in this regression subroutine. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Equal contribution, alphabetical order</span> </p>
</li>
By recovering the optimal $O(\sqrt{T\log d})$ external regret bound when $d^{\mathrm{unif}}_\phi=d$, the standard $\tilde O(\sqrt{T})$ internal regret bound when $d^{\mathrm{self}}_\phi=d-1$ and the optimal $\tilde O(\sqrt{dT})$ swap regret bound in the worst case, we improve existing results in the intermediate regimes. In addition, the same algorithm achieves the optimal quantile regret bound, which corresponds to even easier settings of $\phi$ than the external regret. Building on the classical reduction from $\phi$-regret minimization to external regret minimization on stochastic matrices, our main idea is to further convert the latter to online linear regression using Haar-wavelet-inspired matrix features. Then, we apply a particular $L_1$-version of comparator-adaptive online learning algorithms to exploit the sparsity in this regression subroutine. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04543v1-abstract-full').style.display = 'none'; document.getElementById('2502.04543v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Equal contribution, alphabetical order</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04375">arXiv:2502.04375</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04375">pdf</a>, <a href="https://arxiv.org/format/2502.04375">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Analysis for Reasoning Bias of Language Models with Small Initialization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+J">Junjie Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhongwang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z+J">Zhi-Qin John Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04375v1-abstract-short" style="display: inline;"> Transformer-based Large Language Models (LLMs) have revolutionized Natural Language Processing by demonstrating exceptional performance across diverse tasks. This study investigates the impact of the parameter initialization scale on the training behavior and task preferences of LLMs. 
We discover that smaller initialization scales encourage models to favor reasoning tasks, whereas larger initializ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04375v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04375v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04375v1-abstract-full" style="display: none;"> Transformer-based Large Language Models (LLMs) have revolutionized Natural Language Processing by demonstrating exceptional performance across diverse tasks. This study investigates the impact of the parameter initialization scale on the training behavior and task preferences of LLMs. We discover that smaller initialization scales encourage models to favor reasoning tasks, whereas larger initialization scales lead to a preference for memorization tasks. We validate this reasoning bias via real datasets and meticulously designed anchor functions. Further analysis of initial training dynamics suggests that specific model components, particularly the embedding space and self-attention mechanisms, play pivotal roles in shaping these learning biases. We provide a theoretical framework from the perspective of model training dynamics to explain these phenomena. Additionally, experiments on real-world language tasks corroborate our theoretical insights. This work enhances our understanding of how initialization strategies influence LLM performance on reasoning tasks and offers valuable guidelines for training models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04375v1-abstract-full').style.display = 'none'; document.getElementById('2502.04375v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
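The initialization-scale effect described in this abstract is easy to reproduce in spirit. Below is a minimal PyTorch sketch (not the authors' code; the scale values, the fan-in scaling rule, and the `init_with_scale` helper are illustrative assumptions) of sweeping the parameter initialization scale of a toy transformer block before training:

```python
import torch.nn as nn

def init_with_scale(model: nn.Module, scale: float) -> None:
    """Re-initialize every weight matrix as N(0, (scale / sqrt(fan_in))^2)."""
    for p in model.parameters():
        if p.dim() >= 2:                      # weight matrices only; skip biases/norms
            fan_in = p.shape[-1]
            nn.init.normal_(p, mean=0.0, std=scale * fan_in ** -0.5)

block = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
init_with_scale(block, scale=0.1)    # "small" init: the regime hypothesized to bias toward reasoning
# init_with_scale(block, scale=2.0)  # "large" init: hypothesized to bias toward memorization
```

Sweeping `scale` and training otherwise-identical models is the kind of controlled comparison the abstract's validation on anchor functions suggests.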
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04074">arXiv:2502.04074</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04074">pdf</a>, <a href="https://arxiv.org/format/2502.04074">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D Prior is All You Need: Cross-Task Few-shot 2D Gaze Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yihua Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hengfei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhongqun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+Y">Yang Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+B+E">Bo Eun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+F">Feng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+H+J">Hyung Jin Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04074v1-abstract-short" style="display: inline;"> 3D and 2D gaze estimation share the fundamental objective of capturing eye movements but are traditionally treated as two distinct research domains. In this paper, we introduce a novel cross-task few-shot 2D gaze estimation approach, aiming to adapt a pre-trained 3D gaze estimation network for 2D gaze prediction on unseen devices using only a few training images. This task is highly challenging du&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04074v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04074v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04074v1-abstract-full" style="display: none;"> 3D and 2D gaze estimation share the fundamental objective of capturing eye movements but are traditionally treated as two distinct research domains. In this paper, we introduce a novel cross-task few-shot 2D gaze estimation approach, aiming to adapt a pre-trained 3D gaze estimation network for 2D gaze prediction on unseen devices using only a few training images. This task is highly challenging due to the domain gap between 3D and 2D gaze, unknown screen poses, and limited training data. To address these challenges, we propose a novel framework that bridges the gap between 3D and 2D gaze. Our framework contains a physics-based differentiable projection module with learnable parameters to model screen poses and project 3D gaze into 2D gaze. The framework is fully differentiable and can integrate into existing 3D gaze networks without modifying their original architecture. Additionally, we introduce a dynamic pseudo-labelling strategy for flipped images, which is particularly challenging for 2D labels due to unknown screen poses. To overcome this, we reverse the projection process by converting 2D labels to 3D space, where flipping is performed. 
Notably, this 3D space is not aligned with the camera coordinate system, so we learn a dynamic transformation matrix to compensate for this misalignment. We evaluate our method on MPIIGaze, EVE, and GazeCapture datasets, collected respectively on laptops, desktop computers, and mobile devices. The superior performance highlights the effectiveness of our approach, and demonstrates its strong potential for real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04074v1-abstract-full').style.display = 'none'; document.getElementById('2502.04074v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03954">arXiv:2502.03954</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03954">pdf</a>, <a href="https://arxiv.org/format/2502.03954">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MAQInstruct: Instruction-based Unified Event Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03954v1-abstract-short" style="display: inline;"> Extracting event relations that deviate from known schemas has proven challenging for previous methods based on multi-class classification, MASK prediction, or prototype matching. Recent advancements in large language models have shown impressive performance through instruction tuning. Nevertheless, in the task of event relation extraction, instruction-based methods face several challenges: there&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03954v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03954v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03954v1-abstract-full" style="display: none;"> Extracting event relations that deviate from known schemas has proven challenging for previous methods based on multi-class classification, MASK prediction, or prototype matching. Recent advancements in large language models have shown impressive performance through instruction tuning. Nevertheless, in the task of event relation extraction, instruction-based methods face several challenges: there are a vast number of inference samples, and the relations between events are non-sequential. To tackle these challenges, we present an improved instruction-based event relation extraction framework named MAQInstruct. 
Firstly, we transform the task from extracting event relations using given event-event instructions to selecting events using given event-relation instructions, which reduces the number of samples required for inference. Then, by incorporating a bipartite matching loss, we reduce the dependency of the instruction-based method on the generation sequence. Our experimental results demonstrate that MAQInstruct significantly improves the performance of event relation extraction across multiple LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03954v1-abstract-full').style.display = 'none'; document.getElementById('2502.03954v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW 2025 short</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03850">arXiv:2502.03850</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03850">pdf</a>, <a href="https://arxiv.org/format/2502.03850">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Electromagnetic Channel Modeling and Capacity Analysis for HMIMO Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+L">Li Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S+S+A">Shuai S. A. Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianhua Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bader%2C+F">Faouzi Bader</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhaoyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Muhaidat%2C+S">Sami Muhaidat</a>, <a href="/search/cs?searchtype=author&amp;query=Debbah%2C+M">Merouane Debbah</a>, <a href="/search/cs?searchtype=author&amp;query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03850v1-abstract-short" style="display: inline;"> Advancements in emerging technologies, e.g., reconfigurable intelligent surfaces and holographic MIMO (HMIMO), facilitate unprecedented manipulation of electromagnetic (EM) waves, significantly enhancing the performance of wireless communication systems. To accurately characterize the achievable performance limits of these systems, it is crucial to develop a universal EM-compliant channel model. 
T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03850v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03850v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03850v1-abstract-full" style="display: none;"> Advancements in emerging technologies, e.g., reconfigurable intelligent surfaces and holographic MIMO (HMIMO), facilitate unprecedented manipulation of electromagnetic (EM) waves, significantly enhancing the performance of wireless communication systems. To accurately characterize the achievable performance limits of these systems, it is crucial to develop a universal EM-compliant channel model. This paper addresses this necessity by proposing a comprehensive EM channel model tailored for realistic multi-path environments, accounting for the combined effects of antenna array configurations and propagation conditions in HMIMO communications. Both polarization phenomena and spatial correlation are incorporated into this probabilistic channel model. Additionally, physical constraints of antenna configurations, such as mutual coupling effects and energy consumption, are integrated into the channel modeling framework. Simulation results validate the effectiveness of the proposed probabilistic channel model, indicating that traditional Rician and Rayleigh fading models cannot accurately depict the channel characteristics and underestimate the channel capacity. More importantly, the proposed channel model outperforms free-space Green&#39;s functions in accurately depicting both near-field gain and multi-path effects in radiative near-field regions. These gains are much more evident in tri-polarized systems, highlighting the necessity of polarization interference elimination techniques. Moreover, the theoretical analysis accurately verifies that capacity decreases with expanding communication regions of two-user communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03850v1-abstract-full').style.display = 'none'; document.getElementById('2502.03850v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
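For context on the capacity claim above, the Rayleigh baseline that the proposed EM-compliant model is compared against can be sketched in a few lines. This is a generic Monte Carlo estimate of i.i.d. Rayleigh ergodic capacity (the function name and parameters are illustrative assumptions; it omits the paper's polarization, spatial-correlation, and mutual-coupling effects):

```python
import numpy as np

def rayleigh_ergodic_capacity(n: int, snr: float, trials: int = 2000) -> float:
    """Estimate E[log2 det(I + (snr/n) H H^H)] for H with i.i.d. CN(0,1) entries."""
    rng = np.random.default_rng(0)
    total = 0.0
    for _ in range(trials):
        h = (rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))) / np.sqrt(2)
        gram = np.eye(n) + (snr / n) * (h @ h.conj().T)
        total += np.linalg.slogdet(gram)[1] / np.log(2)   # log-det converted to bits
    return total / trials

print(rayleigh_ergodic_capacity(n=4, snr=10.0))  # 4x4 link at linear SNR 10 (10 dB), bits/s/Hz
```

The abstract's point is that such structure-free fading models miss near-field gain, polarization, and coupling, and therefore understate what the physical channel supports.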
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03845">arXiv:2502.03845</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03845">pdf</a>, <a href="https://arxiv.org/format/2502.03845">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> PAGNet: Pluggable Adaptive Generative Networks for Information Completion in Multi-Agent Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuohui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+B">Bin Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhipeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yanmin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Gang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+P">Ping Lu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Bin He</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03845v1-abstract-short" style="display: inline;"> For partially observable cooperative tasks, multi-agent systems must develop effective communication and understand the interplay among agents in order to achieve cooperative goals. However, existing multi-agent reinforcement learning (MARL) with communication methods lack evaluation metrics for information weights and information-level communication modeling. This causes agents to neglect the agg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03845v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03845v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03845v1-abstract-full" style="display: none;"> For partially observable cooperative tasks, multi-agent systems must develop effective communication and understand the interplay among agents in order to achieve cooperative goals. However, existing multi-agent reinforcement learning (MARL) with communication methods lack evaluation metrics for information weights and information-level communication modeling. This causes agents to neglect the aggregation of multiple messages, thereby significantly reducing policy learning efficiency. In this paper, we propose pluggable adaptive generative networks (PAGNet), a novel framework that integrates generative models into MARL to enhance communication and decision-making. PAGNet enables agents to synthesize global state representations from weighted local observations and use these representations alongside learned communication weights for coordinated decision-making. This pluggable approach reduces the computational demands typically associated with the joint training of communication and policy networks. Extensive experimental evaluations across diverse benchmarks and communication scenarios demonstrate the significant performance improvements achieved by PAGNet. 
Furthermore, we analyze the emergent communication patterns and the quality of generated global states, providing insights into operational mechanisms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03845v1-abstract-full').style.display = 'none'; document.getElementById('2502.03845v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03843">arXiv:2502.03843</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03843">pdf</a>, <a href="https://arxiv.org/format/2502.03843">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Natural Language Understanding for LLMs via Large-Scale Instruction Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Lin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+H">Honghao Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+L">Lei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03843v1-abstract-short" style="display: inline;"> High-quality, large-scale instructions are crucial for aligning large language models (LLMs); however, there is a severe shortage of instructions in the field of natural language understanding (NLU). Previous works on constructing NLU instructions mainly focus on information extraction (IE), neglecting tasks such as machine reading comprehension, question answering, and text classification. Further&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03843v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03843v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03843v1-abstract-full" style="display: none;"> High-quality, large-scale instructions are crucial for aligning large language models (LLMs); however, there is a severe shortage of instructions in the field of natural language understanding (NLU). Previous works on constructing NLU instructions mainly focus on information extraction (IE), neglecting tasks such as machine reading comprehension, question answering, and text classification. 
Furthermore, the lack of diversity in the data has led to a decreased generalization ability of trained LLMs in other NLU tasks and a noticeable decline in the fundamental model&#39;s general capabilities. To address this issue, we propose Hum, a large-scale, high-quality synthetic instruction corpus for NLU tasks, designed to enhance the NLU capabilities of LLMs. Specifically, Hum includes IE (either closed IE or open IE), machine reading comprehension, text classification, and instruction generalist tasks, thereby enriching task diversity. Additionally, we introduce a human-LLMs collaborative mechanism to synthesize instructions, which enriches instruction diversity by incorporating guidelines, preference rules, and format variants. We conduct extensive experiments on 5 NLU tasks and 28 general capability evaluation datasets for LLMs. Experimental results show that Hum enhances the NLU capabilities of six LLMs by an average of 3.1\%, with no significant decline observed in other general capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03843v1-abstract-full').style.display = 'none'; document.getElementById('2502.03843v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03709">arXiv:2502.03709</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03709">pdf</a>, <a href="https://arxiv.org/format/2502.03709">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> How to Make Your Multi-Image Posts Popular? An Approach to Enhanced Grid for Nine Images on Social Media </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Q">Qi Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shulin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhiqi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zibo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shunye Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianchao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liangxu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+Y">Yiru Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Binhui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03709v1-abstract-short" style="display: inline;"> The nine-grid layout is commonly used for multi-image posts, arranging nine images in a tic-tac-toe board. This layout effectively presents content within limited space. 
Moreover, due to the numerous possible arrangements within the nine-image grid, the optimal arrangement that yields the highest level of attractiveness remains unknown. Our study investigates how the arrangement of images within a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03709v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03709v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03709v1-abstract-full" style="display: none;"> The nine-grid layout is commonly used for multi-image posts, arranging nine images in a tic-tac-toe board. This layout effectively presents content within limited space. Moreover, due to the numerous possible arrangements within the nine-image grid, the optimal arrangement that yields the highest level of attractiveness remains unknown. Our study investigates how the arrangement of images within a nine-grid layout affects the overall popularity of the image set, aiming to explore alignment schemes more aligned with user preferences. Based on survey results regarding user preferences in image arrangement, we have identified two ordering sequences that are widely recognized: sequential order and center prioritization, considering both image visual content and aesthetic quality as alignment metrics, resulting in four layout schemes. Finally, we recruited participants to annotate various layout schemes of the same set of images. Our experience-centered evaluation indicates that layout schemes based on aesthetic quality outperformed others. This research yields empirical evidence supporting the optimization of the nine-grid layout for multi-image posts, thereby furnishing content creators with valuable insights to enhance both attractiveness and user experience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03709v1-abstract-full').style.display = 'none'; document.getElementById('2502.03709v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
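As a concrete illustration of the "center prioritization" scheme with aesthetic quality as the alignment metric, here is a hypothetical sketch (the scores, names, and exact fill order are assumptions for demonstration; the paper's scheme may differ in detail):

```python
# Nine images with made-up aesthetic-quality scores.
scores = {f"img{i}": s for i, s in
          enumerate([0.4, 0.9, 0.3, 0.7, 0.5, 0.8, 0.2, 0.6, 0.1])}

CENTER = 4  # middle cell of a 3x3 grid flattened in row-major order
ranked = sorted(scores, key=scores.get, reverse=True)

grid = [None] * 9
grid[CENTER] = ranked[0]          # highest-scored image takes the center cell
rest = iter(ranked[1:])
for cell in range(9):
    if grid[cell] is None:
        grid[cell] = next(rest)   # remaining cells filled in reading order, best first

for row in range(3):
    print(grid[3 * row: 3 * row + 3])
```

Swapping the score dictionary for visual-content similarity instead of aesthetic quality, or filling strictly left-to-right without the center step, yields the other layout schemes the study compares.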
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to be published in Proceedings of the 2024 IEEE International Conference on Ubiquitous Intelligence and Computing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03589">arXiv:2502.03589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03589">pdf</a>, <a href="https://arxiv.org/format/2502.03589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Haiying Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Vargaftik%2C+S">Shay Vargaftik</a>, <a href="/search/cs?searchtype=author&amp;query=Basat%2C+R+B">Ran Ben Basat</a>, <a href="/search/cs?searchtype=author&amp;query=Mitzenmacher%2C+M">Michael Mitzenmacher</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+M">Minlan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03589v1-abstract-short" style="display: inline;"> Disaggregated Large Language Model (LLM) inference has gained popularity as it separates the computation-intensive prefill stage from the memory-intensive decode stage, avoiding the prefill-decode interference and improving resource utilization. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computation time over&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03589v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03589v1-abstract-full" style="display: none;"> Disaggregated Large Language Model (LLM) inference has gained popularity as it separates the computation-intensive prefill stage from the memory-intensive decode stage, avoiding the prefill-decode interference and improving resource utilization. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computation time overhead for prefill and decode is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. Existing KV quantization methods can alleviate the transmission bottleneck and reduce memory requirements, but they introduce significant dequantization overhead, exacerbating the computation time. We propose Homomorphic Acceleration via Compression of the KV cache (HACK) for disaggregated LLM inference. 
HACK eliminates the heavy KV dequantization step and directly performs computations on quantized KV data to approximate and reduce the cost of the expensive matrix-multiplication step. Extensive trace-driven experiments show that HACK reduces JCT by up to 70.9% compared to the disaggregated LLM inference baseline and by up to 52.3% compared to state-of-the-art KV quantization methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03589v1-abstract-full').style.display = 'none'; document.getElementById('2502.03589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03506">arXiv:2502.03506</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03506">pdf</a>, <a href="https://arxiv.org/format/2502.03506">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Optimistic $\epsilon$-Greedy Exploration for Cooperative Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruoning Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Siying Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wenyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhitong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zixuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruijie Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03506v1-abstract-short" style="display: inline;"> The Centralized Training with Decentralized Execution (CTDE) paradigm is widely used in cooperative multi-agent reinforcement learning. However, due to the representational limitations of traditional monotonic value decomposition methods, algorithms can underestimate optimal actions, leading policies to suboptimal solutions. To address this challenge, we propose Optimistic $\epsilon$-Greedy Exploration,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03506v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03506v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03506v1-abstract-full" style="display: none;"> The Centralized Training with Decentralized Execution (CTDE) paradigm is widely used in cooperative multi-agent reinforcement learning. However, due to the representational limitations of traditional monotonic value decomposition methods, algorithms can underestimate optimal actions, leading policies to suboptimal solutions. 
To address this challenge, we propose Optimistic $\epsilon$-Greedy Exploration, focusing on enhancing exploration to correct value estimations. The underestimation arises from insufficient sampling of optimal actions during exploration, as our analysis indicates. We introduce an optimistic updating network to identify optimal actions and sample actions from its distribution with a probability of $\epsilon$ during exploration, increasing the selection frequency of optimal actions. Experimental results in various environments reveal that the Optimistic $\epsilon$-Greedy Exploration effectively prevents the algorithm from converging to suboptimal solutions and significantly improves its performance compared to other algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03506v1-abstract-full').style.display = 'none'; document.getElementById('2502.03506v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03236">arXiv:2502.03236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03236">pdf</a>, <a href="https://arxiv.org/format/2502.03236">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pioneer: Physics-informed Riemannian Graph ODE for Entropy-increasing Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Li Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Q">Qiqi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P+S">Philip S. Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03236v1-abstract-short" style="display: inline;"> Dynamic interacting system modeling is important for understanding and simulating real-world systems. The system is typically described as a graph, where multiple objects dynamically interact with each other and evolve over time. In recent years, graph Ordinary Differential Equations (ODE) have received increasing research attention. While achieving encouraging results, existing solutions prioritize th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03236v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03236v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03236v1-abstract-full" style="display: none;"> Dynamic interacting system modeling is important for understanding and simulating real-world systems. 
The system is typically described as a graph, where multiple objects dynamically interact with each other and evolve over time. In recent years, graph Ordinary Differential Equations (ODE) have received increasing research attention. While achieving encouraging results, existing solutions prioritize the traditional Euclidean space, and neglect the intrinsic geometry of the system and physics laws, e.g., the principle of increasing entropy. The limitations above motivate us to rethink the system dynamics from a fresh perspective of Riemannian geometry, and pose a more realistic problem of physics-informed dynamic system modeling, considering the underlying geometry and physics laws for the first time. In this paper, we present a novel physics-informed Riemannian graph ODE for a wide range of entropy-increasing dynamic systems (termed Pioneer). In particular, we formulate a differential system on the Riemannian manifold, where a manifold-valued graph ODE is governed by the proposed constrained Ricci flow, and a manifold preserving Gyro-transform aware of system geometry. Theoretically, we prove that the entropy of our formulation is non-decreasing, obeying the physics laws. Empirical results show the superiority of Pioneer on real datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03236v1-abstract-full').style.display = 'none'; document.getElementById('2502.03236v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03125">arXiv:2502.03125</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03125">pdf</a>, <a href="https://arxiv.org/format/2502.03125">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Double Distillation Network for Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Siying Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wenyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruoning Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhitong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zixuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03125v1-abstract-short" style="display: inline;"> Multi-agent reinforcement learning typically employs a centralized training-decentralized execution (CTDE) framework to alleviate the non-stationarity in the environment. 
However, the partial observability during execution may lead to cumulative gap errors gathered by agents, impairing the training of effective collaborative policies. To overcome this challenge, we introduce the Double Distillation Ne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03125v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03125v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03125v1-abstract-full" style="display: none;"> Multi-agent reinforcement learning typically employs a centralized training-decentralized execution (CTDE) framework to alleviate the non-stationarity in the environment. However, the partial observability during execution may lead to cumulative gap errors gathered by agents, impairing the training of effective collaborative policies. To overcome this challenge, we introduce the Double Distillation Network (DDN), which incorporates two distillation modules aimed at enhancing robust coordination and facilitating the collaboration process under constrained information. The external distillation module uses a global guiding network and a local policy network, employing distillation to reconcile the gap between global training and local execution. In addition, the internal distillation module introduces intrinsic rewards, drawn from state information, to enhance the exploration capabilities of agents. Extensive experiments demonstrate that DDN significantly improves performance across multiple scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03125v1-abstract-full').style.display = 'none'; document.getElementById('2502.03125v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
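The external distillation module described above can be pictured with a small PyTorch sketch: a global guiding network conditioned on the full state produces soft targets for a local policy network conditioned only on a partial observation. The network shapes and the plain KL objective here are assumptions for illustration, not the authors' implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical teacher (global guiding) and student (local policy) networks.
global_net = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 5))
local_net  = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 5))

state = torch.randn(8, 32)   # full state, available only during centralized training
obs   = torch.randn(8, 16)   # partial observation, all the agent sees at execution

with torch.no_grad():        # teacher targets: no gradient into the guiding network
    teacher = F.softmax(global_net(state), dim=-1)
student_logp = F.log_softmax(local_net(obs), dim=-1)

# KL(teacher || student), averaged over the batch; gradients reach local_net only.
distill_loss = F.kl_div(student_logp, teacher, reduction="batchmean")
distill_loss.backward()
```

In this reading, minimizing the KL term is what "reconciles the gap between global training and local execution"; the abstract's internal distillation module (intrinsic rewards from state information) would be a separate loss term not sketched here.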
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a 
class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
