Search | arXiv e-print repository
Showing 1–50 of 7,047 results for author: Zhang, J
Searching in archive cs. Search in all archives: /search/?searchtype=author&query=Zhang%2C+J
Sorted by announcement date (newest first); 50 results per page.

1. arXiv:2502.10385 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: Simplifying DINO via Coding Rate Regularization
   Authors: Ziyang Wu, Jingyuan Zhang, Druv Pai, XuDong Wang, Chandan Singh, Jianwei Yang, Jianfeng Gao, Yi Ma
   Abstract: DINO and DINOv2 are two model families widely used to learn representations from unlabeled imagery data at large scale. Their learned representations often enable state-of-the-art performance on downstream tasks, such as image classification and segmentation. However, they rely on many empirically motivated design choices, and their training pipelines are highly complex and unstable -- many hyperparameters need to be carefully tuned to keep the representations from collapsing -- which makes the models difficult to improve or to adapt to new domains. In this work, we posit that most of these empirically motivated idiosyncrasies can be removed from the pre-training pipelines, and that it suffices to add an explicit coding rate term to the loss function to avoid collapse of the representations. As a result, we obtain highly simplified variants of DINO and DINOv2, which we call SimDINO and SimDINOv2, respectively. Remarkably, these simplified models are more robust to design choices such as network architecture and hyperparameters, and they learn even higher-quality representations, as measured by performance on downstream tasks, offering a Pareto improvement over the corresponding DINO and DINOv2 models. This work highlights the potential of simplifying design principles for improving the empirical practice of deep learning.
   Submitted 14 February, 2025; originally announced February 2025.
   Comments: 17 pages, 5 figures
2. arXiv:2502.10248 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
   Title: Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model
   Authors: Guoqing Ma, Haoyang Huang, Kun Yan, Liangyu Chen, Nan Duan, Shengming Yin, Changyi Wan, Ranchen Ming, Xiaoniu Song, Xing Chen, Yu Zhou, Deshan Sun, Deyu Zhou, Jian Zhou, Kaijun Tan, Kang An, Mei Chen, Wei Ji, Qiling Wu, Wen Sun, Xin Han, Yanan Wei, Zheng Ge, Aojie Li, Bin Wang, et al. (90 additional authors not shown)
   Abstract: We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep-compression variational autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality compared with both open-source and commercial engines. Additionally, we discuss the limitations of the current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V; the online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators.
   Submitted 14 February, 2025; originally announced February 2025.
   Comments: 35 pages, 14 figures
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2407.19198</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09771">arXiv:2502.09771</a> <span> [<a href="https://arxiv.org/pdf/2502.09771">pdf</a>, <a href="https://arxiv.org/format/2502.09771">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Knowledge-Enhanced Program Repair for Data Science Code </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ouyang%2C+S">Shuyin Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J+M">Jie M. Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zeyu Sun</a>, <a href="/search/cs?searchtype=author&query=Penuela%2C+A+M">Albert Merono Penuela</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09771v1-abstract-short" style="display: inline;"> This paper introduces DSrepair, a knowledge-enhanced program repair method designed to repair the buggy code generated by LLMs in the data science domain. DSrepair uses knowledge graph based RAG for API knowledge retrieval as well as bug knowledge enrichment to construct repair prompts for LLMs. Specifically, to enable knowledge graph based API retrieval, we construct DS-KG (Data Science Knowledge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09771v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09771v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09771v1-abstract-full" style="display: none;"> This paper introduces DSrepair, a knowledge-enhanced program repair method designed to repair the buggy code generated by LLMs in the data science domain. DSrepair uses knowledge graph based RAG for API knowledge retrieval as well as bug knowledge enrichment to construct repair prompts for LLMs. Specifically, to enable knowledge graph based API retrieval, we construct DS-KG (Data Science Knowledge Graph) for widely used data science libraries. For bug knowledge enrichment, we employ an abstract syntax tree (AST) to localize errors at the AST node level. DSrepair's effectiveness is evaluated against five state-of-the-art LLM-based repair baselines using four advanced LLMs on the DS-1000 dataset. The results show that DSrepair surpasses all five baselines. Specifically, when compared to the second-best baseline, DSrepair demonstrates significant improvements, fixing 44.4%, 14.2%, 20.6%, and 32.1% more buggy code snippets for each of the four evaluated LLMs, respectively. Additionally, it achieves greater efficiency, reducing the number of tokens required per code task by 17.49%, 34.24%, 24.71%, and 17.59%, respectively. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09771v1-abstract-full').style.display = 'none'; document.getElementById('2502.09771v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09670">arXiv:2502.09670</a> <span> [<a href="https://arxiv.org/pdf/2502.09670">pdf</a>, <a href="https://arxiv.org/format/2502.09670">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Science of Evaluating Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiayi Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiamu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+A">Andrew Wen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09670v1-abstract-short" style="display: inline;"> The emergent phenomena of large foundation models have revolutionized natural language processing. However, evaluating these models presents significant challenges due to their size, capabilities, and deployment across diverse applications. Existing literature often focuses on individual aspects, such as benchmark performance or specific tasks, but fails to provide a cohesive process that integrat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09670v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09670v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09670v1-abstract-full" style="display: none;"> The emergent phenomena of large foundation models have revolutionized natural language processing. However, evaluating these models presents significant challenges due to their size, capabilities, and deployment across diverse applications. Existing literature often focuses on individual aspects, such as benchmark performance or specific tasks, but fails to provide a cohesive process that integrates the nuances of diverse use cases with broader ethical and operational considerations. This work focuses on three key aspects: (1) Formalizing the Evaluation Process by providing a structured framework tailored to specific use-case contexts, (2) Offering Actionable Tools and Frameworks such as checklists and templates to ensure thorough, reproducible, and practical evaluations, and (3) Surveying Recent Work with a targeted review of advancements in LLM evaluation, emphasizing real-world applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09670v1-abstract-full').style.display = 'none'; document.getElementById('2502.09670v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09656">arXiv:2502.09656</a> <span> [<a href="https://arxiv.org/pdf/2502.09656">pdf</a>, <a href="https://arxiv.org/format/2502.09656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.compbiomed.2023.107684">10.1016/j.compbiomed.2023.107684 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-Omics Fusion with Soft Labeling for Enhanced Prediction of Distant Metastasis in Nasopharyngeal Carcinoma Patients after Radiotherapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sheng%2C+J">Jiabao Sheng</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+S">SaiKit Lam</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanpeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jing Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09656v1-abstract-short" style="display: inline;"> Omics fusion has emerged as a crucial preprocessing approach in the field of medical image processing, providing significant assistance to several studies. One of the challenges encountered in the integration of omics data is the presence of unpredictability arising from disparities in data sources and medical imaging equipment. In order to overcome this challenge and facilitate the integration of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09656v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09656v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09656v1-abstract-full" style="display: none;"> Omics fusion has emerged as a crucial preprocessing approach in the field of medical image processing, providing significant assistance to several studies. One of the challenges encountered in the integration of omics data is the presence of unpredictability arising from disparities in data sources and medical imaging equipment. 
6. arXiv:2502.09656 [pdf, other]
   Subjects: q-bio.QM (Quantitative Methods); cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
   doi: 10.1016/j.compbiomed.2023.107684
   Title: Multi-Omics Fusion with Soft Labeling for Enhanced Prediction of Distant Metastasis in Nasopharyngeal Carcinoma Patients after Radiotherapy
   Authors: Jiabao Sheng, SaiKit Lam, Jiang Zhang, Yuanpeng Zhang, Jing Cai
   Abstract: Omics fusion has emerged as a crucial preprocessing approach in medical image processing, providing significant assistance to several studies. One of the challenges in integrating omics data is the unpredictability arising from disparities in data sources and medical imaging equipment. To overcome this challenge and facilitate the joint application of omics data to specific medical objectives, this study aims to develop a fusion methodology that mitigates these disparities. The multi-kernel late-fusion method is a popular and effective strategy for this problem: an efficient representation of the data can be achieved by mapping the inherent features with suitable single-kernel functions and then merging them in a high-dimensional space, which addresses the disparities noted above. However, the inflexibility of label fitting constrains the use of multi-kernel late-fusion methods on complex nasopharyngeal carcinoma (NPC) datasets, limiting the efficacy of general classifiers on high-dimensional features. The proposed soft-labeling methodology instead aims to increase the disparity between the two cohorts, providing a more flexible structure for label allocation. Examination of the NPC-ContraParotid dataset demonstrates the model's robustness and efficacy, indicating its potential as a valuable tool for predicting distant metastasis in patients with nasopharyngeal carcinoma (NPC).
   Submitted 12 February, 2025; originally announced February 2025.
   Journal ref: Computers in Biology and Medicine, 168, 107684 (2024)

7. arXiv:2502.09649 [pdf, other]
   Subjects: cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.RO (Robotics)
   Title: Imit Diff: Semantics Guided Diffusion Transformer with Dual Resolution Fusion for Imitation Learning
   Authors: Yuhang Dong, Haizhou Ge, Yupei Zeng, Jiangning Zhang, Beiwen Tian, Guanzhong Tian, Hongrui Zhu, Yufei Jia, Ruixiang Wang, Ran Yi, Guyue Zhou, Longhua Ma
   Abstract: Visuomotor imitation learning enables embodied agents to effectively acquire manipulation skills from video demonstrations and robot proprioception. However, as scene complexity and visual distractions increase, existing methods that perform well in simple scenes tend to degrade in performance. To address this challenge, we introduce Imit Diff, a semantics-guided diffusion transformer with dual-resolution fusion for imitation learning. Our approach leverages prior knowledge from vision-language foundation models to translate high-level semantic instructions into pixel-level visual localization. This information is explicitly integrated into a multi-scale visual enhancement framework constructed with a dual-resolution encoder. Additionally, we introduce an implementation of Consistency Policy within the diffusion transformer architecture to improve both real-time performance and motion smoothness in embodied agent control. We evaluate Imit Diff on several challenging real-world tasks. Due to its task-oriented visual localization and fine-grained scene perception, it significantly outperforms state-of-the-art methods, especially in complex scenes with visual distractions, including zero-shot experiments focused on visual distraction and category generalization. The code will be made publicly available.
   Submitted 11 February, 2025; originally announced February 2025.

8. arXiv:2502.09560 [pdf, other]
   Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
   Title: EmbodiedBench: Comprehensive Benchmarking Multi-modal Large Language Models for Vision-Driven Embodied Agents
   Authors: Rui Yang, Hanyang Chen, Junyu Zhang, Mark Zhao, Cheng Qian, Kangrui Wang, Qineng Wang, Teja Venkat Koripella, Marziyeh Movahedi, Manling Li, Heng Ji, Huan Zhang, Tong Zhang
   Abstract: Leveraging Multi-modal Large Language Models (MLLMs) to create embodied agents offers a promising avenue for tackling real-world tasks. While language-centric embodied agents have garnered substantial attention, MLLM-based embodied agents remain underexplored due to the lack of comprehensive evaluation frameworks. To bridge this gap, we introduce EmbodiedBench, an extensive benchmark designed to evaluate vision-driven embodied agents. EmbodiedBench features: (1) a diverse set of 1,128 testing tasks across four environments, ranging from high-level semantic tasks (e.g., household) to low-level tasks involving atomic actions (e.g., navigation and manipulation); and (2) six meticulously curated subsets evaluating essential agent capabilities like commonsense reasoning, complex instruction understanding, spatial awareness, visual perception, and long-term planning. Through extensive experiments, we evaluated 13 leading proprietary and open-source MLLMs within EmbodiedBench. Our findings reveal that MLLMs excel at high-level tasks but struggle with low-level manipulation, with the best model, GPT-4o, scoring only 28.9% on average. EmbodiedBench provides a multifaceted standardized evaluation platform that not only highlights existing challenges but also offers valuable insights to advance MLLM-based embodied agents. Our code is available at https://embodiedbench.github.io.
   Submitted 13 February, 2025; originally announced February 2025.
   Comments: 51 pages

9. arXiv:2502.09412 [pdf, other]
   Subjects: cs.DS (Data Structures and Algorithms); math.CO (Combinatorics)
   Title: A LP-rounding based algorithm for soft capacitated facility location problem with submodular penalties
   Authors: Hanyin Xiao, Jiaming Zhang, Zhikang Zhang, Weidong Li
   Abstract: The soft capacitated facility location problem (SCFLP) is a classic combinatorial optimization problem, with variants widely applied in operations research and computer science. In the SCFLP, we are given a set $\mathcal{F}$ of facilities and a set $\mathcal{D}$ of clients; each facility has a capacity and an opening cost and may be opened multiple times, and each client has a demand. The problem is to find a subset of facilities in $\mathcal{F}$ and connect each client to the opened facilities, such that the total cost, including opening cost and connection cost, is minimized. SCFLP is NP-hard, which has led to a focus on approximation algorithms. We consider a variant, the soft capacitated facility location problem with submodular penalties (SCFLPSP), which allows some clients to go unserved at a penalty cost. We consider the integer-splittable case of demand, in which each client's demand may be served by multiple facilities, each providing an integer amount of service. Based on LP-rounding, we propose a $(\lambda R+4)$-approximation algorithm, where $R=\frac{\max_{i \in \mathcal{F}} f_i}{\min_{i \in \mathcal{F}} f_i}$ and $\lambda=\frac{R+\sqrt{R^2+8R}}{2R}$. In particular, when the opening cost is uniform, the approximation ratio is 6.
   Submitted 13 February, 2025; originally announced February 2025.
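   As a quick sanity check of the stated ratio in the uniform opening-cost case: $R = 1$ gives

   $$\lambda = \frac{1 + \sqrt{1 + 8}}{2} = \frac{1 + 3}{2} = 2, \qquad \lambda R + 4 = 6,$$

   matching the ratio of 6 quoted above.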
10. arXiv:2502.08996 [pdf, other]
    Subjects: cs.IT (Information Theory)
    Title: Masked Modulation: High-Throughput Half-Duplex ISAC Transmission Waveform Design
    Authors: Yifeng Xiong, Junsheng Mu, Shuangyang Li, Marco Lops, Jianhua Zhang
    Abstract: Integrated sensing and communication (ISAC) enables numerous innovative wireless applications. Communication-centric design is a practical choice for the construction of sixth-generation (6G) ISAC networks. Continuous-wave-based ISAC systems, with orthogonal frequency-division multiplexing (OFDM) as a representative example, suffer from the self-interference (SI) problem and hence are less suitable for long-range sensing. On the other hand, pulse-based half-duplex ISAC systems are free of SI but are less favourable for high-throughput communication scenarios. In this treatise, we propose MASked Modulation (MASM), a half-duplex ISAC waveform design scheme that minimises a range-blindness metric, referred to as "range glint", under a duty-cycle constraint (the duty cycle being proportional to communication throughput). In particular, MASM is capable of supporting high-throughput communication (~50% duty cycle) under mild range glint. Moreover, MASM can be flexibly adapted to frame-level waveform designs by operating on the slow-time scale. In terms of optimal transmit mask design, a set of masks is shown to be ideal in the sense of sidelobe level and range-glint intensity.
    Submitted 13 February, 2025; originally announced February 2025.
    Comments: Submitted to IEEE JSAC for possible publication

11. arXiv:2502.08991 [pdf, other]
    Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
    Title: Task Generalization With AutoRegressive Compositional Structure: Can Learning From $d$ Tasks Generalize to $d^{T}$ Tasks?
    Authors: Amirhesam Abedsoltan, Huaqing Zhang, Kaiyue Wen, Hongzhou Lin, Jingzhao Zhang, Mikhail Belkin
    Abstract: Large language models (LLMs) exhibit remarkable task generalization, solving tasks they were never explicitly trained on with only a few demonstrations. This raises a fundamental question: when can learning from a small set of tasks generalize to a large task family? In this paper, we investigate task generalization through the lens of AutoRegressive Compositional (ARC) structure, where each task is a composition of $T$ operations and each operation is drawn from a finite family of $d$ subtasks. This yields a task class of size $d^T$. We first show that generalization to all $d^T$ tasks is theoretically achievable by training on only $\tilde{O}(d)$ tasks. Empirically, we demonstrate that Transformers achieve such exponential task generalization on sparse parity functions via in-context learning (ICL) and Chain-of-Thought (CoT) reasoning. We further demonstrate this generalization in arithmetic and language translation, extending beyond parity functions.
    Submitted 13 February, 2025; originally announced February 2025.
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08691">arXiv:2502.08691</a> <span> [<a href="https://arxiv.org/pdf/2502.08691">pdf</a>, <a href="https://arxiv.org/format/2502.08691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AgentSociety: Large-Scale Simulation of LLM-Driven Generative Agents Advances Understanding of Human Behaviors and Society </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Piao%2C+J">Jinghua Piao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuwei Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Nian Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junbo Yan</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhihong Lu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhiheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J+Y">Jing Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Di Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengli Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fang Zhang</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+K">Ke Rong</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jun Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08691v1-abstract-full" style="display: inline;"> Understanding human behavior and society is a central focus in social sciences, with the rise of generative social science marking a significant paradigmatic shift. By leveraging bottom-up simulations, it replaces costly and logistically challenging traditional experiments with scalable, replicable, and systematic computational approaches for studying complex social dynamics. Recent advances in large language models (LLMs) have further transformed this research paradigm, enabling the creation of human-like generative social agents and realistic simulacra of society. In this paper, we propose AgentSociety, a large-scale social simulator that integrates LLM-driven agents, a realistic societal environment, and a powerful large-scale simulation engine. Based on the proposed simulator, we generate social lives for over 10k agents and simulate 5 million interactions, both among agents and between agents and their environment. Furthermore, we explore the potential of AgentSociety as a testbed for computational social experiments, focusing on four key social issues: polarization, the spread of inflammatory messages, the effects of universal basic income policies, and the impact of external shocks such as hurricanes. These four issues serve as valuable cases for assessing AgentSociety's support for typical research methods -- such as surveys, interviews, and interventions -- as well as for investigating the patterns, causes, and underlying mechanisms of social issues. The alignment between AgentSociety's outcomes and real-world experimental results not only demonstrates its ability to capture human behaviors and their underlying mechanisms, but also underscores its potential as an important platform for social scientists and policymakers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08625">arXiv:2502.08625</a> <span> [<a href="https://arxiv.org/pdf/2502.08625">pdf</a>, <a href="https://arxiv.org/format/2502.08625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Randomness of Low-Layer Parameters Determines Confusing Samples in Terms of Interaction Representations of a DNN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junpeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lei Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Liang Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Quanshi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2502.08625v1-abstract-full" style="display: inline;"> In this paper, we find that the complexity of interactions encoded by a deep neural network (DNN) can explain its generalization power. We also discover that the confusing samples of a DNN, which are represented by non-generalizable interactions, are determined by its low-layer parameters. In comparison, other factors, such as high-layer parameters and network architecture, have much less impact on the composition of confusing samples. Two DNNs with different low-layer parameters usually have fully different sets of confusing samples, even though they achieve similar performance. This finding extends the understanding of the lottery ticket hypothesis and explains the distinctive representation power of different DNNs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
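<p class="is-size-7">Editor's sketch (hypothetical protocol, not the authors' code): the claim suggests a simple check — train two copies of a network differing only in the random seed of their low layers, collect each model's misclassified "confusing" samples, and compare the overlap.</p>
<pre><code class="language-python">
import numpy as np

def confusing_set(predict, X, y):
    """Indices a model gets wrong -- a crude proxy for 'confusing samples'."""
    return set(np.flatnonzero(predict(X) != y))

def jaccard(a: set, b: set) -> float:
    return len(a & b) / max(1, len(a | b))

# predict_a / predict_b are assumed to be two trained models whose low-layer
# weights were initialized with different seeds (everything else shared):
# overlap = jaccard(confusing_set(predict_a, X, y),
#                   confusing_set(predict_b, X, y))
# The paper's finding predicts a low overlap despite similar accuracy.
</code></pre>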
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08513">arXiv:2502.08513</a> <span> [<a href="https://arxiv.org/pdf/2502.08513">pdf</a>, <a href="https://arxiv.org/format/2502.08513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714257">10.1145/3706598.3714257 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> "You'll Be Alice Adventuring in Wonderland!" Processes, Challenges, and Opportunities of Creating Animated Virtual Reality Stories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Lin-Ping Yuan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+F">Feilin Han</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liwenhan Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jian Zhao</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+H">Huamin Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08513v1-abstract-full" style="display: inline;"> Animated virtual reality (VR) stories, combining the presence of VR and the artistry of computer animation, offer a compelling way to deliver messages and evoke emotions. Motivated by the growing demand for immersive narrative experiences, more creators are producing animated VR stories. However, a holistic understanding of the creation processes and of the challenges involved in crafting these stories is still limited. Based on semi-structured interviews with 21 animated VR story creators, we identify ten common stages in their end-to-end creation processes, ranging from idea generation to evaluation, which form diverse workflows that are story-driven or visual-driven. Additionally, we highlight nine unique issues that arise during the creation process, such as a lack of reference material for multi-element plots, the absence of specific functionalities for story integration, and inadequate support for audience evaluation. We compare the creation of animated VR stories to general XR applications and distill several future research opportunities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Conditionally accepted to the ACM Conference on Human Factors in Computing Systems (CHI'25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08482">arXiv:2502.08482</a> <span> [<a href="https://arxiv.org/pdf/2502.08482">pdf</a>, <a href="https://arxiv.org/format/2502.08482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Auto-regressive Chain-of-Thought through Loop-Aligned Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qifan Yu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenyu He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sijie Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingjing Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Di He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08482v1-abstract-full" style="display: inline;"> Chain-of-Thought (CoT) prompting has emerged as a powerful technique for enhancing language models' reasoning capabilities. However, generating long and correct CoT trajectories is challenging. Recent studies have demonstrated that Looped Transformers possess remarkable length-generalization capabilities, but their limited generality and adaptability prevent them from serving as an alternative to auto-regressive solutions. To better leverage the strengths of Looped Transformers, we propose RELAY (REasoning through Loop Alignment iterativelY). Specifically, we align the steps of Chain-of-Thought (CoT) reasoning with loop iterations and apply intermediate supervision during the training of Looped Transformers. This additional iteration-wise supervision not only preserves the Looped Transformer's ability for length generalization but also enables it to predict CoT reasoning steps for unseen data. Therefore, we leverage this Looped Transformer to generate accurate reasoning chains for complex problems that exceed the training length, which are then used to fine-tune an auto-regressive model. We conduct extensive experiments, and the results demonstrate the effectiveness of our approach, with significant improvements in the performance of the auto-regressive model. Code will be released at https://github.com/qifanyu/RELAY. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">work in progress</span> </p> </li>
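<p class="is-size-7">Editor's sketch (schematic, with made-up module names): RELAY's central idea as stated in the abstract — align loop iterations of a weight-shared block with CoT steps and supervise each iteration — can be written as a short training step.</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def relay_style_loss(block, readout, h0, cot_targets):
    """Run one shared block for T iterations; supervise iteration t against
    CoT step t (intermediate, iteration-wise supervision). `block`, `readout`,
    and all shapes are illustrative assumptions, not the paper's API."""
    h, loss = h0, 0.0
    for target in cot_targets:            # one CoT step per loop iteration
        h = block(h)                      # same weights every iteration
        logits = readout(h)               # predict this step's tokens
        loss = loss + F.cross_entropy(
            logits.view(-1, logits.size(-1)), target.view(-1))
    return loss / len(cot_targets)
</code></pre>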
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08285">arXiv:2502.08285</a> <span> [<a href="https://arxiv.org/pdf/2502.08285">pdf</a>, <a href="https://arxiv.org/format/2502.08285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fully-Geometric Cross-Attention for Point Cloud Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weijie Wang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+G">Guofeng Mei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Sebe%2C+N">Nicu Sebe</a>, <a href="/search/cs?searchtype=author&query=Lepri%2C+B">Bruno Lepri</a>, <a href="/search/cs?searchtype=author&query=Poiesi%2C+F">Fabio Poiesi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08285v1-abstract-full" style="display: inline;"> Point cloud registration approaches often fail when the overlap between point clouds is low due to noisy point correspondences. This work introduces a novel cross-attention mechanism tailored for Transformer-based architectures that tackles this problem by fusing information from coordinates and features at the super-point level between point clouds. This formulation has remained unexplored primarily because it must guarantee rotation and translation invariance, since point clouds reside in different and independent reference frames. We integrate the Gromov-Wasserstein distance into the cross-attention formulation to jointly compute distances between points across different point clouds and account for their geometric structure. By doing so, points from two distinct point clouds can attend to each other under arbitrary rigid transformations. At the point level, we also devise a self-attention mechanism that aggregates the local geometric structure information into point features for fine matching. Our formulation boosts the number of inlier correspondences, thereby yielding more precise registration results compared to state-of-the-art approaches. We have conducted an extensive evaluation on the 3DMatch, 3DLoMatch, KITTI, and 3DCSR datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
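<p class="is-size-7">Editor's note (illustrative, heavily simplified): a Gromov-Wasserstein coupling is rotation- and translation-invariant because it compares only intra-cloud distance matrices, never raw coordinates. A minimal numeric check of that property, not the paper's method:</p>
<pre><code class="language-python">
import numpy as np

def intra_distances(P):                     # (n,3) points -> (n,n) distances
    return np.linalg.norm(P[:, None] - P[None, :], axis=-1)

def gw_cost(P, Q, T):
    """Simplified GW objective: sum_{ijkl} (d_P(i,k) - d_Q(j,l))^2 T_ij T_kl."""
    dP, dQ = intra_distances(P), intra_distances(Q)
    M = (dP[:, None, :, None] - dQ[None, :, None, :]) ** 2
    return np.einsum("ijkl,ij,kl->", M, T, T)

rng = np.random.default_rng(0)
P = rng.normal(size=(6, 3))
R, _ = np.linalg.qr(rng.normal(size=(3, 3)))    # random orthogonal transform
T = np.full((6, 6), 1 / 36)                     # uniform coupling
print(np.isclose(gw_cost(P, P @ R.T + 1.0, T),  # rigidly moved copy
                 gw_cost(P, P, T)))             # -> True: cost is invariant
</code></pre>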
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08277">arXiv:2502.08277</a> <span> [<a href="https://arxiv.org/pdf/2502.08277">pdf</a>, <a href="https://arxiv.org/format/2502.08277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> ChorusCVR: Chorus Supervision for Entire Space Post-Click Conversion Rate Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yucheng Lu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+B">Boyang Xia</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiangxia Cao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kuan Xu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+M">Mingxing Wen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wei Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhaojie Liu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liyin Hong</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+K">Kun Gai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+G">Guorui Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08277v2-abstract-full" style="display: inline;"> Post-click conversion rate (CVR) estimation is a vital task in many recommender systems of revenue businesses, e.g., e-commerce and advertising. From a sample perspective, a typical CVR-positive sample goes through a funnel from exposure to click to conversion. For lack of post-event labels for un-clicked samples, the CVR learning task commonly utilizes only clicked samples, rather than all exposed samples as in the click-through rate (CTR) learning task. However, during online inference, CVR and CTR are estimated over the same assumed exposure space, which leads to an inconsistency of sample space between training and inference, i.e., sample selection bias (SSB). To alleviate SSB, prior work proposes novel auxiliary tasks to enable CVR learning on un-clicked training samples, such as CTCVR and counterfactual CVR. Although these alleviate SSB to some extent, none of them pays attention to the discrimination between ambiguous negative samples (un-clicked) and factual negative samples (clicked but un-converted) during modelling, which leaves the CVR model lacking robustness. To fill this gap, we propose a novel ChorusCVR model to realize debiased CVR learning in the entire space. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li>
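<p class="is-size-7">Editor's note: the CTCVR auxiliary task the abstract cites (prior entire-space work, not ChorusCVR itself) sidesteps SSB by supervising pCTR and pCTCVR = pCTR * pCVR over all exposed samples, so the CVR head never trains on clicked samples alone. A schematic loss under that assumption:</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def entire_space_loss(p_ctr, p_cvr, click, convert):
    """ESMM-style objective over the full exposure space (illustrative).
    p_ctr, p_cvr: predicted probabilities in (0,1); click, convert: 0/1 floats."""
    p_ctcvr = p_ctr * p_cvr                    # P(click & convert | exposure)
    loss_ctr = F.binary_cross_entropy(p_ctr, click)
    loss_ctcvr = F.binary_cross_entropy(p_ctcvr, click * convert)
    return loss_ctr + loss_ctcvr               # CVR is supervised only implicitly
</code></pre>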
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08182">arXiv:2502.08182</a> <span> [<a href="https://arxiv.org/pdf/2502.08182">pdf</a>, <a href="https://arxiv.org/format/2502.08182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Memory Offloading for Large Language Model Inference with Latency SLO Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhisheng Ye</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zehua Yang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+T">Tianhao Fu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaxun Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yingwei Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenlin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Diyu Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08182v1-abstract-full" style="display: inline;"> Offloading large language model (LLM) state to host memory during inference promises to reduce operational costs by supporting larger models, longer inputs, and larger batch sizes. However, the design of existing memory offloading mechanisms does not take latency service-level objectives (SLOs) into consideration. As a result, they either lead to frequent SLO violations or underutilize host memory, thereby incurring economic loss and thus defeating the purpose of memory offloading. This paper presents Select-N, a latency-SLO-aware memory offloading system for LLM serving. A key challenge in designing Select-N is to reconcile the tension between meeting SLOs and maximizing host memory usage. Select-N overcomes it by exploiting a unique characteristic of modern LLMs: during serving, the computation time of each decoder layer is deterministic. Leveraging this, Select-N introduces the offloading interval, an internal tunable knob that captures the trade-off between SLOs and host memory usage, reducing the aforementioned challenge to picking an optimal offloading interval. Select-N then uses a two-stage approach to pick the offloading interval automatically: the first stage runs offline and generates the range of optimal offloading intervals, while the second stage adjusts the offloading interval at the granularity of inference iterations based on runtime hardware status. Our evaluation shows that Select-N consistently meets SLOs and improves serving throughput over existing mechanisms by 1.85X by maximizing the use of host memory. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
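<p class="is-size-7">Editor's sketch (toy cost model, all numbers hypothetical): because per-layer compute time is deterministic, the most aggressive offloading interval that still meets the latency SLO can be found by simple search — more offloaded layers save host memory but add transfer stalls. This is only an illustration of the stated trade-off, not Select-N's actual algorithm.</p>
<pre><code class="language-python">
def pick_offload_interval(n_layers, t_layer_ms, t_fetch_ms, slo_ms):
    """Offload one of every `k` layers; return the smallest k (i.e. the most
    offloading, hence the most memory saved) whose iteration latency fits the SLO."""
    for k in range(1, n_layers + 1):
        offloaded = n_layers // k
        stall = max(0.0, offloaded * (t_fetch_ms - t_layer_ms))  # un-overlapped fetch
        latency = n_layers * t_layer_ms + stall
        if latency <= slo_ms:
            return k, latency
    return None, None

# 32 layers at 1 ms each, 1.5 ms to fetch an offloaded layer, 40 ms SLO -> (2, 40.0)
print(pick_offload_interval(n_layers=32, t_layer_ms=1.0, t_fetch_ms=1.5, slo_ms=40.0))
</code></pre>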
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07870">arXiv:2502.07870</a> <span> [<a href="https://arxiv.org/pdf/2502.07870">pdf</a>, <a href="https://arxiv.org/format/2502.07870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TextAtlas5M: A Large-scale Dataset for Dense Text Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+A+J">Alex Jinpeng Wang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+D">Dongxing Mao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Weiming Han</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhuobai Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiqi Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+L">Libo Qin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fuwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Min Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07870v1-abstract-full" style="display: inline;"> Text-conditioned image generation has gained significant attention in recent years and is processing increasingly long and comprehensive text prompts. In everyday life, dense and intricate text appears in contexts like advertisements, infographics, and signage, where the integration of both text and visuals is essential for conveying complex information. However, despite these advances, the generation of images containing long-form text remains a persistent challenge, largely due to the limitations of existing datasets, which often focus on shorter and simpler text. To address this gap, we introduce TextAtlas5M, a novel dataset specifically designed to evaluate long-text rendering in text-conditioned image generation. Our dataset consists of 5 million long-text generated and collected images across diverse data types, enabling comprehensive evaluation of large-scale generative models on long-text image generation. We further curate TextAtlasEval, a 3,000-sample human-improved test set across 3 data domains, establishing one of the most extensive benchmarks for text-conditioned generation. Evaluations suggest that the TextAtlasEval benchmarks present significant challenges even for the most advanced proprietary models (e.g. GPT4o with DallE-3), while their open-source counterparts show an even larger performance gap. This evidence positions TextAtlas5M as a valuable dataset for training and evaluating future-generation text-conditioned image generation models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 15 figures. Dataset Website: https://textatlas5m.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07837">arXiv:2502.07837</a> <span> [<a href="https://arxiv.org/pdf/2502.07837">pdf</a>, <a href="https://arxiv.org/format/2502.07837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RoboBERT: An End-to-end Multimodal Robotic Manipulation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+J">Jianhua Shan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Haozhang Gao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Hailiang Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yipeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+K">Kang Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengkun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K">Kairos Wong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jie Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+B">Bin Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07837v1-abstract-full" style="display: inline;"> Embodied intelligence integrates multiple modalities, enabling agents to understand images, language, and actions simultaneously. However, existing models always depend on additional datasets or extensive pre-training to maximize performance improvements, consuming substantial training time and incurring expensive hardware costs. To tackle this issue, we present RoboBERT, a novel end-to-end robotic manipulation model integrated with a unique training strategy. The model utilizes a CNN-based diffusion policy, whose effectiveness is enhanced and stabilized by separating the training processes for different modalities. It also underscores the importance of data augmentation, verifying various techniques that significantly boost performance. Unlike models that depend on extra data or large foundation models, RoboBERT achieves a highly competitive success rate while using only language-labeled expert demonstrations and maintaining a relatively small model size. Specifically, RoboBERT achieves an average length of 4.52 on the CALVIN benchmark for the \(ABCD \rightarrow D\) task, setting a new state-of-the-art (SOTA) record. Furthermore, when tested on a real robot, the model demonstrates superior performance, achieving a higher success rate than other methods trained with the same data. We propose that the concepts and methodologies of RoboBERT demonstrate extensive versatility and compatibility, contributing significantly to the development of lightweight multimodal robotic models. The code can be accessed at https://github.com/PeterWangsicheng/RoboBERT </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07685">arXiv:2502.07685</a> <span> [<a href="https://arxiv.org/pdf/2502.07685">pdf</a>, <a href="https://arxiv.org/format/2502.07685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Matrix3D: Large Photogrammetry Model All-in-One </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yuanxun Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+T">Tian Fang</a>, <a href="/search/cs?searchtype=author&query=Nahmias%2C+J">Jean-Daniel Nahmias</a>, <a href="/search/cs?searchtype=author&query=Tsin%2C+Y">Yanghai Tsin</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+L">Long Quan</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xun Cao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiwei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2502.07685v1-abstract-full" style="display: inline;"> We present Matrix3D, a unified model that performs several photogrammetry subtasks, including pose estimation, depth prediction, and novel view synthesis, using the same model. Matrix3D utilizes a multi-modal diffusion transformer (DiT) to integrate transformations across several modalities, such as images, camera parameters, and depth maps. The key to Matrix3D's large-scale multi-modal training lies in the incorporation of a mask learning strategy. This enables full-modality model training even with partially complete data, such as bi-modality data of image-pose and image-depth pairs, and thus significantly increases the pool of available training data. Matrix3D demonstrates state-of-the-art performance in pose estimation and novel view synthesis tasks. Additionally, it offers fine-grained control through multi-round interactions, making it an innovative tool for 3D content creation. Project page: https://nju-3dv.github.io/projects/matrix3d. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://nju-3dv.github.io/projects/matrix3d</span> </p> </li>
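<p class="is-size-7">Editor's sketch (schematic, invented names): Matrix3D's mask learning strategy, as described in the abstract, lets one model train on whatever modality subsets a sample provides by masking the rest as the prediction target. Conceptually:</p>
<pre><code class="language-python">
import random

MODALITIES = ("image", "pose", "depth")

def make_training_example(sample: dict):
    """Keep the modalities a sample actually has; mask one of them as the
    prediction target, so partial (e.g. image-pose only) data still trains
    the full model. Purely illustrative of the abstract's description."""
    present = [m for m in MODALITIES if m in sample]
    target = random.choice(present)             # modality to reconstruct
    context = {m: sample[m] for m in present if m != target}
    return context, target                      # condition on context, denoise target

ctx, tgt = make_training_example({"image": "img_tokens", "pose": "pose_tokens"})
print(ctx, "->", tgt)
</code></pre>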
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07527">arXiv:2502.07527</a> <span> [<a href="https://arxiv.org/pdf/2502.07527">pdf</a>, <a href="https://arxiv.org/format/2502.07527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NatureLM: Deciphering the Language of Nature for Scientific Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yingce Xia</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peiran Jin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shufang Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chuan Cao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqian Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guoqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zequn Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuan-Jyue Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zekun Guo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yeqi Bai</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+P">Pan Deng</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yaosen Min</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Ziheng Lu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hongxia Hao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Han Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jielan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jia Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kehan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+K">Kaiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+Q">Qizhi Pei</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07527v1-abstract-full" style="display: inline;"> Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typically trained in isolation, lacking the ability to integrate across different scientific domains. Recognizing that entities within these domains can all be represented as sequences, which together form the "language of nature", we introduce Nature Language Model (briefly, NatureLM), a sequence-based science foundation model designed for scientific discovery. Pre-trained with data from multiple scientific domains, NatureLM offers a unified, versatile model that enables various applications including: (i) generating and optimizing small molecules, proteins, RNA, and materials using text instructions; (ii) cross-domain generation/design, such as protein-to-molecule and protein-to-RNA generation; and (iii) achieving state-of-the-art performance in tasks like SMILES-to-IUPAC translation and retrosynthesis on USPTO-50k. NatureLM offers a promising generalist approach for various scientific tasks, including drug discovery (hit generation/optimization, ADMET optimization, synthesis), novel material design, and the development of therapeutic proteins or nucleotides. We have developed NatureLM models in different sizes (1 billion, 8 billion, and 46.7 billion parameters) and observed a clear improvement in performance as the model size increases. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">81 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07414">arXiv:2502.07414</a> <span> [<a href="https://arxiv.org/pdf/2502.07414">pdf</a>, <a href="https://arxiv.org/format/2502.07414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sample Weight Averaging for Stable Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+H">Han Yu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yue He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Renzhe Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongbai Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiayin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+W">Wenchao Zou</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+P">Peng Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07414v1-abstract-full" style="display: inline;"> The challenge of Out-of-Distribution (OOD) generalization poses a foundational concern for the application of machine learning algorithms to risk-sensitive areas. Inspired by traditional importance weighting and propensity weighting methods, prior approaches employ an independence-based sample reweighting procedure. They aim at decorrelating covariates to counteract the bias introduced by spurious correlations between unstable variables and the outcome, thus enhancing generalization and fulfilling stable prediction under covariate shift. Nonetheless, these methods are prone to experiencing an inflation of variance, primarily attributable to the reduced efficacy in utilizing training samples during the reweighting process. Existing remedies necessitate either environmental labels or substantially higher time costs along with additional assumptions and supervised information. To mitigate this issue, we propose SAmple Weight Averaging (SAWA), a simple yet efficacious strategy that can be universally integrated into various sample reweighting algorithms to decrease the variance and coefficient estimation error, thus boosting covariate-shift generalization and achieving stable prediction across different environments. We prove its soundness and benefits theoretically. Experiments across synthetic and real-world datasets consistently underscore its superiority under covariate shift. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
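<p class="is-size-7">Editor's sketch (illustrative of the stated idea, not the authors' algorithm): SAWA averages the sample weights produced by several randomized runs of a reweighting procedure before fitting, trading a little bias for much lower variance. Schematically:</p>
<pre><code class="language-python">
import numpy as np

def sawa_fit(X, y, reweight, n_rounds=8, seed=0):
    """Average weights from `n_rounds` randomized reweighting runs, then
    solve a weighted least squares. `reweight(X, y, rng) -> weights` is any
    decorrelation-based sample reweighting routine (assumed given)."""
    rng = np.random.default_rng(seed)
    w = np.mean([reweight(X, y, rng) for _ in range(n_rounds)], axis=0)
    W = np.sqrt(w)[:, None]                   # weighted LS via row rescaling
    beta, *_ = np.linalg.lstsq(X * W, y * W[:, 0], rcond=None)
    return beta
</code></pre>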
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07384">arXiv:2502.07384</a> <span> [<a href="https://arxiv.org/pdf/2502.07384">pdf</a>, <a href="https://arxiv.org/format/2502.07384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> SAGEPhos: Sage Bio-Coupled and Augmented Fusion for Phosphorylation Site Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+H">Hanqun Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zijun Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaorui Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chunbin Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07384v1-abstract-full" style="display: inline;"> Phosphorylation site prediction based on kinase-substrate interaction plays a vital role in understanding cellular signaling pathways and disease mechanisms. Computational methods for this task can be categorized into kinase-family-focused and individual-kinase-targeted approaches. Individual-kinase-targeted methods have gained prominence for their ability to explore a broader protein space and provide more precise target information for kinase inhibitors. However, most existing individual-kinase-based approaches focus solely on sequence inputs, neglecting crucial structural information. To address this limitation, we introduce SAGEPhos (Structure-aware kinAse-substrate bio-coupled and bio-auGmented nEtwork for Phosphorylation site prediction), a novel framework that modifies the semantic space of main protein inputs using auxiliary inputs at two distinct modality levels. At the inter-modality level, SAGEPhos introduces a Bio-Coupled Modal Fusion method, distilling essential kinase sequence information to refine the task-oriented local substrate feature space and create a shared semantic space that captures crucial kinase-substrate interaction patterns. Within the substrate's intra-modality domain, it focuses on Bio-Augmented Fusion, emphasizing 2D local sequence information while selectively incorporating 3D spatial information from predicted structures to complement the sequence space. Moreover, to address the lack of structural information in current datasets, we contribute a new, refined phosphorylation site prediction dataset, which incorporates crucial structural elements and will serve as a new benchmark for the field. Experimental results demonstrate that SAGEPhos significantly outperforms baseline methods. We release the SAGEPhos models and code at https://github.com/ZhangJJ26/SAGEPhos. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended from ICLR2025</span> </p> </li>
arXiv:2502.07350 (https://arxiv.org/abs/2502.07350) [pdf, other] cs.AI
KABB: Knowledge-Aware Bayesian Bandits for Dynamic Expert Coordination in Multi-Agent Systems
Authors: Jusheng Zhang, Zimeng Huang, Yijia Fan, Ningyuan Liu, Mingyan Li, Zhuojie Yang, Jiawei Yao, Jian Wang, Keze Wang
Abstract: As scaling large language models faces prohibitive costs, multi-agent systems emerge as a promising alternative, though they are challenged by static knowledge assumptions and coordination inefficiencies. We introduce Knowledge-Aware Bayesian Bandits (KABB), a novel framework that enhances multi-agent system coordination through semantic understanding and dynamic adaptation. The framework features three key innovations: a three-dimensional knowledge distance model for deep semantic understanding, a dual-adaptation mechanism for continuous expert optimization, and a knowledge-aware Thompson Sampling strategy for efficient expert selection. Extensive evaluation demonstrates that KABB achieves an optimal cost-performance balance, maintaining high performance while keeping computational demands relatively low in multi-agent coordination.
Submitted 11 February, 2025; originally announced February 2025.
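A knowledge-aware Thompson Sampling step can be pictured with a toy sketch: each expert keeps a Beta posterior over its success rate, and sampled scores are discounted by a knowledge distance to the query. The Beta model and the multiplicative discount are illustrative assumptions; the paper's three-dimensional distance model is richer.

```python
# Toy knowledge-aware Thompson Sampling for expert selection.
import random

class Expert:
    def __init__(self, name, knowledge_distance):
        self.name = name
        self.alpha, self.beta = 1.0, 1.0    # Beta(1,1) prior on success rate
        self.distance = knowledge_distance  # 0 = perfectly on-topic for the query

def select_expert(experts):
    # Sample a plausible success rate, then discount by knowledge distance.
    scores = {e.name: random.betavariate(e.alpha, e.beta) * (1.0 - e.distance)
              for e in experts}
    return max(scores, key=scores.get)

def update(expert, success: bool):
    expert.alpha += 1.0 if success else 0.0
    expert.beta += 0.0 if success else 1.0

experts = [Expert("coder", 0.1), Expert("mathematician", 0.4)]
for _ in range(100):
    chosen = next(e for e in experts if e.name == select_expert(experts))
    update(chosen, success=random.random() < 0.6)
print(sorted((e.name, round(e.alpha / (e.alpha + e.beta), 2)) for e in experts))
```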
arXiv:2502.07323 (https://arxiv.org/abs/2502.07323) [pdf, other] cs.CV
Semantic to Structure: Learning Structural Representations for Infringement Detection
Authors: Chuanwei Huang, Zexi Jia, Hongyan Fei, Yeshuang Zhu, Zhiqiang Yuan, Jinchao Zhang, Jie Zhou
Abstract: Structural information in images is crucial for aesthetic assessment, and it is widely recognized in the artistic field that imitating the structure of other works significantly infringes on creators' rights. The advancement of diffusion models has led to AI-generated content imitating artists' structural creations, yet effective detection methods are still lacking. In this paper, we define this phenomenon as "structural infringement" and propose a corresponding detection method. Additionally, we develop quantitative metrics and create manually annotated datasets for evaluation: the SIA dataset of synthesized data and the SIR dataset of real data. Due to the current lack of datasets for structural infringement detection, we propose a new data synthesis strategy based on diffusion models and LLMs, successfully training a structural infringement detection model. Experimental results show that our method can successfully detect structural infringements and achieves notable improvements on annotated test sets.
Submitted 11 February, 2025; originally announced February 2025.
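To make concrete what comparing "structure" rather than semantics can mean, here is a naive baseline (explicitly not the paper's method) that scores the similarity of two images' edge maps:

```python
# Naive structural-similarity baseline: cosine similarity of gradient (edge)
# maps, illustrating the notion of shared structure between two images.
import numpy as np

def edge_map(img: np.ndarray) -> np.ndarray:
    # img: (H, W) grayscale in [0, 1]; simple finite-difference gradients.
    gy, gx = np.gradient(img.astype(np.float64))
    return np.hypot(gx, gy)

def structural_similarity(a: np.ndarray, b: np.ndarray) -> float:
    ea, eb = edge_map(a).ravel(), edge_map(b).ravel()
    denom = np.linalg.norm(ea) * np.linalg.norm(eb) + 1e-12
    return float(ea @ eb / denom)

rng = np.random.default_rng(0)
img = rng.random((64, 64))
print(structural_similarity(img, img))                    # ~1.0: same structure
print(structural_similarity(img, rng.random((64, 64))))   # lower for unrelated
```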
arXiv:2502.06892 (https://arxiv.org/abs/2502.06892) [pdf, other] cs.LG cs.AI
Certifying Language Model Robustness with Fuzzed Randomized Smoothing: An Efficient Defense Against Backdoor Attacks
Authors: Bowei He, Lihao Yin, Hui-Ling Zhen, Jianping Zhang, Lanqing Hong, Mingxuan Yuan, Chen Ma
Abstract: The widespread deployment of pre-trained language models (PLMs) has exposed them to textual backdoor attacks, particularly those planted during the pre-training stage. These attacks pose significant risks to high-reliability applications, as they can stealthily affect multiple downstream tasks. While certifying robustness against such threats is crucial, existing defenses struggle with the high-dimensional, interdependent nature of textual data and the lack of access to the original poisoned pre-training data. To address these challenges, we introduce Fuzzed Randomized Smoothing (FRS), a novel approach for efficiently certifying language model robustness against backdoor attacks. FRS integrates software robustness certification techniques with biphased model parameter smoothing, employing Monte Carlo tree search for proactive fuzzing to identify vulnerable textual segments within the Damerau-Levenshtein space. This allows for targeted and efficient text randomization while eliminating the need for access to poisoned training data during model smoothing. Our theoretical analysis demonstrates that FRS achieves a broader certified robustness radius compared to existing methods. Extensive experiments across various datasets, model configurations, and attack strategies validate FRS's superiority in terms of defense efficiency, accuracy, and robustness.
Submitted 9 February, 2025; originally announced February 2025.
Comments: Accepted by ICLR 2025
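For intuition, the classic randomized-smoothing recipe that FRS builds on is a majority vote over randomized perturbations of the input. The sketch below shows only that textbook core with a toy classifier; FRS's MCTS-guided fuzzing and biphased parameter smoothing are omitted.

```python
# Generic randomized smoothing for text classification: majority vote over
# random character-level perturbations (which approximate edits in the
# Damerau-Levenshtein space).
import random
from collections import Counter

ALPHABET = "abcdefghijklmnopqrstuvwxyz "

def perturb(text: str, rate: float = 0.05) -> str:
    return "".join(random.choice(ALPHABET) if random.random() < rate else c
                   for c in text)

def smoothed_predict(classify, text: str, n: int = 100) -> str:
    votes = Counter(classify(perturb(text)) for _ in range(n))
    return votes.most_common(1)[0][0]

# Toy classifier standing in for a PLM: flags texts containing "free money".
toy = lambda t: "spam" if "free money" in t else "ham"
print(smoothed_predict(toy, "claim your free money now"))
```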
arXiv:2502.06868 (https://arxiv.org/abs/2502.06868) [pdf, other] cs.CL cs.AI
Related Knowledge Perturbation Matters: Rethinking Multiple Pieces of Knowledge Editing in Same-Subject
Authors: Zenghao Duan, Wenbin Duan, Zhiyi Yin, Yinghan Shen, Shaoling Jing, Jie Zhang, Huawei Shen, Xueqi Cheng
Abstract: Knowledge editing has become a promising approach for efficiently and precisely updating knowledge embedded in large language models (LLMs). In this work, we focus on Same-Subject Editing, which involves modifying multiple attributes of a single entity to ensure comprehensive and consistent updates to entity-centric knowledge. Through preliminary observation, we identify a significant challenge: current state-of-the-art editing methods struggle when tasked with editing multiple related knowledge pieces for the same subject. To address the lack of relevant editing data for identical subjects in traditional benchmarks, we introduce the $\text{S}^2\text{RKE}$ (Same-Subject Related Knowledge Editing) benchmark. Our extensive experiments reveal that only mainstream locate-then-edit methods, such as ROME and MEMIT, exhibit "related knowledge perturbation," where subsequent edits interfere with earlier ones. Further analysis reveals that these methods over-rely on subject information, neglecting other critical factors, resulting in reduced editing effectiveness.
Submitted 7 February, 2025; originally announced February 2025.
Comments: Accepted by NAACL 2025
arXiv:2502.06855 (https://arxiv.org/abs/2502.06855) [pdf, other] cs.CL cs.AI cs.LG
Self-Supervised Prompt Optimization
Authors: Jinyu Xiang, Jiayi Zhang, Zhaoyang Yu, Fengwei Teng, Jinhao Tu, Xinbing Liang, Sirui Hong, Chenglin Wu, Yuyu Luo
Abstract: Well-designed prompts are crucial for enhancing large language models' (LLMs) reasoning capabilities while aligning their outputs with task requirements across diverse domains. However, manually designed prompts require expertise and iterative experimentation. While existing prompt optimization methods aim to automate this process, they rely heavily on external references such as ground truth or human feedback, limiting their applicability in real-world scenarios where such data is unavailable or costly to obtain. To address this, we propose Self-Supervised Prompt Optimization (SPO), a cost-efficient framework that discovers effective prompts for both closed- and open-ended tasks without requiring external references. Motivated by the observations that prompt quality manifests directly in LLM outputs and that LLMs can effectively assess adherence to task requirements, we derive evaluation and optimization signals purely from output comparisons. Specifically, SPO selects superior prompts through pairwise output comparisons evaluated by an LLM evaluator, followed by an LLM optimizer that aligns outputs with task requirements. Extensive experiments demonstrate that SPO outperforms state-of-the-art prompt optimization methods, achieving comparable or superior results with significantly lower costs (e.g., 1.1% to 5.6% of existing methods) and fewer samples (e.g., three samples). The code is available at https://github.com/geekan/MetaGPT.
Submitted 7 February, 2025; originally announced February 2025.
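The evaluate-then-optimize loop can be summarized in a few lines. The helpers below (llm_execute, llm_compare, llm_optimize) are toy stand-ins for LLM calls, not the released MetaGPT API:

```python
# Minimal sketch of a pairwise-comparison prompt optimization loop.
import random

def llm_execute(prompt, sample):   # toy: output "quality" grows with prompt detail
    return len(prompt) + random.random()

def llm_compare(a, b):             # toy judge: prefer the higher-scoring output
    return "a" if a >= b else "b"

def llm_optimize(prompt, outputs): # toy optimizer: propose a refined prompt
    return prompt + " Be concise and follow the task requirements."

def optimize_prompt(samples, seed_prompt, rounds=5):
    best = seed_prompt
    best_out = [llm_execute(best, s) for s in samples]
    for _ in range(rounds):
        cand = llm_optimize(best, best_out)
        cand_out = [llm_execute(cand, s) for s in samples]
        wins = sum(llm_compare(c, b) == "a" for c, b in zip(cand_out, best_out))
        if wins > len(samples) / 2:   # pairwise vote: keep the better prompt
            best, best_out = cand, cand_out
    return best

print(optimize_prompt(["q1", "q2", "q3"], "Answer the question."))
```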
arXiv:2502.06846 (https://arxiv.org/abs/2502.06846) [pdf, other] cs.LG cs.AI q-bio.BM
Prot2Chat: Protein LLM with Early Fusion of Sequence and Structure
Authors: Zhicong Wang, Zicheng Ma, Ziqiang Cao, Changlong Zhou, Jun Zhang, Yiqin Gao
Abstract: Proteins play a pivotal role in living organisms, yet understanding their functions presents significant challenges, including the limited flexibility of classification-based methods, the inability to effectively leverage spatial structural information, and the lack of systematic evaluation metrics for protein Q&A systems. To address these limitations, we propose Prot2Chat, a novel framework that integrates multimodal protein representations with natural language through a unified module, enabling large language model (LLM)-driven answer generation. Our model incorporates a modified ProteinMPNN encoder, which encodes protein sequence and structural information in a unified manner, a protein-text adapter with cross-attention mechanisms, and a LLaMA3 decoder. To optimize training efficiency, we freeze the encoder and employ LoRA techniques for the decoder. We conducted experiments on two datasets; both automated metrics and expert evaluations demonstrate the superior performance of our model. Furthermore, zero-shot prediction results highlight its strong generalization capabilities. This framework offers a promising solution for bridging protein domain knowledge with natural language understanding, paving the way for transformative advancements in protein-related research.
Submitted 7 February, 2025; originally announced February 2025.
Comments: 9 pages, 2 figures
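The frozen-encoder-plus-LoRA-decoder recipe mentioned above is a standard parameter-efficient pattern. A minimal LoRA linear layer, with illustrative shapes and hyperparameters, looks like this:

```python
# Freeze a pretrained weight and learn only a low-rank (LoRA) update.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False          # frozen pretrained weight
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(512, 512))
print(layer(torch.randn(2, 512)).shape)      # torch.Size([2, 512])
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(trainable)                             # only the low-rank factors train
```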
arXiv:2502.06817 (https://arxiv.org/abs/2502.06817) [pdf, other] eess.IV cs.GR cs.LG
Diffusion-empowered AutoPrompt MedSAM
Authors: Peng Huang, Shu Hu, Bo Peng, Jiashu Zhang, Hongtu Zhu, Xi Wu, Xin Wang
Abstract: MedSAM, a medical foundation model derived from the SAM architecture, has demonstrated notable success across diverse medical domains. However, its clinical application faces two major challenges: the dependency on labor-intensive manual prompt generation, which imposes a significant burden on clinicians, and the absence of semantic labeling in the generated segmentation masks for organs or lesions, limiting its practicality for non-expert users. To address these limitations, we propose AutoMedSAM, an end-to-end framework derived from SAM, designed to enhance usability and segmentation performance. AutoMedSAM retains MedSAM's image encoder and mask decoder structure while introducing a novel diffusion-based class prompt encoder. The diffusion-based encoder employs a dual-decoder structure to collaboratively generate prompt embeddings guided by sparse and dense prompt definitions. These embeddings enhance the model's ability to understand and process clinical imagery autonomously. With this encoder, AutoMedSAM leverages class prompts to embed semantic information into the model's predictions, transforming MedSAM's semi-automated pipeline into a fully automated workflow. Furthermore, AutoMedSAM employs an uncertainty-aware joint optimization strategy during training to effectively inherit MedSAM's pre-trained knowledge while improving generalization by integrating multiple loss functions. Experimental results across diverse datasets demonstrate that AutoMedSAM achieves superior performance while broadening its applicability to both clinical settings and non-expert users. Code is available at https://github.com/HP-ML/AutoPromptMedSAM.git.
Submitted 4 February, 2025; originally announced February 2025.
arXiv:2502.06805 (https://arxiv.org/abs/2502.06805) [pdf, other] cs.LG cs.GR
Efficient Diffusion Models: A Survey
Authors: Hui Shen, Jingxuan Zhang, Boning Xiong, Rui Hu, Shoufa Chen, Zhongwei Wan, Xin Wang, Yu Zhang, Zixuan Gong, Guangyin Bao, Chaofan Tao, Yongfeng Huang, Ye Yuan, Mi Zhang
Abstract: Diffusion models have emerged as powerful generative models capable of producing high-quality content such as images, videos, and audio, demonstrating their potential to revolutionize digital content creation. However, these capabilities come at the cost of significant computational resources and lengthy generation times, underscoring the critical need to develop efficient techniques for practical deployment. In this survey, we provide a systematic and comprehensive review of research on efficient diffusion models. We organize the literature in a taxonomy consisting of three main categories, covering distinct yet interconnected efficient diffusion model topics from the algorithm-level, system-level, and framework perspectives, respectively. We have also created a GitHub repository where we organize the papers featured in this survey at https://github.com/AIoT-MLSys-Lab/Efficient-Diffusion-Model-Survey. We hope our survey can serve as a valuable resource to help researchers and practitioners gain a systematic understanding of efficient diffusion model research and inspire them to contribute to this important and exciting field.
Submitted 3 February, 2025; originally announced February 2025.
arXiv:2502.06663 (https://arxiv.org/abs/2502.06663) [pdf, other] cs.LG
EfficientLLM: Scalable Pruning-Aware Pretraining for Architecture-Agnostic Edge Language Models
Authors: Xingrun Xing, Zheng Liu, Shitao Xiao, Boyan Gao, Yiming Liang, Wanpeng Zhang, Haokun Lin, Guoqi Li, Jiajun Zhang
Abstract: Modern large language models (LLMs), driven by scaling laws, achieve emergent intelligence at large model sizes. Recently, increasing concerns about cloud costs, latency, and privacy have made it urgent to develop compact edge language models. Distinguished from direct pretraining, which is bounded by the scaling law, this work proposes pruning-aware pretraining, focusing on retaining the performance of much larger optimized models. It features the following characteristics: 1) Data-scalable: we introduce minimal parameter groups in the LLM and continuously optimize structural pruning, extending post-training pruning methods like LLM-Pruner and SparseGPT into the pretraining phase. 2) Architecture-agnostic: the LLM architecture is auto-designed using saliency-driven pruning, which for the first time exceeds SoTA human-designed LLMs in modern pretraining. We reveal that it achieves top-quality edge language models, termed EfficientLLM, by scaling up LLM compression and extending its boundary. EfficientLLM significantly outperforms SoTA baselines with $100M \sim 1B$ parameters, such as MobileLLM, SmolLM, Qwen2.5-0.5B, OLMo-1B, and Llama3.2-1B, on common sense benchmarks. As a first attempt, EfficientLLM bridges the performance gap between traditional LLM compression and direct pretraining methods; we will fully open-source the work at https://github.com/Xingrun-Xing2/EfficientLLM.
Submitted 11 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
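Saliency-driven structured pruning, the ingredient that auto-designs the architecture here, can be sketched with a first-order Taylor criterion. The criterion and keep ratio below are assumptions for illustration; the paper's exact saliency measure may differ.

```python
# Illustrative saliency-driven structured pruning step: score each output
# channel by |weight * gradient| and keep only the strongest channels.
import torch

def prune_channels(weight: torch.Tensor, grad: torch.Tensor, keep_ratio: float = 0.75):
    # weight, grad: (out_channels, in_features); first-order Taylor saliency.
    saliency = (weight * grad).abs().sum(dim=1)
    k = max(1, int(keep_ratio * weight.shape[0]))
    keep = torch.topk(saliency, k).indices.sort().values
    return weight[keep], keep

w = torch.randn(8, 16, requires_grad=True)
loss = w.sum() ** 2
loss.backward()
pruned, kept = prune_channels(w.detach(), w.grad)
print(pruned.shape, kept.tolist())
```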
arXiv:2502.06557 (https://arxiv.org/abs/2502.06557) [pdf, other] cs.IR
LiveForesighter: Generating Future Information for Live-Streaming Recommendations at Kuaishou
Authors: Yucheng Lu, Jiangxia Cao, Xu Kuan, Wei Cheng, Wei Jiang, Jiaming Zhang, Yang Shuang, Liu Zhaojie, Liyin Hong
Abstract: Live-streaming, as a new-generation medium connecting users and authors, has attracted substantial attention and experienced rapid growth in recent years. Compared with content-static short-video recommendation, live-streaming recommendation faces more challenges in giving users a satisfactory experience: (1) live-streaming content changes dynamically over time; (2) valuable behaviors (e.g., sending digital gifts, buying products) usually require users to watch for a long time (>10 min). Together, these two attributes raise a challenging question for live-streaming recommendation: how can we discover the live-streamings whose content a user is interested in at the current moment, and further over a period in the future?
Submitted 10 February, 2025; originally announced February 2025.
Comments: Work in progress
arXiv:2502.06116 (https://arxiv.org/abs/2502.06116) [pdf] physics.ins-det cs.CV
Event Vision Sensor: A Review
Authors: Xinyue Qin, Junlin Zhang, Wenzhong Bao, Chun Lin, Honglei Chen
Abstract: By monitoring temporal contrast, event-based vision sensors can provide high temporal resolution and low latency while maintaining low power consumption and simplicity in circuit structure. These characteristics have garnered significant attention in both academia and industry. In recent years, the application of back-illuminated (BSI) technology, wafer stacking techniques, and industrial interfaces has brought new opportunities for enhancing the performance of event-based vision sensors. This is evident in the substantial advancements made in reducing noise, improving resolution, and increasing readout rates. Additionally, the integration of these technologies has enhanced the compatibility of event-based vision sensors with current and edge vision systems, providing greater possibilities for their practical applications. This paper reviews the progression from neuromorphic engineering to state-of-the-art event-based vision sensor technologies, including their development trends, operating principles, and key features. Moreover, we delve into the sensitivity of event-based vision sensors and the opportunities and challenges they face in the realm of infrared imaging, providing references for future research and applications.
Submitted 9 February, 2025; originally announced February 2025.
arXiv:2502.06019 (https://arxiv.org/abs/2502.06019) [pdf, other] cs.CV
Noise is an Efficient Learner for Zero-Shot Vision-Language Models
Authors: Raza Imam, Asif Hanif, Jian Zhang, Khaled Waleed Dawoud, Yova Kementchedjhieva, Mohammad Yaqub
Abstract: Recently, test-time adaptation has garnered attention as a method for tuning models without labeled data. The conventional modus operandi for adapting pre-trained vision-language models (VLMs) during test time primarily focuses on tuning learnable prompts; however, this approach overlooks potential distribution shifts in the visual representations themselves. In this work, we address this limitation by introducing Test-Time Noise Tuning (TNT), a novel method for handling unpredictable shifts in the visual space. TNT leverages, for the first time, a noise adaptation strategy that optimizes learnable noise directly in the visual input space, enabling adaptive feature learning from a single test sample. We further introduce a novel approach for inter-view representation alignment by explicitly enforcing coherence in embedding distances, ensuring consistent feature representations across views. Combined with scaled logits and confident view selection at inference, TNT substantially enhances VLM generalization and calibration, achieving average gains of +7.38% on natural distribution benchmarks and +0.80% on cross-dataset evaluations over zero-shot CLIP. These improvements lay a strong foundation for adaptive out-of-distribution handling.
Submitted 9 February, 2025; originally announced February 2025.
Comments: Our code is available at https://github.com/Razaimam45/TNT
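The core move, optimizing learnable noise in the input space at test time, can be sketched as follows. The entropy-minimization objective and the toy model here are stand-in assumptions for TNT's full multi-view objective:

```python
# Sketch: learn an additive noise tensor on a single test image so that the
# model's prediction becomes more confident (entropy minimization).
import torch
import torch.nn.functional as F

def tune_noise(model, image, steps=10, lr=0.005):
    noise = torch.zeros_like(image, requires_grad=True)
    opt = torch.optim.Adam([noise], lr=lr)
    for _ in range(steps):
        logits = model(image + noise)
        probs = F.softmax(logits, dim=-1)
        entropy = -(probs * probs.clamp_min(1e-8).log()).sum(-1).mean()
        opt.zero_grad()
        entropy.backward()
        opt.step()
    return image + noise.detach()

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
adapted = tune_noise(model, torch.randn(1, 3, 32, 32))
print(adapted.shape)  # torch.Size([1, 3, 32, 32])
```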
arXiv:2502.06018 (https://arxiv.org/abs/2502.06018) [pdf, other] cs.LG cs.AI
Kolmogorov-Arnold Fourier Networks
Authors: Jusheng Zhang, Yijia Fan, Kaitong Cai, Keze Wang
Abstract: Although Kolmogorov-Arnold based interpretable networks (KAN) have strong theoretical expressiveness, they face significant parameter explosion and high-frequency feature capture challenges in high-dimensional tasks. To address this issue, we propose the Kolmogorov-Arnold-Fourier Network (KAF), which effectively integrates trainable Random Fourier Features (RFF) and a novel hybrid GELU-Fourier activation mechanism to balance parameter efficiency and spectral representation capabilities. Our key technical contributions include: (1) merging KAN's dual-matrix structure through matrix association properties to substantially reduce parameters; (2) introducing learnable RFF initialization strategies to eliminate spectral distortion in high-dimensional approximation tasks; (3) implementing an adaptive hybrid activation function that progressively enhances frequency representation during the training process. Comprehensive experiments demonstrate the superiority of our KAF across various domains including vision, NLP, audio processing, and differential equation-solving tasks, effectively combining theoretical interpretability with practical utility and computational efficiency.
Submitted 9 February, 2025; originally announced February 2025.
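A minimal sketch of trainable Random Fourier Features combined with a learned GELU-Fourier blend is given below; the mixing scheme, shapes, and initialization are illustrative assumptions rather than the KAF architecture:

```python
# Trainable RFF layer with a learned blend between GELU and raw Fourier output.
import torch
import torch.nn as nn
import torch.nn.functional as F

class FourierLayer(nn.Module):
    def __init__(self, d_in, d_out, n_feats=64):
        super().__init__()
        self.W = nn.Parameter(torch.randn(d_in, n_feats))  # trainable RFF frequencies
        self.proj = nn.Linear(2 * n_feats, d_out)
        self.mix = nn.Parameter(torch.tensor(0.0))         # learned GELU/Fourier blend

    def forward(self, x):
        z = x @ self.W
        fourier = torch.cat([torch.sin(z), torch.cos(z)], dim=-1)
        out = self.proj(fourier)
        gate = torch.sigmoid(self.mix)                     # 0 -> GELU, 1 -> identity
        return (1 - gate) * F.gelu(out) + gate * out

layer = FourierLayer(16, 32)
print(layer(torch.randn(4, 16)).shape)  # torch.Size([4, 32])
```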
arXiv:2502.05905 (https://arxiv.org/abs/2502.05905) [pdf, other] cs.CV
QP-SNN: Quantized and Pruned Spiking Neural Networks
Authors: Wenjie Wei, Malu Zhang, Zijian Zhou, Ammar Belatreche, Yimeng Shan, Yu Liang, Honglin Cao, Jieyuan Zhang, Yang Yang
Abstract: Brain-inspired Spiking Neural Networks (SNNs) leverage sparse spikes to encode information and operate in an asynchronous event-driven manner, offering a highly energy-efficient paradigm for machine intelligence. However, the current SNN community focuses primarily on performance improvement by developing large-scale models, which limits the applicability of SNNs in resource-limited edge devices. In this paper, we propose a hardware-friendly and lightweight SNN, aimed at effectively deploying high-performance SNNs in resource-limited scenarios. Specifically, we first develop a baseline model that integrates uniform quantization and structured pruning, called the QP-SNN baseline. While this baseline significantly reduces storage demands and computational costs, it suffers from performance decline. To address this, we conduct an in-depth analysis of the challenges in quantization and pruning that lead to performance degradation and propose solutions to enhance the baseline's performance. For weight quantization, we propose a weight rescaling strategy that utilizes bit width more effectively to enhance the model's representation capability. For structured pruning, we propose a novel pruning criterion using the singular value of spatiotemporal spike activities to enable more accurate removal of redundant kernels. Extensive experiments demonstrate that integrating the two proposed methods into the baseline allows QP-SNN to achieve state-of-the-art performance and efficiency, underscoring its potential for enhancing SNN deployment in edge intelligence computing.
Submitted 9 February, 2025; originally announced February 2025.
Comments: 26 pages, 17 figures, Published as a conference paper at ICLR 2025
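The weight-rescaling idea, using the full integer range of a given bit width before uniform quantization, can be illustrated in a few lines (a generic sketch, not the paper's exact strategy):

```python
# Uniform weight quantization with a per-layer rescaling factor chosen so the
# largest-magnitude weight maps to the maximum integer code.
import torch

def rescaled_quantize(w: torch.Tensor, bits: int = 4):
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().max() / qmax   # rescale so the full bit width is exercised
    q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
    return q * scale, q            # dequantized weights, integer codes

w = torch.randn(64)
wq, codes = rescaled_quantize(w)
print(codes.unique().numel(), float((w - wq).abs().max()))
```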
arXiv:2502.05853 (https://arxiv.org/abs/2502.05853) [pdf, ps, other] cs.IT
Zak-Transform-Induced Optimal Sequences and Their Applications in OTFS
Authors: Xiuping Peng, Congying Wu, Zilong Liu, Chunlei Li, Jianye Zhang, Pingzhi Fan
Abstract: This paper introduces a novel finite Zak transform (FZT)-aided framework for constructing multiple zero-correlation zone (ZCZ) sequence sets with optimal correlation properties. Specifically, each sequence is perfect with zero auto-correlation sidelobes, each ZCZ sequence set meets the Tang-Fan-Matsufuji bound with equality, and the maximum inter-set cross-correlation of multiple sequence sets meets the Sarwate bound with equality. Our study shows that these sequences can be sparsely expressed in the Zak domain through properly selected index and phase matrices. In particular, it is found that the maximum inter-set cross-correlation beats the Sarwate bound if every index matrix is a circular Florentine array. Several construction methods for multiple ZCZ sequence sets are proposed, demonstrating both optimality and high flexibility. Additionally, it is shown that excellent synchronization performance can be achieved by the proposed sequences in orthogonal-time-frequency-space (OTFS) systems.
Submitted 9 February, 2025; originally announced February 2025.
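For reference, one common definition of the finite Zak transform of a length-$N = LM$ sequence $x$ is $Z[n,m] = \sum_{k=0}^{M-1} x[n+kL]\, e^{-2\pi i km/M}$; indexing conventions vary across the literature, and the sketch below follows this one:

```python
# Finite Zak transform under the convention stated above (NumPy).
import numpy as np

def finite_zak(x: np.ndarray, L: int) -> np.ndarray:
    N = len(x)
    assert N % L == 0
    M = N // L
    # Row n of the reshaped-and-transposed array collects x[n], x[n+L], ...;
    # the DFT along that axis yields Z[n, m], shape (L, M).
    return np.fft.fft(x.reshape(M, L).T, axis=1)

x = np.exp(2j * np.pi * np.arange(16) ** 2 / 16)  # a chirp-like sequence
Z = finite_zak(x, L=4)
print(Z.shape)  # (4, 4)
```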
arXiv:2502.05812 [pdf, other] (cs.IT, eess.SY)
Multi-Agent Reinforcement Learning in Wireless Distributed Networks for 6G
Authors: Jiayi Zhang, Ziheng Liu, Yiyang Zhu, Enyu Shi, Bokai Xu, Chau Yuen, Dusit Niyato, Mérouane Debbah, Shi Jin, Bo Ai, Xuemin Shen
Abstract: The introduction of intelligent interconnectivity between the physical and human worlds has attracted great attention for future sixth-generation (6G) networks, emphasizing massive capacity, ultra-low latency, and unparalleled reliability. Wireless distributed networks and multi-agent reinforcement learning (MARL), both of which have evolved from centralized paradigms, are two promising solutions to these demands. Given their distinct capabilities, such as decentralization and collaborative mechanisms, integrating the two paradigms holds great promise for unleashing the full power of 6G and has attracted significant research and development attention. This paper provides a comprehensive study of MARL-assisted wireless distributed networks for 6G. In particular, we introduce the basic mathematical background and evolution of wireless distributed networks and MARL, and demonstrate their interrelationships. Subsequently, we analyze different structures of wireless distributed networks from homogeneous and heterogeneous perspectives. Furthermore, we introduce the basic concepts of MARL and discuss two typical categories, model-based and model-free. We then present critical challenges faced by MARL-assisted wireless distributed networks, providing important guidance and insights for actual implementation. We also explore the interplay between MARL-assisted wireless distributed networks and emerging techniques, such as the information bottleneck and mirror learning, delivering in-depth analyses and application scenarios. Finally, we outline several compelling research directions for future MARL-assisted wireless distributed networks.
Submitted 9 February, 2025; originally announced February 2025.
arXiv:2502.05788 [pdf, other] (cs.CV, cs.AI)
EPBC-YOLOv8: An efficient and accurate improved YOLOv8 underwater detector based on an attention mechanism
Authors: Xing Jiang, Xiting Zhuang, Jisheng Chen, Jian Zhang
Abstract: In this study, we enhance underwater target detection by integrating channel and spatial attention into YOLOv8's backbone, applying pointwise convolution in FasterNeXt for the FasterPW model, and leveraging weighted concatenation in a BiFPN-inspired WFPN structure for improved cross-scale connections and robustness. Utilizing CARAFE for refined feature reassembly, our framework addresses underwater image degradation, achieving mAP@0.5 scores of 76.7% and 79.0% on the URPC2019 and URPC2020 datasets, respectively. These scores are 2.3% and 0.7% higher than those of the original YOLOv8, showcasing enhanced precision in detecting marine organisms.
Submitted 9 February, 2025; originally announced February 2025.
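The channel-and-spatial attention referred to above is in the spirit of CBAM-style blocks; the PyTorch sketch below is purely illustrative (the layer sizes, the 7x7 spatial kernel, and the placement inside YOLOv8's backbone are assumptions, not the paper's exact design).

```python
import torch
import torch.nn as nn

class ChannelSpatialAttention(nn.Module):
    """CBAM-style block: reweight channels first, then spatial positions."""

    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        # Channel attention: squeeze spatial dims, excite channels.
        self.channel_mlp = nn.Sequential(
            nn.Conv2d(channels, channels // reduction, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, channels, kernel_size=1),
        )
        # Spatial attention: a 7x7 conv over pooled channel descriptors.
        self.spatial_conv = nn.Conv2d(2, 1, kernel_size=7, padding=3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Shared MLP over average- and max-pooled channel statistics.
        avg = x.mean(dim=(2, 3), keepdim=True)
        mx = x.amax(dim=(2, 3), keepdim=True)
        x = x * torch.sigmoid(self.channel_mlp(avg) + self.channel_mlp(mx))
        # Spatial gate from per-position channel statistics.
        avg_map = x.mean(dim=1, keepdim=True)
        max_map = x.amax(dim=1, keepdim=True)
        attn = torch.sigmoid(self.spatial_conv(torch.cat([avg_map, max_map], dim=1)))
        return x * attn
```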
arXiv:2502.05766 [pdf, other] (eess.AS, cs.SD)
Audio-Visual Representation Learning via Knowledge Distillation from Speech Foundation Models
Authors: Jing-Xuan Zhang, Genshun Wan, Jianqing Gao, Zhen-Hua Ling
Abstract: Audio-visual representation learning is crucial for advancing multimodal speech processing tasks, such as lipreading and audio-visual speech recognition. Recently, speech foundation models (SFMs) have shown remarkable generalization capabilities across various speech-related tasks. Building on this progress, we propose an audio-visual representation learning model that leverages cross-modal knowledge distillation from SFMs. In our method, SFMs serve as teachers, from which multi-layer hidden representations are extracted using clean audio inputs. We also introduce a multi-teacher ensemble method to distill the student, which receives audio-visual data as inputs. A novel representational knowledge distillation loss is employed to train the student during pretraining, and is also applied during finetuning to further enhance performance on downstream tasks. Our experiments utilized both a self-supervised SFM, WavLM, and a supervised SFM, iFLYTEK-speech. The results demonstrate that the proposed method achieves superior or at least comparable performance to previous state-of-the-art baselines across automatic speech recognition, visual speech recognition, and audio-visual speech recognition tasks. Comprehensive ablation studies and visualizations of the learned representations were also conducted to evaluate the effectiveness of the proposed method.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05766v1-abstract-full').style.display = 'none'; document.getElementById('2502.05766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to Pattern Recognition</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05629">arXiv:2502.05629</a> <span> [<a href="https://arxiv.org/pdf/2502.05629">pdf</a>, <a href="https://arxiv.org/format/2502.05629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> TrackDiffuser: Nearly Model-Free Bayesian Filtering with Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangguang He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minzhe Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Juan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05629v1-abstract-short" style="display: inline;"> State estimation remains a fundamental challenge across numerous domains, from autonomous driving, aircraft tracking to quantum system control. Although Bayesian filtering has been the cornerstone solution, its classical model-based paradigm faces two major limitations: it struggles with inaccurate state space model (SSM) and requires extensive prior knowledge of noise characteristics. We present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05629v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05629v1-abstract-full" style="display: none;"> State estimation remains a fundamental challenge across numerous domains, from autonomous driving, aircraft tracking to quantum system control. Although Bayesian filtering has been the cornerstone solution, its classical model-based paradigm faces two major limitations: it struggles with inaccurate state space model (SSM) and requires extensive prior knowledge of noise characteristics. We present TrackDiffuser, a generative framework addressing both challenges by reformulating Bayesian filtering as a conditional diffusion model. 
arXiv:2502.05629 [pdf, other] (cs.LG, eess.SP)
TrackDiffuser: Nearly Model-Free Bayesian Filtering with Diffusion Model
Authors: Yangguang He, Wenhao Li, Minzhe Li, Juan Zhang, Xiangfeng Wang, Bo Jin
Abstract: State estimation remains a fundamental challenge across numerous domains, from autonomous driving and aircraft tracking to quantum system control. Although Bayesian filtering has been the cornerstone solution, its classical model-based paradigm faces two major limitations: it struggles with inaccurate state space models (SSMs) and requires extensive prior knowledge of noise characteristics. We present TrackDiffuser, a generative framework addressing both challenges by reformulating Bayesian filtering as a conditional diffusion model. Our approach implicitly learns system dynamics from data to mitigate the effects of inaccurate SSMs, while simultaneously circumventing the need for explicit measurement models and noise priors by establishing a direct relationship between measurements and states. Through an implicit predict-and-update mechanism, TrackDiffuser preserves the interpretability advantage of traditional model-based filtering methods. Extensive experiments demonstrate that our framework substantially outperforms both classical and contemporary hybrid methods, especially in challenging non-linear scenarios involving non-Gaussian noise. Notably, TrackDiffuser exhibits remarkable robustness to SSM inaccuracies, offering a practical solution for real-world state estimation problems where perfect models and prior knowledge are unavailable.
Submitted 8 February, 2025; originally announced February 2025.
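"Reformulating Bayesian filtering as a conditional diffusion model" roughly means training a denoiser over states with the measurement sequence as conditioning; a schematic DDPM-style training step is sketched below. Every detail here (shapes, noise schedule, denoiser interface) is an assumption for illustration, not TrackDiffuser itself.

```python
import torch
import torch.nn.functional as F

def diffusion_filter_loss(denoiser, x0, y, alphas_bar):
    """One conditional-diffusion training step for state estimation.

    x0: (B, state_dim) ground-truth states.
    y:  (B, obs_len, obs_dim) measurement windows used as conditioning.
    alphas_bar: (T,) cumulative products of the noise schedule.
    """
    B = x0.shape[0]
    t = torch.randint(0, len(alphas_bar), (B,), device=x0.device)
    a = alphas_bar[t].unsqueeze(-1)
    eps = torch.randn_like(x0)
    x_t = a.sqrt() * x0 + (1.0 - a).sqrt() * eps  # forward noising process
    eps_hat = denoiser(x_t, t, y)                 # denoiser sees measurements
    return F.mse_loss(eps_hat, eps)               # standard DDPM objective
```

At inference, iterative denoising from random noise, conditioned on incoming measurements, would play the role of the predict-and-update cycle.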
arXiv:2502.05567 [pdf, other] (cs.CL, cs.AI, cs.LG)
ATLAS: Autoformalizing Theorems through Lifting, Augmentation, and Synthesis of Data
Authors: Xiaoyang Liu, Kangjie Bao, Jiashuo Zhang, Yunqi Liu, Yu Chen, Yuntian Liu, Yang Jiao, Tao Luo
Abstract: Autoformalization, the process of automatically translating natural language mathematics into machine-verifiable formal language, has demonstrated advancements with the progress of large language models (LLMs). However, a key obstacle to further advancement is the scarcity of paired datasets that align natural language with formal language. To address this challenge, we introduce ATLAS (Autoformalizing Theorems through Lifting, Augmentation, and Synthesis of Data), an iterative data generation framework designed to produce large-scale, high-quality parallel theorem statements. Running ATLAS for 10 iterations, we construct an undergraduate-level dataset comprising 300k theorem statements and develop the ATLAS translator, achieving accuracies of 80.59% (pass@8) and 92.99% (pass@128) on ProofNet, significantly outperforming the base model (23.99% and 47.17%) and InternLM2-Math-Plus-7B (50.94% and 80.32%). Furthermore, the ATLAS translator also achieves state-of-the-art performance on both the high-school-level miniF2F dataset and the graduate-level MathQual dataset introduced in this work. The datasets, model, and code will be released to the public soon.
Submitted 8 February, 2025; originally announced February 2025.
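For context, pass@k figures such as the ones above are conventionally computed with the unbiased estimator from the code-generation literature: draw $n \ge k$ samples per problem, count the $c$ that pass verification, and average

$$\text{pass@}k = \mathbb{E}\left[ 1 - \binom{n-c}{k} \Big/ \binom{n}{k} \right]$$

over problems. Whether ATLAS uses exactly this estimator is not stated in the abstract.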
arXiv:2502.05485 (cs.RO, cs.AI, cs.CV)
HAMSTER: Hierarchical Action Models For Open-World Robot Manipulation
Authors: Yi Li, Yuquan Deng, Jesse Zhang, Joel Jang, Marius Memme, Raymond Yu, Caelan Reed Garrett, Fabio Ramos, Dieter Fox, Anqi Li, Abhishek Gupta, Ankit Goyal
Abstract: Large foundation models have shown strong open-world generalization to complex problems in vision and language, but similar levels of generalization have yet to be achieved in robotics. One fundamental challenge is the lack of robotic data, which is typically obtained through expensive on-robot operation. A promising remedy is to leverage cheaper, off-domain data such as action-free videos, hand-drawn sketches, or simulation data. In this work, we posit that hierarchical vision-language-action (VLA) models can be more effective at utilizing off-domain data than standard monolithic VLA models that directly finetune vision-language models (VLMs) to predict actions. In particular, we study a class of hierarchical VLA models in which the high-level VLM is finetuned to produce a coarse 2D path indicating the desired robot end-effector trajectory given an RGB image and a task description.
The intermediate 2D path prediction then serves as guidance for the low-level, 3D-aware control policy capable of precise manipulation. Doing so relieves the high-level VLM of fine-grained action prediction, while reducing the low-level policy's burden of complex task-level reasoning. We show that, with this hierarchical design, the high-level VLM can transfer across significant domain gaps between the off-domain finetuning data and real-robot testing scenarios, including differences in embodiments, dynamics, visual appearances, and task semantics. In real-robot experiments, we observe an average 20% improvement in success rate across seven different axes of generalization over OpenVLA, representing a 50% relative gain. Visual results are provided at: https://hamster-robot.github.io/
Submitted 10 February, 2025; v1 submitted 8 February, 2025; originally announced February 2025.
Comments: We require NVIDIA's approval before proceeding with the release, and we are currently processing it.

arXiv:2502.05413 [pdf, other] (cs.OS)
XPUTimer: Anomaly Diagnostics for Divergent LLM Training in GPU Clusters of Thousand-Plus Scale
Authors: Weihao Cui, Ji Zhang, Han Zhao, Chao Liu, Wenhao Zhang, Jian Sha, Quan Chen, Bingsheng He, Minyi Guo
Abstract: The rapid proliferation of large language models has driven the need for efficient GPU training clusters. However, ensuring high-performance training in these clusters is challenging due to the complexity of software-hardware interactions and the frequent occurrence of training anomalies. Since existing diagnostic tools are narrowly tailored to specific issues, there are gaps in their ability to address anomalies spanning the entire training stack. In response, we introduce XPUTimer, a real-time diagnostic framework designed for distributed LLM training at scale. XPUTimer first integrates a lightweight tracing daemon to monitor key code segments with minimal overhead. Additionally, it features a diagnostic engine that employs novel intra-kernel tracing and holistic aggregated metrics to efficiently identify and resolve anomalies. Deployment of XPUTimer across 6,000 GPUs over eight months demonstrated significant improvements across the training stack, validating its effectiveness in real-world scenarios.
Submitted 7 February, 2025; originally announced February 2025.
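The "lightweight tracing daemon" idea, timestamping key code segments with minimal overhead, can be pictured with a toy context manager; this is purely illustrative, as the abstract does not describe XPUTimer's instrumentation at this level.

```python
import time
from collections import defaultdict
from contextlib import contextmanager

SEGMENT_STATS = defaultdict(list)  # segment name -> list of latencies (s)

@contextmanager
def trace(segment: str):
    """Record the wall-clock latency of a named code segment."""
    start = time.perf_counter()
    try:
        yield
    finally:
        SEGMENT_STATS[segment].append(time.perf_counter() - start)

# Hypothetical usage inside a training loop; outliers in these
# per-segment latency distributions are the raw signal a diagnostic
# engine could aggregate across ranks:
#
#   with trace("allreduce"):
#       bucket.all_reduce()
```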
arXiv:2502.05213 [pdf, other] (cs.CR, cs.AI)
DERMARK: A Dynamic, Efficient and Robust Multi-bit Watermark for Large Language Models
Authors: Qihao Lin, Chen Tang, Lan Zhang, Junyang Zhang, Xiangyang Li
Abstract: Well-trained large language models (LLMs) present significant risks, including potential malicious use and copyright infringement. Current studies aim to trace the distribution of LLM-generated texts by implicitly embedding watermarks. Among these, the single-bit watermarking method can only determine whether a given text was generated by an LLM. In contrast, the multi-bit watermarking method embeds richer information into the generated text, which can identify which LLM generated a given text and to which user it was distributed. However, existing efforts embed the multi-bit watermark directly into the generated text without accounting for the text's watermarking capacity, which can result in embedding failures when the capacity is insufficient. In this paper, we derive the watermark embedding distribution from the logits of LLMs and propose a formal inequality for segmenting the text optimally for watermark embedding. Building on this foundation, we propose DERMARK, a dynamic, efficient, and robust multi-bit watermarking method. DERMARK divides the text into segments of varying lengths for each bit embedding, adaptively matching the text's capacity. It achieves this with negligible overhead and robust performance against text editing by minimizing watermark extraction loss. Comprehensive experiments demonstrate that, compared to the SOTA method, our method reduces the number of tokens required for embedding each bit by 20%, reduces watermark embedding time by 50%, and is robust to text editing and watermark erasure attacks.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 8 pages, 15 figures.
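Multi-bit logit watermarking in this general family biases a keyed "green" subset of the vocabulary whose selection also depends on the bit currently being embedded; a detector with the same key re-counts green tokens to read the bits back. The sketch below (the partition fraction, bias strength, and hash-based seeding are all assumptions) shows only the per-token logit adjustment, not DERMARK's segmentation inequality.

```python
import torch

def biased_logits(logits, context_ids, bit, gamma=0.5, delta=2.0, key=42):
    """Tilt next-token sampling toward a keyed, bit-dependent green list.

    logits: (V,) next-token logits; context_ids: preceding token ids.
    bit: the watermark bit (0 or 1) being embedded at this position.
    """
    V = logits.shape[0]
    # Seed a generator from the key, local context, and current bit so the
    # detector can reproduce the same green list.
    seed = hash((key, int(context_ids[-1]), bit)) % (2**31)
    g = torch.Generator().manual_seed(seed)
    green = torch.randperm(V, generator=g)[: int(gamma * V)]
    out = logits.clone()
    out[green] += delta  # boost green tokens before sampling
    return out
```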
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05206">arXiv:2502.05206</a> <span> [<a href="https://arxiv.org/pdf/2502.05206">pdf</a>, <a href="https://arxiv.org/format/2502.05206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Safety at Scale: A Comprehensive Survey of Large Model Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifeng Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixu Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruofan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Ye Sun</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hengyuan Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunhao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunhan Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hanxun Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yang Bai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jun Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianwei Zhang</a> , et al. (19 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05206v2-abstract-short" style="display: inline;"> The rapid advancement of large models, driven by their exceptional abilities in learning and generalization through large-scale pre-training, has reshaped the landscape of Artificial Intelligence (AI). 
Abstract: The rapid advancement of large models, driven by their exceptional abilities in learning and generalization through large-scale pre-training, has reshaped the landscape of Artificial Intelligence (AI). These models are now foundational to a wide range of applications, including conversational AI, recommendation systems, autonomous driving, content generation, medical diagnostics, and scientific discovery. However, their widespread deployment also exposes them to significant safety risks, raising concerns about robustness, reliability, and ethical implications. This survey provides a systematic review of current safety research on large models, covering Vision Foundation Models (VFMs), Large Language Models (LLMs), Vision-Language Pre-training (VLP) models, Vision-Language Models (VLMs), Diffusion Models (DMs), and large-model-based Agents. Our contributions are summarized as follows: (1) We present a comprehensive taxonomy of safety threats to these models, including adversarial attacks, data poisoning, backdoor attacks, jailbreak and prompt injection attacks, energy-latency attacks, data and model extraction attacks, and emerging agent-specific threats. (2) We review defense strategies proposed for each type of attack, where available, and summarize the commonly used datasets and benchmarks for safety research. (3) Building on this, we identify and discuss the open challenges in large model safety, emphasizing the need for comprehensive safety evaluations, scalable and effective defense mechanisms, and sustainable data practices. More importantly, we highlight the necessity of collective efforts from the research community and international collaboration. Our work can serve as a useful reference for researchers and practitioners, fostering the ongoing development of comprehensive defense systems and platforms to safeguard AI models.
Submitted 12 February, 2025; v1 submitted 2 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">47 pages, 3 figures, 11 tables GitHub: https://github.com/xingjunm/Awesome-Large-Model-Safety</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05163">arXiv:2502.05163</a> <span> [<a href="https://arxiv.org/pdf/2502.05163">pdf</a>, <a href="https://arxiv.org/format/2502.05163">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DuoGuard: A Two-Player RL-Driven Framework for Multilingual LLM Guardrails </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yihe Deng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junkai Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05163v1-abstract-short" style="display: inline;"> The rapid advancement of large language models (LLMs) has increased the need for guardrail models to ensure responsible use, particularly in detecting unsafe and illegal content. While substantial safety data exist in English, multilingual guardrail modeling remains underexplored due to the scarcity of open-source safety data in other languages. To address this gap, we propose a novel two-player R… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05163v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05163v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05163v1-abstract-full" style="display: none;"> The rapid advancement of large language models (LLMs) has increased the need for guardrail models to ensure responsible use, particularly in detecting unsafe and illegal content. While substantial safety data exist in English, multilingual guardrail modeling remains underexplored due to the scarcity of open-source safety data in other languages. To address this gap, we propose a novel two-player Reinforcement Learning (RL) framework, where a generator and a guardrail model co-evolve adversarially to produce high-quality synthetic data for multilingual guardrail training. We theoretically formalize this interaction as a two-player game, proving convergence to a Nash equilibrium. Empirical evaluations show that our model \ours outperforms state-of-the-art models, achieving nearly 10% improvement over LlamaGuard3 (8B) on English benchmarks while being 4.5x faster at inference with a significantly smaller model (0.5B). We achieve substantial advancements in multilingual safety tasks, particularly in addressing the imbalance for lower-resource languages in a collected real dataset. Ablation studies emphasize the critical role of synthetic data generation in bridging the imbalance in open-source data between English and other languages. 
These findings establish a scalable and efficient approach to synthetic data generation, paving the way for improved multilingual guardrail models to enhance LLM safety. Code, model, and data will be open-sourced at https://github.com/yihedeng9/DuoGuard.
Submitted 7 February, 2025; originally announced February 2025.
Comments: 24 pages, 9 figures, 5 tables.

arXiv:2502.05142 [pdf, other] (eess.IV, cs.CV)
Chest X-ray Foundation Model with Global and Local Representations Integration
Authors: Zefan Yang, Xuanang Xu, Jiajin Zhang, Ge Wang, Mannudeep K. Kalra, Pingkun Yan
Abstract: Chest X-ray (CXR) is the most frequently ordered imaging test, supporting diverse clinical tasks from thoracic disease detection to postoperative monitoring. However, task-specific classification models are limited in scope, require costly labeled data, and lack generalizability to out-of-distribution datasets. To address these challenges, we introduce CheXFound, a self-supervised vision foundation model that learns robust CXR representations and generalizes effectively across a wide range of downstream tasks. We pretrain CheXFound on a curated CXR-1M dataset, comprising over one million unique CXRs from publicly available sources. For downstream adaptation, we propose a Global and Local Representations Integration (GLoRI) module, which combines disease-specific local features with global image features for enhanced performance in multilabel classification. Our experimental results show that CheXFound outperforms state-of-the-art models in classifying 40 disease findings across different prevalence levels on the CXR-LT 24 dataset and exhibits superior label efficiency on downstream tasks with limited training data. Additionally, CheXFound achieved significant improvements on new tasks with out-of-distribution datasets, including opportunistic cardiovascular disease risk estimation and mortality prediction. These results highlight CheXFound's strong generalization capabilities, enabling diverse adaptations with improved label efficiency. The project source code is publicly available at https://github.com/RPIDIAL/CheXFound.
Submitted 7 February, 2025; originally announced February 2025.