Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 6,685 results for author: <span class="mathjax">Chen, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Chen%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chen, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chen%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chen, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09166">arXiv:2502.09166</a> <span> [<a href="https://arxiv.org/pdf/2502.09166">pdf</a>, <a href="https://arxiv.org/format/2502.09166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Integrated Sensing and Communication with Distributed Rate-Limited Helpers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiqi Chen</a>, <a href="/search/cs?searchtype=author&query=Boche%2C+H">Holger Boche</a>, <a href="/search/cs?searchtype=author&query=Oechtering%2C+T+J">Tobias J. Oechtering</a>, <a href="/search/cs?searchtype=author&query=Skoglund%2C+M">Mikael Skoglund</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09166v1-abstract-short" style="display: inline;"> This paper studies integrated sensing and communication (ISAC) systems with two rate-limited helpers who observe the channel state sequence and the feedback sequence, respectively. Depending on the timing of compressing and using the state information, our proposed coding scheme gives an inner bound of the capacity-compression-distortion tradeoff region. 
The tradeoff is realized by sending part of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09166v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09166v1-abstract-full" style="display: none;"> This paper studies integrated sensing and communication (ISAC) systems with two rate-limited helpers who observe the channel state sequence and the feedback sequence, respectively. Depending on the timing of compressing and using the state information, our proposed coding scheme gives an inner bound of the capacity-compression-distortion tradeoff region. The tradeoff is realized by sending part of the state information at the beginning of the transmission to facilitate the communication and compressing the remaining part together with the feedback signal. The inner bound becomes tight bounds in several special cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09166v1-abstract-full').style.display = 'none'; document.getElementById('2502.09166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of the results were accepted by ICC2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09130">arXiv:2502.09130</a> <span> [<a href="https://arxiv.org/pdf/2502.09130">pdf</a>, <a href="https://arxiv.org/format/2502.09130">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Finite-Time Analysis of Discrete-Time Stochastic Interpolants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Longbo Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09130v1-abstract-short" style="display: inline;"> The stochastic interpolant framework offers a powerful approach for constructing generative models based on ordinary differential equations (ODEs) or stochastic differential equations (SDEs) to transform arbitrary data distributions. However, prior analyses of this framework have primarily focused on the continuous-time setting, assuming a perfect solution of the underlying equations. 
2. arXiv:2502.09130 [pdf, other] cs.LG
   Finite-Time Analysis of Discrete-Time Stochastic Interpolants
   Authors: Yuhao Liu, Yu Chen, Rui Hu, Longbo Huang
   Abstract: The stochastic interpolant framework offers a powerful approach for constructing generative models based on ordinary differential equations (ODEs) or stochastic differential equations (SDEs) to transform arbitrary data distributions. However, prior analyses of this framework have primarily focused on the continuous-time setting, assuming a perfect solution of the underlying equations. In this work, we present the first discrete-time analysis of the stochastic interpolant framework, where we introduce an innovative discrete-time sampler and derive a finite-time upper bound on its distribution estimation error. Our result provides a novel quantification of how different factors, including the distance between source and target distributions and estimation accuracy, affect the convergence rate, and also offers a new principled way to design efficient schedules for convergence acceleration. Finally, numerical experiments are conducted on the discrete-time sampler to corroborate our theoretical findings.
   Submitted 13 February, 2025; originally announced February 2025.
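   As a point of reference for the kind of sampler being analyzed, here is a minimal sketch of a generic discrete-time (Euler) sampler for a learned probability-flow ODE. The velocity_field below is a hypothetical stand-in for a trained network; this is illustrative only, not the paper's sampler or schedule.

      import numpy as np

      def velocity_field(x, t):
          # Hypothetical stand-in for a learned drift b(x, t).
          return -x * t

      def euler_sampler(x0, n_steps=100):
          """Discretize dx/dt = b(x, t) on a uniform grid over [0, 1]."""
          x, h = x0.copy(), 1.0 / n_steps
          for k in range(n_steps):
              x = x + h * velocity_field(x, k * h)
          return x

      samples = euler_sampler(np.random.randn(1000, 2))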
3. arXiv:2502.09129 [pdf, other] cs.GT
   Differentially Private Distributed Nash Equilibrium Seeking over Time-Varying Digraphs
   Authors: Ying Chen, Qian Ma
   Abstract: This paper proposes a new differentially private distributed Nash equilibrium seeking algorithm for aggregative games under time-varying unbalanced directed communication graphs. Random independent Laplace noises are injected into the transmitted information to protect players' sensitive information. Then, the push-sum consensus protocol is utilized to estimate the aggregate function with the perturbed information under the time-varying topologies. The weakening factor and the momentum term are designed to attenuate the negative effect of the noise and guarantee the convergence of the algorithm, respectively. The algorithm is then proven to ensure almost sure convergence, as well as rigorous differential privacy with a finite cumulative privacy budget, without requiring a trade-off between provable convergence and differential privacy. Finally, a simulation is provided to demonstrate the effectiveness of the proposed algorithm.
   Submitted 13 February, 2025; originally announced February 2025.
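   The noise-injection step described above follows the standard Laplace mechanism. A minimal sketch, assuming unit sensitivity and a fixed budget per message (the paper's weakening factor and momentum term are omitted):

      import numpy as np

      rng = np.random.default_rng(0)

      def privatize(message, epsilon, sensitivity=1.0):
          """Laplace mechanism: perturb a message before transmission."""
          scale = sensitivity / epsilon
          return message + rng.laplace(0.0, scale, size=np.shape(message))

      # Each player shares a perturbed local estimate rather than the raw value.
      shared = privatize(np.array([0.8, 1.3]), epsilon=0.5)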
4. arXiv:2502.08927 [pdf, other] cs.CV
   Dynamic watermarks in images generated by diffusion models
   Authors: Yunzhuo Chen, Naveed Akhtar, Nur Al Hasan Haldar, Ajmal Mian
   Abstract: High-fidelity text-to-image diffusion models have revolutionized visual content generation, but their widespread use raises significant ethical concerns, including intellectual property protection and the misuse of synthetic media. To address these challenges, we propose a novel multi-stage watermarking framework for diffusion models, designed to establish copyright and trace generated images back to their source. Our multi-stage watermarking technique involves embedding (i) a fixed watermark localized in the diffusion model's learned noise distribution and (ii) a human-imperceptible, dynamic watermark in generated images, leveraging a fine-tuned decoder. By leveraging the Structural Similarity Index Measure (SSIM) and cosine similarity, we adapt the watermark's shape and color to the generated content while maintaining robustness. We demonstrate that our method enables reliable source verification through watermark classification, even when the dynamic watermark is adjusted for content-specific variations. To support further research, we generate a dataset of watermarked images and introduce a methodology to evaluate the statistical impact of watermarking on generated content. Additionally, we rigorously test our framework against various attack scenarios, demonstrating its robustness and minimal impact on image quality. Our work advances the field of AI-generated content security by providing a scalable solution for model ownership verification and misuse prevention.
   Submitted 12 February, 2025; originally announced February 2025.
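   To make the similarity criterion concrete, a small sketch of a cosine-similarity check between a generated image and a watermark pattern (illustrative only; the authors' adaptation rule for shape and color is more involved):

      import numpy as np

      def cosine_similarity(a, b):
          """Cosine similarity between two arrays, flattened to vectors."""
          a, b = a.ravel(), b.ravel()
          return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

      image = np.random.rand(64, 64, 3)      # stand-in generated image
      watermark = np.random.rand(64, 64, 3)  # stand-in watermark pattern
      sim = cosine_similarity(image, watermark)  # low sim => adapt the watermark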
5. arXiv:2502.08878 [pdf, other] cs.DS, cs.CR
   Scalable Private Partition Selection via Adaptive Weighting
   Authors: Justin Y. Chen, Vincent Cohen-Addad, Alessandro Epasto, Morteza Zadimoghaddam
   Abstract: In the differentially private partition selection problem (a.k.a. private set union, private key discovery), users hold subsets of items from an unbounded universe. The goal is to output as many items as possible from the union of the users' sets while maintaining user-level differential privacy. Solutions to this problem are a core building block for many privacy-preserving ML applications, including vocabulary extraction in a private corpus, computing statistics over categorical data, and learning embeddings over user-provided items. We propose an algorithm for this problem, MaximumAdaptiveDegree (MAD), which adaptively reroutes weight from items with weight far above the threshold needed for privacy to items with smaller weight, thereby increasing the probability that less frequent items are output. Our algorithm can be efficiently implemented in massively parallel computation systems, allowing scalability to very large datasets. We prove that our algorithm stochastically dominates the standard parallel algorithm for this problem. We also develop a two-round version of our algorithm where results of the computation in the first round are used to bias the weighting in the second round to maximize the number of items output. In experiments, our algorithms provide the best results across the board among parallel algorithms and scale to datasets with hundreds of billions of items, up to three orders of magnitude larger than those analyzed by prior sequential algorithms.
   Submitted 12 February, 2025; originally announced February 2025.
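   For context, a sketch of the standard non-adaptive baseline that MAD improves on: each user spreads bounded weight across its items, Laplace noise is added, and only items whose noisy weight clears a threshold are released. Parameter choices here are illustrative:

      import numpy as np
      from collections import defaultdict

      rng = np.random.default_rng(0)

      def basic_partition_selection(user_sets, epsilon, threshold):
          """Non-adaptive DP partition selection via noisy weights."""
          weights = defaultdict(float)
          for items in user_sets:
              for item in items:
                  weights[item] += 1.0 / len(items)  # each user adds total weight 1
          return {item for item, w in weights.items()
                  if w + rng.laplace(0.0, 1.0 / epsilon) > threshold}

      out = basic_partition_selection(
          [{"the", "cat"}, {"the", "dog"}, {"the"}], epsilon=1.0, threshold=1.5)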
6. arXiv:2502.08695 [pdf, other] stat.ML, cs.LG
   A Bayesian Nonparametric Perspective on Mahalanobis Distance for Out of Distribution Detection
   Authors: Randolph W. Linderman, Yiran Chen, Scott W. Linderman
   Abstract: Bayesian nonparametric methods are naturally suited to the problem of out-of-distribution (OOD) detection. However, these techniques have largely been eschewed in favor of simpler methods based on distances between pre-trained or learned embeddings of data points. Here we show a formal relationship between Bayesian nonparametric models and the relative Mahalanobis distance score (RMDS), a commonly used method for OOD detection. Building on this connection, we propose Bayesian nonparametric mixture models with hierarchical priors that generalize the RMDS. We evaluate these models on the OpenOOD detection benchmark and show that Bayesian nonparametric methods can improve upon existing OOD methods, especially in regimes where training classes differ in their covariance structure and where there are relatively few data points per class.
   Submitted 12 February, 2025; originally announced February 2025.
   Comments: 32 pages, 5 figures, code is available at https://github.com/rwl93/bnp4ood
7. arXiv:2502.08645 [pdf, other] cs.RO
   Re$^3$Sim: Generating High-Fidelity Simulation Data via 3D-Photorealistic Real-to-Sim for Robotic Manipulation
   Authors: Xiaoshen Han, Minghuan Liu, Yilun Chen, Junqiu Yu, Xiaoyang Lyu, Yang Tian, Bolun Wang, Weinan Zhang, Jiangmiao Pang
   Abstract: Real-world data collection for robotics is costly and resource-intensive, requiring skilled operators and expensive hardware. Simulations offer a scalable alternative but often fail to achieve sim-to-real generalization due to geometric and visual gaps. To address these challenges, we propose RE$^3$SIM, a 3D-photorealistic real-to-sim system that addresses geometric and visual sim-to-real gaps. RE$^3$SIM employs advanced 3D reconstruction and neural rendering techniques to faithfully recreate real-world scenarios, enabling real-time rendering of simulated cross-view cameras within a physics-based simulator. By utilizing privileged information to collect expert demonstrations efficiently in simulation, and training robot policies with imitation learning, we validate the effectiveness of the real-to-sim-to-real pipeline across various manipulation task scenarios. Notably, with only simulated data, we can achieve zero-shot sim-to-real transfer with an average success rate exceeding 58%. To push the limit of real-to-sim, we further generate a large-scale simulation dataset, demonstrating how a robust policy can be built from simulation data that generalizes across various objects. Codes and demos are available at: http://xshenhan.github.io/Re3Sim/.
   Submitted 13 February, 2025; v1 submitted 12 February, 2025; originally announced February 2025.
8. arXiv:2502.08505 [pdf, other] cs.LG
   Bridging Domain Adaptation and Graph Neural Networks: A Tensor-Based Framework for Effective Label Propagation
   Authors: Tao Wen, Elynn Chen, Yuzhou Chen, Qi Lei
   Abstract: Graph Neural Networks (GNNs) have recently become the predominant tools for studying graph data. Despite state-of-the-art performance on graph classification tasks, GNNs are overwhelmingly trained in a single domain under supervision, thus necessitating a prohibitively high demand for labels and resulting in poorly transferable representations. To address this challenge, we propose the Label-Propagation Tensor Graph Neural Network (LP-TGNN) framework to bridge the gap between graph data and traditional domain adaptation methods. It extracts graph topological information holistically with a tensor architecture and then reduces domain discrepancy through label propagation. It is readily compatible with general GNNs and domain adaptation techniques with minimal adjustment through pseudo-labeling. Experiments on various real-world benchmarks show that our LP-TGNN outperforms baselines by a notable margin. We also validate and analyze each component of the proposed framework in the ablation study.
   Submitted 12 February, 2025; originally announced February 2025.
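   As background, the classic label-propagation update that such frameworks build on iterates labels through a normalized adjacency matrix. A standard textbook sketch (not necessarily the exact variant used by LP-TGNN):

      import numpy as np

      def label_propagation(S, Y0, alpha=0.9, iters=50):
          """Iterate Y <- alpha * S @ Y + (1 - alpha) * Y0.
          S: symmetrically normalized adjacency; Y0: one-hot seed labels."""
          Y = Y0.copy()
          for _ in range(iters):
              Y = alpha * (S @ Y) + (1 - alpha) * Y0
          return Y.argmax(axis=1)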
9. arXiv:2502.08340 [pdf, other] cs.LG, cs.AI
   Hierarchical Learning-based Graph Partition for Large-scale Vehicle Routing Problems
   Authors: Yuxin Pan, Ruohong Liu, Yize Chen, Zhiguang Cao, Fangzhen Lin
   Abstract: Neural solvers based on the divide-and-conquer approach for Vehicle Routing Problems (VRPs) in general, and the capacitated VRP (CVRP) in particular, integrate the global partition of an instance with local constructions for each subproblem to enhance generalization. However, during the global partition phase, misclusterings within subgraphs have a tendency to progressively compound throughout the multi-step decoding process of the learning-based partition policy. This suboptimal behavior in the global partition phase, in turn, may lead to a dramatic deterioration in the performance of the overall decomposition-based system, despite using optimal local constructions. To address these challenges, we propose a versatile Hierarchical Learning-based Graph Partition (HLGP) framework, which is tailored to benefit the partition of CVRP instances by synergistically integrating global and local partition policies. Specifically, the global partition policy is tasked with creating the coarse multi-way partition to generate the sequence of simpler two-way partition subtasks. These subtasks mark the initiation of the subsequent K local partition levels. At each local partition level, subtasks exclusive to this level are assigned to the local partition policy, which benefits from the insensitive local topological features to incrementally alleviate the compounded errors. This framework is versatile in the sense that it optimizes the involved partition policies towards a unified objective harmoniously compatible with both reinforcement learning (RL) and supervised learning (SL). (*Due to the notification of arXiv "The Abstract field cannot be longer than 1,920 characters", the appeared Abstract is shortened. For the full Abstract, please download the Article.)
   Submitted 12 February, 2025; originally announced February 2025.
   Comments: Accepted as a Full Paper at AAMAS 2025 (24th International Conference on Autonomous Agents and Multiagent Systems)
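   To picture the coarse-to-fine decomposition, a toy sketch in which a group is refined through a sequence of two-way splits; the bisect function here is a trivial stand-in for the learned partition policies:

      def bisect(customers):
          """Stand-in two-way partition policy: split a group in half.
          A learned policy would use coordinates, demands, etc."""
          mid = len(customers) // 2
          return customers[:mid], customers[mid:]

      def hierarchical_partition(customers, levels):
          """Refine one group into 2**levels subgroups via two-way subtasks."""
          groups = [customers]
          for _ in range(levels):
              groups = [half for g in groups for half in bisect(g)]
          return groups

      print(hierarchical_partition(list(range(16)), levels=3))  # 8 subgroups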
10. arXiv:2502.08216 [pdf, other] cs.CV
    Deepfake Detection with Spatio-Temporal Consistency and Attention
    Authors: Yunzhuo Chen, Naveed Akhtar, Nur Al Hasan Haldar, Ajmal Mian
    Abstract: Deepfake videos are causing growing concerns among communities due to their ever-increasing realism. Naturally, automated detection of forged Deepfake videos is attracting a proportional amount of interest from researchers. Current methods for detecting forged videos mainly rely on global frame features and under-utilize the spatio-temporal inconsistencies found in manipulated videos. Moreover, they fail to attend to manipulation-specific subtle and well-localized pattern variations along both spatial and temporal dimensions. Addressing these gaps, we propose a neural Deepfake detector that focuses on the localized manipulative signatures of the forged videos at the individual frame level as well as the frame sequence level. Using a ResNet backbone, it strengthens the shallow frame-level feature learning with a spatial attention mechanism. The spatial stream of the model is further helped by fusing texture-enhanced shallow features with the deeper features. Simultaneously, the model processes frame sequences with a distance attention mechanism that further allows fusion of temporal attention maps with the learned features at the deeper layers. The overall model is trained to detect forged content as a classifier. We evaluate our method on two popular large datasets and achieve significant performance gains over the state-of-the-art methods. Moreover, our technique also provides memory and computational advantages over the competitive techniques.
    Submitted 12 February, 2025; originally announced February 2025.
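    A compact sketch of a generic spatial attention gate of the kind described for the shallow features (channel-pooled statistics feeding a convolution, CBAM-style; an assumption for illustration, not the authors' exact module):

       import torch
       import torch.nn as nn

       class SpatialAttention(nn.Module):
           """Reweight each spatial location using channel-pooled statistics."""
           def __init__(self, kernel_size=7):
               super().__init__()
               self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)

           def forward(self, x):                  # x: (B, C, H, W)
               avg = x.mean(dim=1, keepdim=True)  # average over channels
               mx, _ = x.max(dim=1, keepdim=True) # max over channels
               attn = torch.sigmoid(self.conv(torch.cat([avg, mx], dim=1)))
               return x * attn                    # gated features

       gated = SpatialAttention()(torch.randn(2, 64, 56, 56))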
11. arXiv:2502.08161 [pdf, other] cs.IR, cs.AI; doi:10.1109/ICDM54844.2022.00070
    MixDec Sampling: A Soft Link-based Sampling Method of Graph Neural Network for Recommendation
    Authors: Xiangjin Xie, Yuxin Chen, Ruipeng Wang, Kai Ouyang, Zihan Zhang, Hai-Tao Zheng, Buyue Qian, Hansen Zheng, Bo Hu, Chengxiang Zhuo, Zang Li
    Abstract: Graph neural networks have been widely used in recent recommender systems, where negative sampling plays an important role. Existing negative sampling methods restrict the relationship between nodes to either hard positive pairs or hard negative pairs. This loses structural information and lacks a mechanism to generate positive pairs for nodes with few neighbors. To overcome these limitations, we propose a novel soft link-based sampling method, namely MixDec Sampling, which consists of a Mixup Sampling module and a Decay Sampling module. The Mixup Sampling augments node features by synthesizing new nodes and soft links, which provides a sufficient number of samples for nodes with few neighbors. The Decay Sampling strengthens the digestion of graph structure information by generating soft links for node embedding learning. To the best of our knowledge, we are the first to model sampling relationships between nodes by soft links in GNN-based recommender systems. Extensive experiments demonstrate that the proposed MixDec Sampling can significantly and consistently improve the recommendation performance of several representative GNN-based models on various recommendation benchmarks.
    Submitted 12 February, 2025; originally announced February 2025.
    Comments: 10 pages, 6 figures
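    The Mixup Sampling step can be pictured as interpolating two node feature vectors into a synthetic node whose mixing coefficient doubles as a soft link weight. A generic mixup sketch with a Beta-sampled coefficient (an assumption borrowed from standard mixup, not the authors' exact formulation):

       import numpy as np

       rng = np.random.default_rng(0)

       def mixup_node(x_i, x_j, alpha=0.2):
           """Blend two node feature vectors into a synthetic node."""
           lam = rng.beta(alpha, alpha)
           return lam * x_i + (1.0 - lam) * x_j, lam  # lam acts as soft link weight

       x_new, lam = mixup_node(np.ones(8), np.zeros(8))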
12. arXiv:2502.07937 [pdf, other] cs.LG, stat.ML
    Active Advantage-Aligned Online Reinforcement Learning with Offline Data
    Authors: Xuefeng Liu, Hung T. C. Le, Siyu Chen, Rick Stevens, Zhuoran Yang, Matthew R. Walter, Yuxin Chen
    Abstract: Online reinforcement learning (RL) enhances policies through direct interactions with the environment, but faces challenges related to sample efficiency. In contrast, offline RL leverages extensive pre-collected data to learn policies, but often produces suboptimal results due to limited data coverage. Recent efforts have sought to integrate offline and online RL in order to harness the advantages of both approaches. However, effectively combining online and offline RL remains challenging due to issues that include catastrophic forgetting, lack of robustness, and poor sample efficiency. In an effort to address these challenges, we introduce A3RL, a novel method that actively selects data from combined online and offline sources to optimize policy improvement. We provide a theoretical guarantee that validates the effectiveness of our active sampling strategy and conduct thorough empirical experiments showing that our method outperforms existing state-of-the-art online RL techniques that utilize offline data. Our code will be publicly available at: https://github.com/xuefeng-cs/A3RL.
    Submitted 11 February, 2025; originally announced February 2025.
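    One way to picture advantage-aligned selection: draw replay transitions with probability increasing in an estimated advantage. A toy sketch with a hypothetical advantage_fn; the actual A3RL criterion is specified in the paper:

       import numpy as np

       rng = np.random.default_rng(0)

       def sample_batch(transitions, advantage_fn, batch_size=32, temp=1.0):
           """Sample transitions with probability softmax(advantage / temp)."""
           adv = np.array([advantage_fn(t) for t in transitions])
           p = np.exp((adv - adv.max()) / temp)
           p /= p.sum()
           idx = rng.choice(len(transitions), size=batch_size, replace=True, p=p)
           return [transitions[i] for i in idx]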
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07844">arXiv:2502.07844</a> <span> [<a href="https://arxiv.org/pdf/2502.07844">pdf</a>, <a href="https://arxiv.org/format/2502.07844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The establishment of static digital humans and the integration with spinal models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ju%2C+F">Fujiao Ju</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengyin Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yinbo Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianfeng Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Mingjie Dong</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+B">Bin Fang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Q">Qianyu Zhuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07844v1-abstract-short" style="display: inline;"> Adolescent idiopathic scoliosis (AIS), a prevalent spinal deformity, significantly affects individuals' health and quality of life. Conventional imaging techniques, such as X - rays, computed tomography (CT), and magnetic resonance imaging (MRI), offer static views of the spine. However, they are restricted in capturing the dynamic changes of the spine and its interactions with overall body motion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07844v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07844v1-abstract-full" style="display: none;"> Adolescent idiopathic scoliosis (AIS), a prevalent spinal deformity, significantly affects individuals' health and quality of life. Conventional imaging techniques, such as X - rays, computed tomography (CT), and magnetic resonance imaging (MRI), offer static views of the spine. However, they are restricted in capturing the dynamic changes of the spine and its interactions with overall body motion. Therefore, developing new techniques to address these limitations has become extremely important. Dynamic digital human modeling represents a major breakthrough in digital medicine. It enables a three - dimensional (3D) view of the spine as it changes during daily activities, assisting clinicians in detecting deformities that might be missed in static imaging. Although dynamic modeling holds great potential, constructing an accurate static digital human model is a crucial initial step for high - precision simulations. In this study, our focus is on constructing an accurate static digital human model integrating the spine, which is vital for subsequent dynamic digital human research on AIS. 
First, we generate human point-cloud data by combining the 3D Gaussian method with the Skinned Multi-Person Linear (SMPL) model from the patient's multi-view images. Then, we fit a standard skeletal model to the generated human model. Next, we align the real spine model reconstructed from CT images with the standard skeletal model. We validated the resulting personalized spine model using X-ray data from six AIS patients, with Cobb angles (used to measure the severity of scoliosis) as evaluation metrics. The results indicate that the model's error was within 1 degree of the actual measurements. This study presents an important method for constructing digital humans.
Submitted 11 February, 2025; originally announced February 2025.
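
The entry above evaluates its personalized spine models with Cobb angles. For orientation, here is a minimal sketch of how a Cobb angle can be computed as the angle between the direction vectors of the two most tilted vertebral endplates; the landmark vectors are made up for illustration and are not from the paper.

```python
import numpy as np

def cobb_angle(endplate_a, endplate_b):
    """Angle (degrees) between two vertebral endplate direction vectors.

    endplate_a/b: 2D direction vectors of the most tilted upper and lower
    endplates, e.g. taken from landmark pairs on a coronal X-ray.
    """
    a = np.asarray(endplate_a, dtype=float)
    b = np.asarray(endplate_b, dtype=float)
    cosang = abs(a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return np.degrees(np.arccos(np.clip(cosang, -1.0, 1.0)))

# Hypothetical landmarks: endplates tilted +10 and -15 degrees from horizontal,
# which should give a 25-degree Cobb angle.
up = np.array([np.cos(np.radians(10)), np.sin(np.radians(10))])
lo = np.array([np.cos(np.radians(-15)), np.sin(np.radians(-15))])
print(round(cobb_angle(up, lo), 1))  # -> 25.0
```
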

arXiv:2502.07837 (https://arxiv.org/abs/2502.07837) [pdf, other]
Subjects: cs.RO (Robotics); cs.LG (Machine Learning)
Title: RoboBERT: An End-to-end Multimodal Robotic Manipulation Model
Authors: Sicheng Wang, Jianhua Shan, Jianwei Zhang, Haozhang Gao, Hailiang Han, Yipeng Chen, Kang Wei, Chengkun Zhang, Kairos Wong, Jie Zhao, Lei Zhao, Bin Fang
Abstract: Embodied intelligence integrates multiple modalities, enabling agents to understand images, language, and actions simultaneously. However, existing models typically depend on additional datasets or extensive pre-training to maximize performance, consuming abundant training time and incurring high hardware costs. To tackle this issue, we present RoboBERT, a novel end-to-end robotic manipulation model integrated with a unique training strategy. The model utilizes a CNN-based diffusion policy, whose effectiveness is enhanced and stabilized by separating the training processes for different modalities. It also underscores the importance of data augmentation, verifying various techniques that significantly boost performance. Unlike models that depend on extra data or large foundation models, RoboBERT achieves a highly competitive success rate while using only language-labeled expert demonstrations and maintaining a relatively small model size. Specifically, RoboBERT achieves an average length of 4.52 on the CALVIN benchmark for the \(ABCD \rightarrow D\) task, setting a new state-of-the-art (SOTA) record. Furthermore, when tested on a real robot, the model demonstrates superior performance, achieving a higher success rate than other methods trained with the same data. We propose that these concepts and methodologies of RoboBERT demonstrate extensive versatility and compatibility, contributing significantly to the development of lightweight multimodal robotic models. The code can be accessed at https://github.com/PeterWangsicheng/RoboBERT
Submitted 10 February, 2025; originally announced February 2025.
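
RoboBERT's abstract credits part of its stability to separating the training processes for different modalities. The sketch below shows that general idea under an assumed two-stage schedule in which the language pathway is frozen first and unfrozen later; the tiny modules and the schedule are placeholders, not the paper's actual recipe.

```python
import torch
import torch.nn as nn

# Toy stand-ins; RoboBERT's real encoders and diffusion policy differ.
lang_enc = nn.Linear(16, 32)
vis_enc  = nn.Linear(64, 32)
policy   = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 7))

def run_stage(train_lang: bool, steps: int):
    # Freeze or unfreeze the language pathway for this stage.
    for p in lang_enc.parameters():
        p.requires_grad = train_lang
    params = [p for m in (lang_enc, vis_enc, policy)
              for p in m.parameters() if p.requires_grad]
    opt = torch.optim.Adam(params, lr=1e-3)
    for _ in range(steps):
        lang, img = torch.randn(8, 16), torch.randn(8, 64)
        action_target = torch.randn(8, 7)          # dummy supervision
        fused = torch.cat([lang_enc(lang), vis_enc(img)], dim=-1)
        loss = nn.functional.mse_loss(policy(fused), action_target)
        opt.zero_grad(); loss.backward(); opt.step()

run_stage(train_lang=False, steps=10)  # stage 1: vision/action pathway only
run_stage(train_lang=True,  steps=10)  # stage 2: train modalities jointly
```
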

arXiv:2502.07832 (https://arxiv.org/abs/2502.07832) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: SHARP: Accelerating Language Model Inference by SHaring Adjacent layers with Recovery Parameters
Authors: Yiping Wang, Hanxian Huang, Yifang Chen, Jishen Zhao, Simon Shaolei Du, Yuandong Tian
Abstract: While large language models (LLMs) have advanced natural language processing tasks, their growing computational and memory demands make deployment on resource-constrained devices like mobile phones increasingly challenging. In this paper, we propose SHARP (SHaring Adjacent Layers with Recovery Parameters), a novel approach to accelerate LLM inference by sharing parameters across adjacent layers, thus reducing memory load overhead, while introducing low-rank recovery parameters to maintain performance. Inspired by the observation that consecutive layers have similar outputs, SHARP employs a two-stage recovery process: Single Layer Warmup (SLW) and Supervised Fine-Tuning (SFT). The SLW stage aligns the outputs of the shared layers using an \(L_2\) loss, providing a good initialization for the following SFT stage to further restore the model performance. Extensive experiments demonstrate that SHARP can recover the model's perplexity on various in-distribution tasks using no more than 50k fine-tuning examples while reducing the number of stored MLP parameters by 38% to 65%. We also conduct several ablation studies of SHARP and show that replacing layers towards the later parts of the model yields better performance retention, and that different recovery parameterizations perform similarly when parameter counts are matched.
Furthermore, SHARP saves 42.8% in model storage and reduces the total inference time by 42.2% compared to the original Llama2-7b model on mobile devices. Our results highlight SHARP as an efficient solution for reducing inference costs when deploying LLMs without pretraining-scale resources.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 24 pages
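
A minimal sketch of the mechanism the SHARP abstract describes: a replaced layer reuses its neighbor's weights plus low-rank recovery parameters, and Single Layer Warmup aligns its output with the original layer's output under an \(L_2\) loss. Module names and sizes are hypothetical, and the paper applies this to transformer MLP blocks rather than toy linear layers.

```python
import torch
import torch.nn as nn

class SharedWithRecovery(nn.Module):
    """Stand-in for layer i+1: reuse layer i's weights plus a low-rank update."""
    def __init__(self, shared: nn.Linear, rank: int = 8):
        super().__init__()
        self.shared = shared              # weights owned by layer i, not stored twice
        d_out, d_in = shared.weight.shape
        self.A = nn.Parameter(torch.randn(rank, d_in) * 0.01)  # recovery params
        self.B = nn.Parameter(torch.zeros(d_out, rank))

    def forward(self, x):
        return self.shared(x) + x @ self.A.T @ self.B.T

d = 64
layer_i = nn.Linear(d, d)
original_next = nn.Linear(d, d)                 # the layer being replaced
replacement = SharedWithRecovery(layer_i, rank=8)

# Single Layer Warmup (SLW): align the replacement's output with the original
# layer's output under an L2 loss, training only the recovery parameters.
opt = torch.optim.Adam([replacement.A, replacement.B], lr=1e-3)
for _ in range(200):
    x = torch.randn(32, d)
    loss = ((replacement(x) - original_next(x).detach()) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
```
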

arXiv:2502.07739 (https://arxiv.org/abs/2502.07739) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: HRP: High-Rank Preheating for Superior LoRA Initialization
Authors: Yuzhu Chen, Yingjie Wang, Shi Fu, Li Shen, Yongcheng Jing, Xinmei Tian, Dacheng Tao
Abstract: This paper studies the crucial impact of initialization on the convergence properties of Low-Rank Adaptation (LoRA). We theoretically demonstrate that random initialization, a widely used scheme, will likely lead LoRA to random low-rank results rather than the best low-rank result. While this issue can be mitigated by adjusting the initialization towards a well-informed direction, doing so relies on prior knowledge of the target, which is typically unknown in real-world scenarios. To approximate this well-informed initial direction, we propose High-Rank Preheating (HRP), which fine-tunes a high-rank LoRA for a few steps and uses the singular value decomposition of the preheated result as a superior initialization. HRP initialization is theoretically supported to combine the convergence strengths of high-rank LoRA with the generalization strengths of low-rank LoRA. Extensive experiments demonstrate that HRP significantly enhances LoRA's effectiveness across various models and tasks, achieving performance comparable to full-parameter fine-tuning and outperforming other initialization strategies.
Submitted 11 February, 2025; originally announced February 2025.
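
HRP's core step, as described above, is to briefly train a high-rank LoRA and then initialize the final low-rank factors from the SVD of the preheated update. A sketch of that step, using random stand-ins for the preheated factors (in practice they come from a few fine-tuning steps):

```python
import numpy as np

rng = np.random.default_rng(0)
d, high_rank, low_rank = 256, 64, 8

# Pretend these are the factors of a high-rank LoRA after a few "preheating"
# fine-tuning steps; here they are random placeholders.
B_high = rng.normal(size=(d, high_rank)) / np.sqrt(high_rank)
A_high = rng.normal(size=(high_rank, d)) / np.sqrt(d)
delta_W = B_high @ A_high                      # preheated weight update

# SVD of the preheated update; keep the top-r singular directions as the
# initialization of the final low-rank LoRA factors.
U, S, Vt = np.linalg.svd(delta_W, full_matrices=False)
B_init = U[:, :low_rank] * np.sqrt(S[:low_rank])            # shape (d, r)
A_init = np.sqrt(S[:low_rank])[:, None] * Vt[:low_rank]     # shape (r, d)

# By Eckart-Young, the rank-r SVD truncation is the best rank-r approximation
# of delta_W in Frobenius norm, so low-rank training starts near the preheat.
err = np.linalg.norm(delta_W - B_init @ A_init) / np.linalg.norm(delta_W)
print(f"relative approximation error: {err:.3f}")
```
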

arXiv:2502.07590 (https://arxiv.org/abs/2502.07590) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.CV (Computer Vision and Pattern Recognition)
Title: DSV: Exploiting Dynamic Sparsity to Accelerate Large-Scale Video DiT Training
Authors: Xin Tan, Yuetao Chen, Yimin Jiang, Xing Chen, Kun Yan, Nan Duan, Yibo Zhu, Daxin Jiang, Hong Xu
Abstract: Diffusion Transformers (DiTs) have shown remarkable performance in modeling and generating high-quality videos. However, the quadratic computational complexity of the 3D full attention mechanism presents significant challenges in scaling video DiT training, especially for high-definition and lengthy videos, where attention can dominate up to 95% of the end-to-end time and necessitates specialized communication paradigms to handle large input sizes. This paper introduces DSV, a novel framework designed to accelerate and scale the training of video DiTs by leveraging the inherent dynamic attention sparsity throughout the training process. DSV employs a two-stage training algorithm that exploits sparsity patterns, focusing on critical elements supported by efficient, tailored kernels. To accommodate the new sparsity dimension, we develop a hybrid sparsity-aware context parallelism that effectively scales to large inputs by addressing the heterogeneity of sparsity across attention heads and blocks, resulting in optimized sparse computation and communication. Extensive evaluations demonstrate that DSV achieves up to a 3.02x gain in training throughput with nearly no quality degradation.
Submitted 11 February, 2025; originally announced February 2025.
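
The "critical elements" idea above can be illustrated with a dense emulation of dynamic attention sparsity: keep only the largest logits per query and mask the rest. This is a generic top-k sketch, not DSV's kernels or its sparsity-aware parallelism, which skip the masked work entirely instead of computing it.

```python
import torch

def topk_sparse_attention(q, k, v, keep: int):
    """Attention that keeps only the `keep` largest logits per query."""
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5   # (B, Tq, Tk)
    thresh = scores.topk(keep, dim=-1).values[..., -1:]     # per-query cutoff
    scores = scores.masked_fill(scores < thresh, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

B, T, D = 2, 128, 32
q, k, v = (torch.randn(B, T, D) for _ in range(3))
out = topk_sparse_attention(q, k, v, keep=16)  # each query attends to 16 keys
print(out.shape)  # torch.Size([2, 128, 32])
```
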

arXiv:2502.07527 (https://arxiv.org/abs/2502.07527) [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: NatureLM: Deciphering the Language of Nature for Scientific Discovery
Authors: Yingce Xia, Peiran Jin, Shufang Xie, Liang He, Chuan Cao, Renqian Luo, Guoqing Liu, Yue Wang, Zequn Liu, Yuan-Jyue Chen, Zekun Guo, Yeqi Bai, Pan Deng, Yaosen Min, Ziheng Lu, Hongxia Hao, Han Yang, Jielan Li, Chang Liu, Jia Zhang, Jianwei Zhu, Kehan Wu, Wei Zhang, Kaiyuan Gao, Qizhi Pei, et al. (20 additional authors not shown)
Abstract: Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA.
However, these models are typically trained in isolation, lacking the ability to integrate across different scientific domains. Recognizing that entities within these domains can all be represented as sequences, which together form the "language of nature", we introduce the Nature Language Model (briefly, NatureLM), a sequence-based science foundation model designed for scientific discovery. Pre-trained with data from multiple scientific domains, NatureLM offers a unified, versatile model that enables various applications, including: (i) generating and optimizing small molecules, proteins, RNA, and materials using text instructions; (ii) cross-domain generation/design, such as protein-to-molecule and protein-to-RNA generation; and (iii) achieving state-of-the-art performance in tasks like SMILES-to-IUPAC translation and retrosynthesis on USPTO-50k. NatureLM offers a promising generalist approach for various scientific tasks, including drug discovery (hit generation/optimization, ADMET optimization, synthesis), novel material design, and the development of therapeutic proteins or nucleotides. We have developed NatureLM models in different sizes (1 billion, 8 billion, and 46.7 billion parameters) and observed a clear improvement in performance as the model size increases.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 81 pages

arXiv:2502.07358 (https://arxiv.org/abs/2502.07358) [pdf, other]
Subjects: cs.RO (Robotics)
Title: SymbioSim: Human-in-the-loop Simulation Platform for Bidirectional Continuing Learning in Human-Robot Interaction
Authors: Haoran Chen, Yiteng Xu, Yiming Ren, Yaoqin Ye, Xinran Li, Ning Ding, Peishan Cong, Ziyi Wang, Bushi Liu, Yuhan Chen, Zhiyang Dou, Xiaokun Leng, Manyi Li, Yuexin Ma, Changhe Tu
Abstract: The development of intelligent robots seeks to seamlessly integrate them into the human world, providing assistance and companionship in daily life and work, with the ultimate goal of achieving human-robot symbiosis. To realize this vision, robots must continuously learn and evolve through consistent interaction and collaboration with humans, while humans need to gradually develop an understanding of and trust in robots through shared experiences. However, training and testing algorithms directly on physical robots involve substantial costs and safety risks.
Moreover, current robotic simulators fail to support real human participation, limiting their ability to provide authentic interaction experiences and gather valuable human feedback. In this paper, we introduce SymbioSim, a novel human-in-the-loop robotic simulation platform designed to enable the safe and efficient development, evaluation, and optimization of human-robot interactions. By leveraging a carefully designed system architecture and modules, SymbioSim delivers a natural and realistic interaction experience, facilitating bidirectional continuous learning and adaptation for both humans and robots. Extensive experiments and user studies demonstrate the platform's promising performance and highlight its potential to significantly advance research on human-robot symbiosis.
Submitted 11 February, 2025; originally announced February 2025.

arXiv:2502.07309 (https://arxiv.org/abs/2502.07309) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Semi-Supervised Vision-Centric 3D Occupancy World Model for Autonomous Driving
Authors: Xiang Li, Pengfei Li, Yupeng Zheng, Wei Sun, Yan Wang, Yilun Chen
Abstract: Understanding world dynamics is crucial for planning in autonomous driving. Recent methods attempt to achieve this by learning a 3D occupancy world model that forecasts future surrounding scenes based on current observation.
However, 3D occupancy labels are still required to produce promising results. Considering the high annotation cost for 3D outdoor scenes, we propose a semi-supervised vision-centric 3D occupancy world model, PreWorld, to leverage the potential of 2D labels through a novel two-stage training paradigm: a self-supervised pre-training stage and a fully-supervised fine-tuning stage. Specifically, during the pre-training stage, we utilize an attribute projection head to generate different attribute fields of a scene (e.g., RGB, density, semantics), enabling temporal supervision from 2D labels via volume rendering techniques. Furthermore, we introduce a simple yet effective state-conditioned forecasting module to recursively forecast future occupancy and ego trajectory in a direct manner. Extensive experiments on the nuScenes dataset validate the effectiveness and scalability of our method, and demonstrate that PreWorld achieves competitive performance across 3D occupancy prediction, 4D occupancy forecasting and motion planning tasks.
Submitted 11 February, 2025; originally announced February 2025.
Comments: Accepted by ICLR 2025
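
PreWorld supervises 3D attribute fields with 2D labels "via volume rendering techniques". For readers unfamiliar with that bridge, here is a minimal sketch of standard emission-absorption volume rendering along one ray; it is the generic NeRF-style formulation, not the paper's specific projection head.

```python
import numpy as np

def render_ray(densities, colors, deltas):
    """Standard emission-absorption volume rendering along one ray.

    densities: (N,) non-negative sigma at each sample
    colors:    (N, 3) per-sample RGB (or any per-sample attribute)
    deltas:    (N,) distances between consecutive samples
    """
    alpha = 1.0 - np.exp(-densities * deltas)                      # per-sample opacity
    trans = np.cumprod(np.concatenate([[1.0], 1.0 - alpha]))[:-1]  # transmittance T_i
    weights = trans * alpha                                        # contribution per sample
    rgb = (weights[:, None] * colors).sum(axis=0)                  # rendered 2D pixel
    return rgb, weights

# A pixel rendered this way can be compared against a 2D RGB/semantic label,
# back-propagating supervision into the 3D field.
N = 16
rgb, w = render_ray(np.full(N, 0.5), np.random.rand(N, 3), np.full(N, 0.1))
print(rgb, w.sum())
```
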

arXiv:2502.07292 (https://arxiv.org/abs/2502.07292) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction)
Title: Investigating Creativity in Humans and Generative AI Through Circles Exercises
Authors: Runlin Duan, Shao-Kang Hsia, Yuzhao Chen, Yichen Hu, Ming Yin, Karthik Ramani
Abstract: Generative AI (GenAI) is transforming the creative process. However, as presented in this paper, GenAI encounters "narrow creativity" barriers: we observe that both humans and GenAI focus on limited subsets of the design space. We investigate this phenomenon using the "Circles Exercise," a creativity test widely used to examine human creativity. Quantitative analysis reveals that humans tend to generate familiar, high-frequency ideas, while GenAI produces a larger volume of incremental innovations at a low cost. However, similar to humans, it struggles to significantly expand creative boundaries. Moreover, advanced prompting strategies, such as Chain-of-Thought (CoT) prompting, mitigate narrow-creativity issues but still fall short of substantially broadening the creative scope of humans and GenAI. These findings underscore both the challenges and opportunities for advancing GenAI-powered human creativity support tools.
Submitted 11 February, 2025; originally announced February 2025.

arXiv:2502.07288 (https://arxiv.org/abs/2502.07288) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: KPIs 2024 Challenge: Advancing Glomerular Segmentation from Patch- to Slide-Level
Authors: Ruining Deng, Tianyuan Yao, Yucheng Tang, Junlin Guo, Siqi Lu, Juming Xiong, Lining Yu, Quan Huu Cap, Pengzhou Cai, Libin Lan, Ze Zhao, Adrian Galdran, Amit Kumar, Gunjan Deotale, Dev Kumar Das, Inyoung Paik, Joonho Lee, Geongyu Lee, Yujia Chen, Wangkai Li, Zhaoyang Li, Xuege Hou, Zeyuan Wu, Shengjin Wang, Maximilian Fischer, et al. (22 additional authors not shown)
Abstract: Chronic kidney disease (CKD) is a major global health issue, affecting over 10% of the population and causing significant mortality. While kidney biopsy remains the gold standard for CKD diagnosis and treatment, the lack of comprehensive benchmarks for kidney pathology segmentation hinders progress in the field.
To address this, we organized the Kidney Pathology Image Segmentation (KPIs) Challenge, introducing a dataset that incorporates preclinical rodent models of CKD with over 10,000 annotated glomeruli from 60+ Periodic Acid-Schiff (PAS)-stained whole slide images. The challenge includes two tasks, patch-level segmentation and whole-slide image segmentation and detection, evaluated using the Dice Similarity Coefficient (DSC) and F1-score. By encouraging innovative segmentation methods that adapt to diverse CKD models and tissue conditions, the KPIs Challenge aims to advance kidney pathology analysis, establish new benchmarks, and enable precise, large-scale quantification for disease research and diagnosis.
Submitted 11 February, 2025; originally announced February 2025.
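
The challenge's primary metric, the Dice Similarity Coefficient, is simple enough to state exactly: \(DSC = 2|A \cap B| / (|A| + |B|)\) for predicted and ground-truth masks. A minimal sketch with toy masks:

```python
import numpy as np

def dice_coefficient(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7):
    """Dice Similarity Coefficient for binary masks: 2|A & B| / (|A| + |B|)."""
    pred, target = pred.astype(bool), target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

# Toy 4x4 masks: 3 positive pixels each, overlapping on 2 of them.
pred   = np.array([[1,1,0,0],[0,1,0,0],[0,0,0,0],[0,0,0,0]])
target = np.array([[1,1,0,0],[0,0,0,0],[0,0,1,0],[0,0,0,0]])
print(round(dice_coefficient(pred, target), 3))  # 2*2 / (3+3) = 0.667
```
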

arXiv:2502.07286 (https://arxiv.org/abs/2502.07286) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Small Language Model Makes an Effective Long Text Extractor
Authors: Yelin Chen, Fanjin Zhang, Jie Tang
Abstract: Named Entity Recognition (NER) is a fundamental problem in natural language processing (NLP). However, the task of extracting longer entity spans (e.g., awards) from extended texts (e.g., homepages) is barely explored. Current NER methods predominantly fall into two categories: span-based methods and generation-based methods. Span-based methods require the enumeration of all possible token-pair spans, followed by classification of each span, resulting in substantial redundant computation and excessive GPU memory usage. In contrast, generation-based methods involve prompting or fine-tuning large language models (LLMs) to adapt to downstream NER tasks. However, these methods struggle with the accurate generation of longer spans and often incur significant time costs for effective fine-tuning. To address these challenges, this paper introduces a lightweight span-based NER method called SeNER, which incorporates a bidirectional arrow attention mechanism coupled with LogN-Scaling on the [CLS] token to embed long texts effectively, and comprises a novel bidirectional sliding-window plus-shaped attention (BiSPA) mechanism to significantly reduce redundant candidate token-pair spans and simultaneously model interactions between token-pair spans. Extensive experiments demonstrate that our method achieves state-of-the-art extraction accuracy on three long NER datasets and is capable of extracting entities from long texts in a GPU-memory-friendly manner. Code: https://github.com/THUDM/scholar-profiling/tree/main/sener
Submitted 11 February, 2025; originally announced February 2025.
Comments: AAAI'25, 9 pages, 1 appendix page
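
The LogN-Scaling mentioned in the SeNER entry is a known length-extrapolation trick: rescale attention logits by \(\log n / \log m\) so attention entropy stays roughly stable when the inference length \(n\) exceeds the training length \(m\). The sketch below shows that generic form; it is an assumption-level illustration, not SeNER's exact implementation or its arrow/BiSPA attention.

```python
import math
import torch

def logn_scaled_scores(q, k, train_len: int):
    """Attention logits scaled by log(n)/log(m) when n exceeds train_len m."""
    n = q.shape[-2]
    scale = max(1.0, math.log(n) / math.log(train_len))
    return scale * (q @ k.transpose(-2, -1)) / q.shape[-1] ** 0.5

q = torch.randn(1, 2048, 64)   # 2k tokens at inference
k = torch.randn(1, 2048, 64)
scores = logn_scaled_scores(q, k, train_len=512)  # trained on 512-token windows
print(scores.shape)  # torch.Size([1, 2048, 2048])
```
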

arXiv:2502.07237 (https://arxiv.org/abs/2502.07237) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language); q-bio.BM (Biomolecules); stat.ML (Machine Learning)
Title: DrugImproverGPT: A Large Language Model for Drug Optimization with Fine-Tuning via Structured Policy Optimization
Authors: Xuefeng Liu, Songhao Jiang, Siyu Chen, Zhuoran Yang, Yuxin Chen, Ian Foster, Rick Stevens
Abstract: Fine-tuning a Large Language Model (LLM) is crucial for generating results towards specific objectives. This research delves into the realm of drug optimization and introduces a novel reinforcement learning algorithm to fine-tune a drug optimization LLM-based generative model, enhancing the original drug across target objectives while retaining the beneficial chemical properties of the original drug. This work comprises two primary components: (1) DrugImprover, a framework tailored for improving robustness and efficiency in drug optimization. It includes an LLM designed for drug optimization and a novel Structured Policy Optimization (SPO) algorithm, which is theoretically grounded. This algorithm offers a unique perspective for fine-tuning the LLM-based generative model by aligning the improvement of the generated molecule with the input molecule under desired objectives. (2) A dataset of one million compounds, each with OEDOCK docking scores on five human proteins associated with cancer cells and 24 binding sites from the SARS-CoV-2 virus. We conduct a comprehensive evaluation of SPO and demonstrate its effectiveness in improving the original drug across target properties. Our code and dataset will be publicly available at: https://github.com/xuefeng-cs/DrugImproverGPT.
Submitted 10 February, 2025; originally announced February 2025.
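
The paper's Structured Policy Optimization is its own algorithm and is not reproduced here. For orientation only, the sketch below is a plain REINFORCE loop on a toy per-position sequence policy, showing just the general shape of reward-driven fine-tuning of a generative model; the reward function and the degenerate policy are made up.

```python
import torch

torch.manual_seed(0)
vocab, seq_len = 16, 8
# Degenerate "policy": independent per-position logits standing in for an LLM.
logits = torch.zeros(seq_len, vocab, requires_grad=True)
opt = torch.optim.Adam([logits], lr=0.1)

def reward(tokens):
    # Hypothetical objective: fraction of even token ids ("improved property").
    return (tokens % 2 == 0).float().mean()

for step in range(200):
    dist = torch.distributions.Categorical(logits=logits)
    tokens = dist.sample()                              # one sampled "molecule"
    r = reward(tokens)
    loss = -(r - 0.5) * dist.log_prob(tokens).sum()     # REINFORCE, baseline 0.5
    opt.zero_grad(); loss.backward(); opt.step()

print(reward(torch.distributions.Categorical(logits=logits).sample()))
```
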

arXiv:2502.07222 (https://arxiv.org/abs/2502.07222) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: A Memory Efficient Randomized Subspace Optimization Method for Training Large Language Models
Authors: Yiming Chen, Yuan Zhang, Yin Liu, Kun Yuan, Zaiwen Wen
Abstract: The memory challenges associated with training Large Language Models (LLMs) have become a critical concern, particularly when using the Adam optimizer. To address this issue, numerous memory-efficient techniques have been proposed, with GaLore standing out as a notable example designed to reduce the memory footprint of optimizer states. However, these approaches do not alleviate the memory burden imposed by activations, rendering them unsuitable for scenarios involving long context sequences or large mini-batches. Moreover, their convergence properties are still not well understood in the literature. In this work, we introduce a Randomized Subspace Optimization framework for pre-training and fine-tuning LLMs. Our approach decomposes the high-dimensional training problem into a series of lower-dimensional subproblems. At each iteration, a random subspace is selected, and the parameters within that subspace are optimized. This structured reduction in dimensionality allows our method to simultaneously reduce memory usage for both activations and optimizer states. We establish comprehensive convergence guarantees and derive rates for various scenarios, accommodating different optimization strategies to solve the subproblems. Extensive experiments validate the superior memory and communication efficiency of our method, which achieves performance comparable to GaLore and Adam.
Submitted 10 February, 2025; originally announced February 2025.
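
The iteration described above (pick a random subspace, optimize only its coordinates) is easy to show on a toy quadratic. This minimal sketch keeps only a k-dimensional optimizer state per outer step; the objective, step sizes, and subspace basis are illustrative, not the paper's LLM setup.

```python
import numpy as np

rng = np.random.default_rng(0)
d, k = 1000, 20                      # full dimension, subspace dimension
A = rng.normal(size=(d, d)) / np.sqrt(d)
b = rng.normal(size=d)
f = lambda x: 0.5 * np.sum((A @ x - b) ** 2)   # toy quadratic objective
grad = lambda x: A.T @ (A @ x - b)

x = np.zeros(d)
for it in range(200):
    P = rng.normal(size=(d, k)) / np.sqrt(d)   # random subspace basis (~orthonormal cols)
    # Subproblem: minimize f(x + P z) over k-dim coordinates z with a few
    # gradient steps; only k-dimensional optimizer state is needed.
    z = np.zeros(k)
    for _ in range(5):
        z -= 0.1 * P.T @ grad(x + P @ z)       # chain rule: grad_z = P^T grad f
    x = x + P @ z

print(f(np.zeros(d)), "->", f(x))              # slow but memory-light progress
```
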

arXiv:2502.07218 (https://arxiv.org/abs/2502.07218) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: LUNAR: LLM Unlearning via Neural Activation Redirection
Authors: William F. Shen, Xinchi Qiu, Meghdad Kurmanji, Alex Iacob, Lorenzo Sani, Yihong Chen, Nicola Cancedda, Nicholas D. Lane
Abstract: Large Language Models (LLMs) benefit from training on ever larger amounts of textual data, but as a result, they increasingly incur the risk of leaking private information. The ability to selectively remove knowledge from LLMs is, therefore, a highly desirable capability. In this paper, we propose LUNAR, a novel unlearning methodology grounded in the Linear Representation Hypothesis. LUNAR operates by redirecting the representations of unlearned data to regions that trigger the model's inherent ability to express its inability to answer. LUNAR achieves state-of-the-art unlearning performance while significantly enhancing the controllability of the unlearned model during inference. Specifically, LUNAR achieves 2.9x to 11.7x improvements on the combined "unlearning efficacy" and "model utility" score ("Deviation Score") on the PISTOL dataset across various base models. We also demonstrate, through quantitative analysis and qualitative examples, LUNAR's superior controllability in generating coherent and contextually aware responses, mitigating undesired side effects of existing methods. Moreover, we demonstrate that LUNAR is robust against white-box adversarial attacks and versatile in handling real-world scenarios, such as processing sequential unlearning requests.
Submitted 10 February, 2025; originally announced February 2025.
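
Redirecting internal representations, as the LUNAR abstract describes, can be mocked up with a forward hook that projects hidden states onto a chosen direction. Everything here is a placeholder: a toy MLP instead of an LLM, and a random "refusal" direction instead of LUNAR's learned target region.

```python
import torch
import torch.nn as nn

# Toy network standing in for a transformer; LUNAR targets specific LLM layers.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))
refusal_direction = torch.randn(32)          # hypothetical "cannot answer" region
refusal_direction /= refusal_direction.norm()

def redirect_hook(module, inputs, output):
    # Replace hidden states with their projection onto the refusal direction,
    # steering forgotten inputs toward "I can't answer" behavior.
    coeff = output @ refusal_direction
    return coeff[:, None] * refusal_direction

handle = model[1].register_forward_hook(redirect_hook)  # hook after the ReLU
x_forget = torch.randn(4, 16)
print(model(x_forget).shape)   # forward pass now flows through redirected acts
handle.remove()                # ordinary inputs can run without the redirection
```
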
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07062">arXiv:2502.07062</a> <span> [<a href="https://arxiv.org/pdf/2502.07062">pdf</a>, <a href="https://arxiv.org/format/2502.07062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Breaking Barriers: Combinatorial Algorithms for Non-monotone Submodular Maximization with Sublinear Adaptivity and $1/e$ Approximation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yixin Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenjing Chen</a>, <a href="/search/cs?searchtype=author&query=Kuhnle%2C+A">Alan Kuhnle</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07062v1-abstract-full" style="display: inline;"> With the rapid growth of data in modern applications, parallel combinatorial algorithms for maximizing non-monotone submodular functions have gained significant attention. The state-of-the-art approximation ratio of $1/e$ is currently achieved only by a continuous algorithm (Ene & Nguyen, 2020) with adaptivity $\mathcal O(\log(n))$. In this work, we focus on size constraints and propose a $(1/4-\varepsilon)$-approximation algorithm with high probability for this problem, as well as the first randomized parallel combinatorial algorithm achieving a $1/e-\varepsilon$ approximation ratio, which bridges the gap between continuous and combinatorial approaches. Both algorithms achieve $\mathcal O(\log(n)\log(k))$ adaptivity and $\mathcal O(n\log(n)\log(k))$ query complexity. Empirical results show our algorithms achieve competitive objective values, with the first algorithm particularly efficient in queries. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
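<p>For contrast with the parallel algorithms above, here is a tiny sequential baseline for size-constrained non-monotone maximization: the classic random-greedy heuristic on a toy graph-cut objective. It is not the low-adaptivity algorithm from the paper; the graph and the positive-gain guard are illustrative choices.</p> <pre><code class="language-python">
import random

random.seed(0)
n, k = 30, 5
edges = [(random.randrange(n), random.randrange(n)) for _ in range(120)]

def cut(S):
    # Graph cut: a standard non-monotone submodular function.
    return sum(1 for u, v in edges if (u in S) != (v in S))

S = set()
for _ in range(k):
    # Rank remaining elements by marginal gain, sample one of the top k.
    gains = sorted(((cut(S | {e}) - cut(S), e) for e in range(n) if e not in S),
                   reverse=True)[:k]
    gain, e = random.choice(gains)
    if gain > 0:
        S.add(e)
print("selected:", sorted(S), "cut value:", cut(S))
</code></pre>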
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06994">arXiv:2502.06994</a> <span> [<a href="https://arxiv.org/pdf/2502.06994">pdf</a>, <a href="https://arxiv.org/format/2502.06994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SyncMind: Measuring Agent Out-of-Sync Recovery in Collaborative Software Engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xuehang Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xingyao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sha Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C">Chi Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Manling Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+H">Heng Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06994v1-abstract-full" style="display: inline;"> Software engineering (SE) is increasingly collaborative, with developers working together on shared complex codebases. Effective collaboration in shared environments requires participants -- whether humans or AI agents -- to stay on the same page as their environment evolves. When a collaborator's understanding diverges from the current state -- what we term the out-of-sync challenge -- the collaborator's actions may fail, leading to integration issues. In this work, we introduce SyncMind, a framework that systematically defines the out-of-sync problem faced by large language model (LLM) agents in collaborative software engineering (CSE). Based on SyncMind, we create SyncBench, a benchmark featuring 24,332 instances of agent out-of-sync scenarios in real-world CSE derived from 21 popular GitHub repositories with executable verification tests. Experiments on SyncBench uncover critical insights into existing LLM agents' capabilities and limitations. Besides substantial performance gaps among agents (from Llama-3.1 agent <= 3.33% to Claude-3.5-Sonnet >= 28.18%), their consistently low collaboration willingness (<= 4.86%) suggests fundamental limitations of existing LLMs in CSE.
However, when collaboration occurs, it positively correlates with out-of-sync recovery success. Minimal performance differences in agents' resource-aware out-of-sync recoveries further reveal their significant lack of resource awareness and adaptability, shedding light on future resource-efficient collaborative systems. Code and data are openly available on our project website: https://xhguo7.github.io/SyncMind/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06847">arXiv:2502.06847</a> <span> [<a href="https://arxiv.org/pdf/2502.06847">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> A Deep Learning Framework Integrating CNN and BiLSTM for Financial Systemic Risk Analysis and Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhen Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhan Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhenghao Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinsong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06847v1-abstract-full" style="display: inline;"> This study proposes a deep learning model based on the combination of convolutional neural network (CNN) and bidirectional long short-term memory network (BiLSTM) for discriminant analysis of financial systemic risk. The model first uses CNN to extract local patterns of multidimensional features of financial markets, and then models the bidirectional dependency of time series through BiLSTM, to comprehensively characterize the changing laws of systemic risk in spatial features and temporal dynamics. The experiment is based on real financial data sets. The results show that the model is significantly superior to traditional single models (such as BiLSTM, CNN, Transformer, and TCN) in terms of accuracy, recall, and F1 score. The F1-score reaches 0.88, showing extremely high discriminant ability. This shows that the joint strategy of combining CNN and BiLSTM can not only fully capture the complex patterns of market data but also effectively deal with the long-term dependency problem in time series data. In addition, this study also explores the robustness of the model in dealing with data noise and processing high-dimensional data, providing strong support for intelligent financial risk management. In the future, the research will further optimize the model structure, introduce methods such as reinforcement learning and multimodal data analysis, and improve the efficiency and generalization ability of the model to cope with a more complex financial environment. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
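<p>A minimal sketch of the CNN-then-BiLSTM pattern the abstract describes (layer sizes and the binary head are illustrative, not the paper's configuration):</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class CNNBiLSTM(nn.Module):
    def __init__(self, n_features, n_classes=2):
        super().__init__()
        # CNN extracts local patterns across the time axis.
        self.conv = nn.Sequential(
            nn.Conv1d(n_features, 32, kernel_size=3, padding=1), nn.ReLU())
        # BiLSTM models bidirectional temporal dependency.
        self.lstm = nn.LSTM(32, 64, batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * 64, n_classes)

    def forward(self, x):                      # x: (batch, time, features)
        h = self.conv(x.transpose(1, 2))       # Conv1d wants (batch, ch, time)
        out, _ = self.lstm(h.transpose(1, 2))  # back to (batch, time, ch)
        return self.head(out[:, -1])           # last step -> risk logits

model = CNNBiLSTM(n_features=8)
print(model(torch.randn(4, 30, 8)).shape)      # torch.Size([4, 2])
</code></pre>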
href="/search/cs?searchtype=author&query=You%2C+X">Xinge You</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Changxin Gao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+N">Nong Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06823v1-abstract-short" style="display: inline;"> In web data, advertising images are crucial for capturing user attention and improving advertising effectiveness. Most existing methods generate background for products primarily focus on the aesthetic quality, which may fail to achieve satisfactory online performance. To address this limitation, we explore the use of Multimodal Large Language Models (MLLMs) for generating advertising images by op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06823v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06823v1-abstract-full" style="display: none;"> In web data, advertising images are crucial for capturing user attention and improving advertising effectiveness. Most existing methods generate background for products primarily focus on the aesthetic quality, which may fail to achieve satisfactory online performance. To address this limitation, we explore the use of Multimodal Large Language Models (MLLMs) for generating advertising images by optimizing for Click-Through Rate (CTR) as the primary objective. Firstly, we build targeted pre-training tasks, and leverage a large-scale e-commerce multimodal dataset to equip MLLMs with initial capabilities for advertising image generation tasks. To further improve the CTR of generated images, we propose a novel reward model to fine-tune pre-trained MLLMs through Reinforcement Learning (RL), which can jointly utilize multimodal features and accurately reflect user click preferences. Meanwhile, a product-centric preference optimization strategy is developed to ensure that the generated background content aligns with the product characteristics after fine-tuning, enhancing the overall relevance and effectiveness of the advertising images. Extensive experiments have demonstrated that our method achieves state-of-the-art performance in both online and offline metrics. Our code and pre-trained models are publicly available at: https://github.com/Chenguoz/CAIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06823v1-abstract-full').style.display = 'none'; document.getElementById('2502.06823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WWW 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06419">arXiv:2502.06419</a> <span> [<a href="https://arxiv.org/pdf/2502.06419">pdf</a>, <a href="https://arxiv.org/format/2502.06419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Occ-LLM: Enhancing Autonomous Driving with Occupancy-Based Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianshuo Xu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hao Lu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingcong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06419v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have made substantial advancements in the field of robotic and autonomous driving. This study presents the first Occupancy-based Large Language Model (Occ-LLM), which represents a pioneering effort to integrate LLMs with an important representation. To effectively encode occupancy as input for the LLM and address the category imbalances associated with occupancy, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06419v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06419v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have made substantial advancements in the field of robotic and autonomous driving. This study presents the first Occupancy-based Large Language Model (Occ-LLM), which represents a pioneering effort to integrate LLMs with an important representation. To effectively encode occupancy as input for the LLM and address the category imbalances associated with occupancy, we propose Motion Separation Variational Autoencoder (MS-VAE). This innovative approach utilizes prior knowledge to distinguish dynamic objects from static scenes before inputting them into a tailored Variational Autoencoder (VAE). This separation enhances the model's capacity to concentrate on dynamic trajectories while effectively reconstructing static scenes. The efficacy of Occ-LLM has been validated across key tasks, including 4D occupancy forecasting, self-ego planning, and occupancy-based scene question answering. Comprehensive evaluations demonstrate that Occ-LLM significantly surpasses existing state-of-the-art methodologies, achieving gains of about 6\% in Intersection over Union (IoU) and 4\% in mean Intersection over Union (mIoU) for the task of 4D occupancy forecasting. These findings highlight the transformative potential of Occ-LLM in reshaping current paradigms within robotic and autonomous driving. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06419v1-abstract-full').style.display = 'none'; document.getElementById('2502.06419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in 2025 IEEE International Conference on Robotics and Automation (ICRA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06167">arXiv:2502.06167</a> <span> [<a href="https://arxiv.org/pdf/2502.06167">pdf</a>, <a href="https://arxiv.org/format/2502.06167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Universal Approximation of Visual Autoregressive Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06167v1-abstract-short" style="display: inline;"> We investigate the fundamental limits of transformer-based foundation models, extending our analysis to include Visual Autoregressive (VAR) transformers. VAR represents a big step toward generating images using a novel, scalable, coarse-to-fine ``next-scale prediction'' framework. These models set a new quality bar, outperforming all previous methods, including Diffusion Transformers, while having… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06167v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06167v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06167v1-abstract-full" style="display: none;"> We investigate the fundamental limits of transformer-based foundation models, extending our analysis to include Visual Autoregressive (VAR) transformers. VAR represents a big step toward generating images using a novel, scalable, coarse-to-fine ``next-scale prediction'' framework. These models set a new quality bar, outperforming all previous methods, including Diffusion Transformers, while having state-of-the-art performance for image synthesis tasks. 
Our primary contribution establishes that a single-head VAR transformer with one self-attention layer and one interpolation layer is universal. From the statistical perspective, we prove that such simple VAR transformers are universal approximators for any image-to-image Lipschitz function. Furthermore, we demonstrate that flow-based autoregressive transformers inherit similar approximation capabilities. Our results provide important design principles for effective and computationally efficient VAR Transformer strategies that can be used to extend their utility to more sophisticated VAR models in image generation and other related areas. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06150">arXiv:2502.06150</a> <span> [<a href="https://arxiv.org/pdf/2502.06150">pdf</a>, <a href="https://arxiv.org/format/2502.06150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Public Health Text Annotation: Zero-Shot Learning vs. Crowdsourcing for Improved Efficiency and Labeling Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kazari%2C+K">Kamyar Kazari</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/cs?searchtype=author&query=Shakeri%2C+Z">Zahra Shakeri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06150v1-abstract-full" style="display: inline;"> Public health researchers are increasingly interested in using social media data to study health-related behaviors, but manually labeling this data can be labor-intensive and costly. This study explores whether zero-shot labeling using large language models (LLMs) can match or surpass conventional crowd-sourced annotation for Twitter posts related to sleep disorders, physical activity, and sedentary behavior. Multiple annotation pipelines were designed to compare labels produced by domain experts, crowd workers, and LLM-driven approaches under varied prompt-engineering strategies. Our findings indicate that LLMs can rival human performance in straightforward classification tasks and significantly reduce labeling time, yet their accuracy diminishes for tasks requiring more nuanced domain knowledge. These results clarify the trade-offs between automated scalability and human expertise, demonstrating conditions under which LLM-based labeling can be efficiently integrated into public health research without undermining label quality. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 1 figure</span> </p> </li>
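<p>A schematic zero-shot labeling pipeline of the kind compared above; <code>call_llm</code> is a placeholder for whatever chat-completion client is used (not a real library call), and the label set and prompt are illustrative:</p> <pre><code class="language-python">
LABELS = ["sleep disorder", "physical activity", "sedentary behavior", "none"]

PROMPT = ("Classify the tweet into exactly one of: {labels}.\n"
          "Tweet: {tweet}\n"
          "Answer with the label only.")

def call_llm(prompt: str) -> str:
    # Hypothetical wrapper: plug in an actual LLM client here.
    raise NotImplementedError

def zero_shot_label(tweet: str) -> str:
    raw = call_llm(PROMPT.format(labels=", ".join(LABELS), tweet=tweet))
    answer = raw.strip().lower()
    return answer if answer in LABELS else "none"   # guard against label drift
</code></pre>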
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06061">arXiv:2502.06061</a> <span> [<a href="https://arxiv.org/pdf/2502.06061">pdf</a>, <a href="https://arxiv.org/format/2502.06061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Online Reward-Weighted Fine-Tuning of Flow Matching with Wasserstein Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiajun Fan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shuaike Shen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+C">Chaoran Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chumeng Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Ge Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06061v1-abstract-full" style="display: inline;"> Recent advancements in reinforcement learning (RL) have achieved great success in fine-tuning diffusion-based generative models. However, fine-tuning continuous flow-based generative models to align with arbitrary user-defined reward functions remains challenging, particularly due to issues such as policy collapse from overoptimization and the prohibitively high computational cost of likelihoods in continuous-time flows. In this paper, we propose an easy-to-use and theoretically sound RL fine-tuning method, which we term Online Reward-Weighted Conditional Flow Matching with Wasserstein-2 Regularization (ORW-CFM-W2). Our method integrates RL into the flow matching framework to fine-tune generative models with arbitrary reward functions, without relying on gradients of rewards or filtered datasets. By introducing an online reward-weighting mechanism, our approach guides the model to prioritize high-reward regions in the data manifold. To prevent policy collapse and maintain diversity, we incorporate Wasserstein-2 (W2) distance regularization into our method and derive a tractable upper bound for it in flow matching, effectively balancing exploration and exploitation of policy optimization. We provide theoretical analyses to demonstrate the convergence properties and induced data distributions of our method, establishing connections with traditional RL algorithms featuring Kullback-Leibler (KL) regularization and offering a more comprehensive understanding of the underlying mechanisms and learning behavior of our approach. Extensive experiments on tasks including target image generation, image compression, and text-image alignment demonstrate the effectiveness of our method, where our method achieves optimal policy convergence while allowing controllable trade-offs between reward maximization and diversity preservation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">61 pages</span> </p> </li>
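<p>A schematic reward-weighted conditional flow matching step (a simplification: the Wasserstein-2 bound from the paper is replaced here by a crude penalty toward a frozen reference velocity field, and all shapes and coefficients are illustrative):</p> <pre><code class="language-python">
import torch
import torch.nn as nn

def orw_cfm_step(v_theta, v_ref, x0, x1, reward, beta=1.0, lam=0.1):
    t = torch.rand(x0.size(0), 1)                 # random time per sample
    xt = (1 - t) * x0 + t * x1                    # linear interpolation path
    target = x1 - x0                              # velocity of that path
    inp = torch.cat([xt, t], dim=1)
    pred = v_theta(inp)
    w = torch.exp(reward / beta).detach()         # online reward weights
    cfm = (w * (pred - target).pow(2).sum(dim=1)).mean()
    reg = (pred - v_ref(inp).detach()).pow(2).mean()  # stand-in for W2 term
    return cfm + lam * reg

dim = 4
v_theta = nn.Sequential(nn.Linear(dim + 1, 64), nn.Tanh(), nn.Linear(64, dim))
v_ref = nn.Sequential(nn.Linear(dim + 1, 64), nn.Tanh(), nn.Linear(64, dim))
x0, x1 = torch.randn(8, dim), torch.randn(8, dim)
loss = orw_cfm_step(v_theta, v_ref, x0, x1, reward=torch.randn(8))
loss.backward()
</code></pre>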
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">61 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05879">arXiv:2502.05879</a> <span> [<a href="https://arxiv.org/pdf/2502.05879">pdf</a>, <a href="https://arxiv.org/format/2502.05879">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Depression Detection with Chain-of-Thought Prompting: From Emotion to Reasoning Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Teng%2C+S">Shiyu Teng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+R+K">Rahul Kumar Jain</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+S">Shurong Chai</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+R">Ruibo Hou</a>, <a href="/search/cs?searchtype=author&query=Tateyama%2C+T">Tomoko Tateyama</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Lanfen Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yen-wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05879v1-abstract-short" style="display: inline;"> Depression is one of the leading causes of disability worldwide, posing a severe burden on individuals, healthcare systems, and society at large. Recent advancements in Large Language Models (LLMs) have shown promise in addressing mental health challenges, including the detection of depression through text-based analysis. However, current LLM-based methods often struggle with nuanced symptom ident… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05879v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05879v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05879v1-abstract-full" style="display: none;"> Depression is one of the leading causes of disability worldwide, posing a severe burden on individuals, healthcare systems, and society at large. Recent advancements in Large Language Models (LLMs) have shown promise in addressing mental health challenges, including the detection of depression through text-based analysis. However, current LLM-based methods often struggle with nuanced symptom identification and lack a transparent, step-by-step reasoning process, making it difficult to accurately classify and explain mental health conditions. To address these challenges, we propose a Chain-of-Thought Prompting approach that enhances both the performance and interpretability of LLM-based depression detection. Our method breaks down the detection process into four stages: (1) sentiment analysis, (2) binary depression classification, (3) identification of underlying causes, and (4) assessment of severity. 
By guiding the model through these structured reasoning steps, we improve interpretability and reduce the risk of overlooking subtle clinical indicators. We validate our method on the E-DAIC dataset, where we test multiple state-of-the-art large language models. Experimental results indicate that our Chain-of-Thought Prompting technique yields superior performance in both classification accuracy and the granularity of diagnostic insights, compared to baseline approaches. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
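<p>A schematic version of the four-stage pipeline named above; <code>ask</code> is a placeholder for an LLM call, not a real API, and the stage instructions are paraphrased assumptions:</p> <pre><code class="language-python">
STAGES = [
    ("sentiment", "Describe the overall sentiment of this transcript."),
    ("depression", "Given the analysis so far, answer yes or no: does the "
                   "speaker show signs of depression?"),
    ("causes", "If yes, list the likely underlying causes mentioned."),
    ("severity", "Rate the severity: none, mild, moderate, or severe."),
]

def ask(prompt: str) -> str:
    # Hypothetical LLM client wrapper.
    raise NotImplementedError

def detect(transcript: str) -> dict:
    context, results = transcript, {}
    for name, instruction in STAGES:
        reply = ask(f"{context}\n\n{instruction}")
        results[name] = reply
        context += f"\n[{name}] {reply}"   # carry reasoning into the next stage
    return results
</code></pre>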
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05708">arXiv:2502.05708</a> <span> [<a href="https://arxiv.org/pdf/2502.05708">pdf</a>, <a href="https://arxiv.org/format/2502.05708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GWRF: A Generalizable Wireless Radiance Field for Wireless Signal Propagation Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kang Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuning Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Wan Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.05708v1-abstract-full" style="display: inline;"> We present Generalizable Wireless Radiance Fields (GWRF), a framework for modeling wireless signal propagation at arbitrary 3D transmitter and receiver positions. Unlike previous methods that adapt vanilla Neural Radiance Fields (NeRF) from the optical to the wireless signal domain, requiring extensive per-scene training, GWRF generalizes effectively across scenes. First, a geometry-aware Transformer encoder-based wireless scene representation module incorporates information from geographically proximate transmitters to learn a generalizable wireless radiance field. Second, a neural-driven ray tracing algorithm operates on this field to automatically compute signal reception at the receiver. Experimental results demonstrate that GWRF outperforms existing methods on single scenes and achieves state-of-the-art performance on unseen scenes. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05567">arXiv:2502.05567</a> <span> [<a href="https://arxiv.org/pdf/2502.05567">pdf</a>, <a href="https://arxiv.org/format/2502.05567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ATLAS: Autoformalizing Theorems through Lifting, Augmentation, and Synthesis of Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+K">Kangjie Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiashuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunqi Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuntian Liu</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yang Jiao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+T">Tao Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.05567v1-abstract-full" style="display: inline;"> Autoformalization, the process of automatically translating natural language mathematics into machine-verifiable formal language, has demonstrated advancements with the progress of large language models (LLMs). However, a key obstacle to further advancements is the scarcity of paired datasets that align natural language with formal language. To address this challenge, we introduce ATLAS (Autoformalizing Theorems through Lifting, Augmentation, and Synthesis of Data), an iterative data generation framework designed to produce large-scale, high-quality parallel theorem statements. With the proposed ATLAS running for 10 iterations, we construct an undergraduate-level dataset comprising 300k theorem statements and develop the ATLAS translator, achieving accuracies of 80.59% (pass@8) and 92.99% (pass@128) on ProofNet, significantly outperforming the base model (23.99% and 47.17%) and InternLM2-Math-Plus-7B (50.94% and 80.32%). Furthermore, the ATLAS translator also achieves state-of-the-art performance on both the high-school-level miniF2F dataset and the graduate-level MathQual dataset introduced in this work. The datasets, model, and code will be released to the public soon. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05555">arXiv:2502.05555</a> <span> [<a href="https://arxiv.org/pdf/2502.05555">pdf</a>, <a href="https://arxiv.org/format/2502.05555">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Reinforcement Learning Through Adaptively Pretrained Visual Encoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+G">Guangfu Hao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Liangxuan Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.05555v1-abstract-full" style="display: inline;"> While Reinforcement Learning (RL) agents can successfully learn to handle complex tasks, effectively generalizing acquired skills to unfamiliar settings remains a challenge. One of the reasons behind this is that the visual encoders used are task-dependent, preventing effective feature extraction in different settings. To address this issue, recent studies have tried to pretrain encoders with diverse visual inputs in order to improve their performance. However, they rely on existing pretrained encoders without further exploring the impact of the pretraining period. In this work, we propose APE: efficient reinforcement learning through Adaptively Pretrained visual Encoder -- a framework that utilizes an adaptive augmentation strategy during the pretraining phase and extracts generalizable features with only a few interactions within the task environments in the policy learning period. Experiments are conducted across various domains, including DeepMind Control Suite, Atari Games and Memory Maze benchmarks, to verify the effectiveness of our method. Results show that mainstream RL methods, such as DreamerV3 and DrQ-v2, achieve state-of-the-art performance when equipped with APE. In addition, APE significantly improves the sampling efficiency using only visual inputs during learning, approaching the efficiency of state-based methods in several control tasks. These findings demonstrate the potential of adaptive encoder pretraining in enhancing the generalization ability and efficiency of visual RL algorithms. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05503">arXiv:2502.05503</a> <span> [<a href="https://arxiv.org/pdf/2502.05503">pdf</a>, <a href="https://arxiv.org/format/2502.05503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Physical Coherence Benchmark for Evaluating Video Generation Models via Optical Flow-guided Frame Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongfan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiuwen Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianyu Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05503v1-abstract-short" style="display: inline;"> Recent advances in video generation models demonstrate their potential as world simulators, but they often struggle with videos deviating from physical laws, a key concern overlooked by most text-to-video benchmarks. We introduce a benchmark designed specifically to assess the Physical Coherence of generated videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of physical p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05503v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05503v1-abstract-full" style="display: none;"> Recent advances in video generation models demonstrate their potential as world simulators, but they often struggle with videos deviating from physical laws, a key concern overlooked by most text-to-video benchmarks. We introduce a benchmark designed specifically to assess the Physical Coherence of generated videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of physical principles, capturing key physical laws observable in video content. We evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and conducted manual assessments. Additionally, we propose an automated evaluation model: PhyCoPredictor, a diffusion model that generates optical flow and video frames in a cascade manner. Through a consistency evaluation comparing automated and manual sorting, the experimental results show that PhyCoPredictor currently aligns most closely with human evaluation. Therefore, it can effectively evaluate the physical coherence of videos, providing insights for future model optimization. Our benchmark, which includes physical coherence prompts, automatic evaluation tool PhyCoPredictor, and generated video dataset, will all be released on GitHub shortly. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05503v1-abstract-full').style.display = 'none'; document.getElementById('2502.05503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05450">arXiv:2502.05450</a> <span> [<a href="https://arxiv.org/pdf/2502.05450">pdf</a>, <a href="https://arxiv.org/format/2502.05450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ConRFT: A Reinforced Fine-tuning Method for VLA Models via Consistency Policy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuhui Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shuai Tian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shugao Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yingting Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongbin Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05450v1-abstract-short" style="display: inline;"> Vision-Language-Action (VLA) models have shown substantial potential in real-world robotic manipulation. However, fine-tuning these models through supervised learning struggles to achieve robust performance due to limited, inconsistent demonstrations, especially in contact-rich environments. In this paper, we propose a reinforced fine-tuning approach for VLA models, named ConRFT, which consists of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05450v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05450v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05450v1-abstract-full" style="display: none;"> Vision-Language-Action (VLA) models have shown substantial potential in real-world robotic manipulation. However, fine-tuning these models through supervised learning struggles to achieve robust performance due to limited, inconsistent demonstrations, especially in contact-rich environments. In this paper, we propose a reinforced fine-tuning approach for VLA models, named ConRFT, which consists of offline and online fine-tuning with a unified consistency-based training objective, to address these challenges. In the offline stage, our method integrates behavior cloning and Q-learning to effectively extract policy from a small set of demonstrations and stabilize value estimating. In the online stage, the VLA model is further fine-tuned via consistency policy, with human interventions to ensure safe exploration and high sample efficiency. We evaluate our approach on eight diverse real-world manipulation tasks. 
It achieves an average success rate of 96.3% within 45-90 minutes of online fine-tuning, outperforming prior supervised methods with a 144% improvement in success rate and 1.9x shorter episode length. This work highlights the potential of integrating reinforcement learning to enhance the performance of VLA models for real-world robotic applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
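<p>A much-simplified sketch of the offline stage's "behavior cloning plus Q-learning" combination (the actual consistency-based objective and networks are in the paper; the modules, shapes, and weighting here are illustrative assumptions):</p> <pre><code class="language-python">
import torch
import torch.nn as nn

obs_dim, act_dim = 6, 3
policy = nn.Linear(obs_dim, act_dim)            # toy deterministic policy
q_net = nn.Linear(obs_dim + act_dim, 1)         # toy Q function

def offline_loss(obs, act, rew, next_obs, gamma=0.99, alpha=0.5):
    bc = (policy(obs) - act).pow(2).mean()      # imitate demonstrations
    with torch.no_grad():                       # bootstrap target (no separate
        nxt = torch.cat([next_obs, policy(next_obs)], dim=1)  # target network)
        target = rew + gamma * q_net(nxt)
    td = (q_net(torch.cat([obs, act], dim=1)) - target).pow(2).mean()
    return alpha * bc + (1 - alpha) * td        # BC term steadies the policy,
                                                # TD term fits the value

B = 16
loss = offline_loss(torch.randn(B, obs_dim), torch.randn(B, act_dim),
                    torch.randn(B, 1), torch.randn(B, obs_dim))
loss.backward()
</code></pre>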
Turkkan</a>, <a href="/search/cs?searchtype=author&query=Vanloo%2C+G">Gerard Vanloo</a>, <a href="/search/cs?searchtype=author&query=Nidd%2C+M">Michael Nidd</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Ting Dai</a>, <a href="/search/cs?searchtype=author&query=Chatterjee%2C+O">Oishik Chatterjee</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Pranjal Gupta</a>, <a href="/search/cs?searchtype=author&query=Samanta%2C+S">Suranjana Samanta</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+P">Pooja Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+R">Rong Lee</a>, <a href="/search/cs?searchtype=author&query=Murali%2C+P">Pavankumar Murali</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05352v1-abstract-short" style="display: inline;"> Realizing the vision of using AI agents to automate critical IT tasks depends on the ability to measure and understand effectiveness of proposed solutions. We introduce ITBench, a framework that offers a systematic methodology for benchmarking AI agents to address real-world IT automation tasks. Our initial release targets three key areas: Site Reliability Engineering (SRE), Compliance and Securit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05352v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05352v1-abstract-full" style="display: none;"> Realizing the vision of using AI agents to automate critical IT tasks depends on the ability to measure and understand effectiveness of proposed solutions. We introduce ITBench, a framework that offers a systematic methodology for benchmarking AI agents to address real-world IT automation tasks. Our initial release targets three key areas: Site Reliability Engineering (SRE), Compliance and Security Operations (CISO), and Financial Operations (FinOps). The design enables AI researchers to understand the challenges and opportunities of AI agents for IT automation with push-button workflows and interpretable metrics. ITBench includes an initial set of 94 real-world scenarios, which can be easily extended by community contributions. Our results show that agents powered by state-of-the-art models resolve only 13.8% of SRE scenarios, 25.2% of CISO scenarios, and 0% of FinOps scenarios. We expect ITBench to be a key enabler of AI-driven IT automation that is correct, safe, and fast. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05352v1-abstract-full').style.display = 'none'; document.getElementById('2502.05352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
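</p> <p class="is-size-7"> In the spirit of the push-button workflows and interpretable metrics described above, a benchmark loop reduces to running an agent over scenarios and reporting a resolution rate. The interfaces below (<code>agent.attempt</code>, <code>.resolved</code>) are hypothetical placeholders, not ITBench's actual API. </p> <pre><code>def resolution_rate(agent, scenarios):
    """Fraction of scenarios the agent fully resolves (hypothetical API)."""
    resolved = sum(1 for sc in scenarios if agent.attempt(sc).resolved)
    return resolved / len(scenarios)

# For scale: the paper reports 13.8% (SRE), 25.2% (CISO), and 0% (FinOps)
# for agents powered by state-of-the-art models over 94 initial scenarios.
</code></pre> <p class="is-size-7">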
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05339">arXiv:2502.05339</a> <span> [<a href="https://arxiv.org/pdf/2502.05339">pdf</a>, <a href="https://arxiv.org/format/2502.05339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Fast Subspace Fluid Simulation with Temporal-Aware Basis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yixin Chen</a>, <a href="/search/cs?searchtype=author&query=Benchekroun%2C+O">Otman Benchekroun</a>, <a href="/search/cs?searchtype=author&query=Panuelos%2C+J">Jonathan Panuelos</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yue Chang</a>, <a href="/search/cs?searchtype=author&query=Grinspun%2C+E">Eitan Grinspun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhecheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05339v1-abstract-short" style="display: inline;"> We present a novel reduced-order fluid simulation technique leveraging Dynamic Mode Decomposition (DMD) to achieve fast, memory-efficient, and user-controllable subspace simulation. We demonstrate that our approach combines the strengths of both spatial reduced order models (ROMs) as well as spectral decompositions. By optimizing for the operator that evolves a system state from one timestep to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05339v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05339v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05339v1-abstract-full" style="display: none;"> We present a novel reduced-order fluid simulation technique leveraging Dynamic Mode Decomposition (DMD) to achieve fast, memory-efficient, and user-controllable subspace simulation. We demonstrate that our approach combines the strengths of both spatial reduced order models (ROMs) as well as spectral decompositions. By optimizing for the operator that evolves a system state from one timestep to the next, rather than the system state itself, we gain both the compressive power of spatial ROMs as well as the intuitive physical dynamics of spectral methods. The latter property is of particular interest in graphics applications, where user control of fluid phenomena is in high demand. We demonstrate this in various applications including spatial and temporal modulation tools and fluid upscaling with added turbulence. We adapt DMD for graphics applications by reducing computational overhead, incorporating user-defined force inputs, and optimizing memory usage with randomized SVD. The integration of OptDMD and DMD with Control (DMDc) facilitates noise-robust reconstruction and real-time user interaction. We demonstrate the technique's robustness across diverse simulation scenarios, including artistic editing, time-reversal, and super-resolution.
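</span> <p class="is-size-7"> Since the abstract centers on fitting the operator that advances the state one timestep, a minimal exact-DMD sketch with a truncated randomized SVD (for memory efficiency) looks like the following. It illustrates the general DMD technique, not the paper's graphics-specific pipeline; variable names are generic. </p> <pre><code>import numpy as np
from sklearn.utils.extmath import randomized_svd

def fit_dmd(snapshots, rank):
    """Exact DMD: fit the operator A with X' ~= A X from state snapshots."""
    X, Y = snapshots[:, :-1], snapshots[:, 1:]       # (x_t, x_{t+1}) pairs
    U, s, Vt = randomized_svd(X, n_components=rank)  # memory-friendly basis
    # One-step operator projected into the rank-r subspace.
    A_tilde = U.T @ Y @ Vt.T @ np.diag(1.0 / s)
    eigvals, W = np.linalg.eig(A_tilde)              # mode frequencies/growth
    modes = Y @ Vt.T @ np.diag(1.0 / s) @ W          # spatial DMD modes
    return A_tilde, eigvals, modes
</code></pre> <span class="abstract-full has-text-grey-dark mathjax" style="display: none;">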
Through experimental validation on challenging scenarios, such as colliding vortex rings and boundary-interacting plumes, our method also exhibits superior performance and fidelity with significantly fewer basis functions compared to existing spatial ROMs. The inherent linearity of the DMD operator enables unique application modes, such as time-reversible fluid simulation. This work establishes another avenue for developing real-time, high-quality fluid simulations, enriching the space of fluid simulation techniques in interactive graphics and animation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05339v1-abstract-full').style.display = 'none'; document.getElementById('2502.05339v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05282">arXiv:2502.05282</a> <span> [<a href="https://arxiv.org/pdf/2502.05282">pdf</a>, <a href="https://arxiv.org/format/2502.05282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Homeomorphism Prior for False Positive and Negative Problem in Medical Image Dense Contrastive Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuting He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+R">Rongjun Ge</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05282v1-abstract-short" style="display: inline;"> Dense contrastive representation learning (DCRL) has greatly improved the learning efficiency for image-dense prediction tasks, showing its great potential to reduce the large costs of medical image collection and dense annotation. However, the properties of medical images make correspondence discovery unreliable, bringing an open problem of large-scale false positive and negative (FP&N) pairs in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05282v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05282v1-abstract-full" style="display: none;"> Dense contrastive representation learning (DCRL) has greatly improved the learning efficiency for image-dense prediction tasks, showing its great potential to reduce the large costs of medical image collection and dense annotation. 
However, the properties of medical images make correspondence discovery unreliable, bringing an open problem of large-scale false positive and negative (FP&N) pairs in DCRL. In this paper, we propose GEoMetric vIsual deNse sImilarity (GEMINI) learning, which embeds the homeomorphism prior into DCRL and enables reliable correspondence discovery for effective dense contrast. We propose a deformable homeomorphism learning (DHL) which models the homeomorphism of medical images and learns to estimate a deformable mapping to predict the pixels' correspondence under topological preservation. It effectively reduces the search space of pairing and drives an implicit and soft learning of negative pairs via a gradient. We also propose a geometric semantic similarity (GSS) which extracts semantic information from features to measure the alignment degree for correspondence learning. It promotes the learning efficiency and performance of the deformation, reliably constructing positive pairs. We implement two practical variants on two typical representation learning tasks in our experiments. Our promising results on seven datasets, where we outperform existing methods, demonstrate our approach's superiority. We will release our code at a companion link: https://github.com/YutingHe-list/GEMINI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05282v1-abstract-full').style.display = 'none'; document.getElementById('2502.05282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by T-PAMI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05255">arXiv:2502.05255</a> <span> [<a href="https://arxiv.org/pdf/2502.05255">pdf</a>, <a href="https://arxiv.org/format/2502.05255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> Incivility and Contentiousness Spillover between COVID-19 and Climate Science Engagement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Narimanzadeh%2C+H">Hasti Narimanzadeh</a>, <a href="/search/cs?searchtype=author&query=Badie-Modiri%2C+A">Arash Badie-Modiri</a>, <a href="/search/cs?searchtype=author&query=Smirnova%2C+I">Iuliia Smirnova</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T+H+Y">Ted Hsuan Yun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05255v1-abstract-short" style="display: inline;"> Affective polarization and its accompanying cleavage-based sorting drive incivility and contentiousness around climate change and other science-related issues.
Looking at the COVID-19 period, we study cross-domain spillover of incivility and contentiousness in public engagements with climate change and climate science on Twitter and Reddit. We find strong evidence of the signatures of affective p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05255v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05255v1-abstract-full" style="display: none;"> Affective polarization and its accompanying cleavage-based sorting drive incivility and contentiousness around climate change and other science-related issues. Looking at the COVID-19 period, we study cross-domain spillover of incivility and contentiousness in public engagements with climate change and climate science on Twitter and Reddit. We find strong evidence of the signatures of affective polarization surrounding COVID-19 spilling into the climate change domain. Across different social media systems, COVID-19 content is associated with incivility and contentiousness in climate discussions. These patterns of increased antagonism were responsive to pandemic events that made the link between science and public policy more salient. We also show that the observed spillover activated along pre-pandemic political cleavages, specifically anti-internationalist populist beliefs, that linked climate policy opposition to vaccine hesitancy. Our findings highlight the dangers of entrenched cross-domain polarization manifesting as spillover of antagonistic behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05255v1-abstract-full').style.display = 'none'; document.getElementById('2502.05255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05226">arXiv:2502.05226</a> <span> [<a href="https://arxiv.org/pdf/2502.05226">pdf</a>, <a href="https://arxiv.org/format/2502.05226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking of Quantum and Classical Computing in Large-Scale Dynamic Portfolio Optimization Under Market Frictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+T">Thorsten Koch</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hanqiu Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongrui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05226v1-abstract-short" style="display: inline;"> Quantum computing is poised to transform the financial industry, yet its advantages over traditional methods have yet to be demonstrated. As this technology rapidly evolves, benchmarking is essential to fairly evaluate and compare different computational strategies. This study presents a challenging yet solvable problem of large-scale dynamic portfolio optimization under realistic market conditions wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05226v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05226v1-abstract-full" style="display: none;"> Quantum computing is poised to transform the financial industry, yet its advantages over traditional methods have yet to be demonstrated. As this technology rapidly evolves, benchmarking is essential to fairly evaluate and compare different computational strategies. This study presents a challenging yet solvable problem of large-scale dynamic portfolio optimization under realistic market conditions with frictions. We frame this issue as a Quadratic Unconstrained Binary Optimization (QUBO) problem, compatible with digital computing and ready for quantum computing, to establish a reliable benchmark. By applying the latest solvers to real data, we release benchmarks that help verify true advancements in dynamic trading strategies, whether from quantum or digital computing, ensuring that reported improvements in portfolio optimization are based on robust, transparent, and comparable metrics.
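</span> <p class="is-size-7"> To make the QUBO framing above concrete, here is a toy construction for binary asset selection with a turnover (friction) penalty. The terms and weights are illustrative assumptions, not the paper's model. </p> <pre><code>import numpy as np

def portfolio_qubo(mu, cov, prev, lam_risk=1.0, lam_cost=0.1):
    """Toy QUBO for binary asset selection: minimize x^T Q x, x in {0,1}^n.

    mu   : expected returns, shape (n,)
    cov  : return covariance, shape (n, n)
    prev : previous binary holdings, shape (n,), for a friction penalty
    """
    n = len(mu)
    Q = lam_risk * np.asarray(cov, dtype=float)  # quadratic risk term
    idx = np.diag_indices(n)
    Q[idx] -= mu                                 # reward expected return
    # Turnover cost: (x_i - prev_i)^2 = x_i * (1 - 2*prev_i) + const for binary x_i.
    Q[idx] += lam_cost * (1.0 - 2.0 * np.asarray(prev, dtype=float))
    return Q
</code></pre> <span class="abstract-full has-text-grey-dark mathjax" style="display: none;">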
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05226v1-abstract-full').style.display = 'none'; document.getElementById('2502.05226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 9 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05206">arXiv:2502.05206</a> <span> [<a href="https://arxiv.org/pdf/2502.05206">pdf</a>, <a href="https://arxiv.org/format/2502.05206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Safety at Scale: A Comprehensive Survey of Large Model Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifeng Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixu Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruofan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Ye Sun</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hengyuan Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunhao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunhan Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hanxun Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yang Bai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jun Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianwei Zhang</a> , et al. 
(19 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05206v2-abstract-short" style="display: inline;"> The rapid advancement of large models, driven by their exceptional abilities in learning and generalization through large-scale pre-training, has reshaped the landscape of Artificial Intelligence (AI). These models are now foundational to a wide range of applications, including conversational AI, recommendation systems, autonomous driving, content generation, medical diagnostics, and scientific di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05206v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05206v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05206v2-abstract-full" style="display: none;"> The rapid advancement of large models, driven by their exceptional abilities in learning and generalization through large-scale pre-training, has reshaped the landscape of Artificial Intelligence (AI). These models are now foundational to a wide range of applications, including conversational AI, recommendation systems, autonomous driving, content generation, medical diagnostics, and scientific discovery. However, their widespread deployment also exposes them to significant safety risks, raising concerns about robustness, reliability, and ethical implications. This survey provides a systematic review of current safety research on large models, covering Vision Foundation Models (VFMs), Large Language Models (LLMs), Vision-Language Pre-training (VLP) models, Vision-Language Models (VLMs), Diffusion Models (DMs), and large-model-based Agents. Our contributions are summarized as follows: (1) We present a comprehensive taxonomy of safety threats to these models, including adversarial attacks, data poisoning, backdoor attacks, jailbreak and prompt injection attacks, energy-latency attacks, data and model extraction attacks, and emerging agent-specific threats. (2) We review the defense strategies proposed for each type of attack, where available, and summarize the commonly used datasets and benchmarks for safety research. (3) Building on this, we identify and discuss the open challenges in large model safety, emphasizing the need for comprehensive safety evaluations, scalable and effective defense mechanisms, and sustainable data practices. More importantly, we highlight the necessity of collective efforts from the research community and international collaboration. Our work can serve as a useful reference for researchers and practitioners, fostering the ongoing development of comprehensive defense systems and platforms to safeguard AI models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05206v2-abstract-full').style.display = 'none'; document.getElementById('2502.05206v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
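</p> <p class="is-size-7"> As standard background for one threat class named in the taxonomy above (adversarial attacks), the classic fast gradient sign method fits in a few lines; it is textbook material, not a method from the survey itself. </p> <pre><code>import torch
import torch.nn.functional as F

def fgsm(model, x, y, eps=0.03):
    """Fast gradient sign method: a one-step adversarial perturbation."""
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    loss.backward()
    # Step in the direction that increases the loss, clipped to valid pixels.
    return (x + eps * x.grad.sign()).clamp(0.0, 1.0).detach()
</code></pre> <p class="is-size-7">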
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">47 pages, 3 figures, 11 tables GitHub: https://github.com/xingjunm/Awesome-Large-Model-Safety</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05176">arXiv:2502.05176</a> <span> [<a href="https://arxiv.org/pdf/2502.05176">pdf</a>, <a href="https://arxiv.org/format/2502.05176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AuraFusion360: Augmented Unseen Region Alignment for Reference-based 360° Unbounded Scene Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chung-Ho Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang-Jung Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Huan Chen</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jie-Ying Lee</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+B">Bo-Hsu Ke</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+C+T">Chun-Wei Tuan Mu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yi-Chuan Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chin-Yang Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Min-Hung Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yen-Yu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Lun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05176v1-abstract-short" style="display: inline;"> Three-dimensional scene inpainting is crucial for applications from virtual reality to architectural visualization, yet existing methods struggle with view consistency and geometric accuracy in 360° unbounded scenes. We present AuraFusion360, a novel reference-based method that enables high-quality object removal and hole filling in 3D scenes represented by Gaussian Splatting. Our approach introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05176v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05176v1-abstract-full" style="display: none;"> Three-dimensional scene inpainting is crucial for applications from virtual reality to architectural visualization, yet existing methods struggle with view consistency and geometric accuracy in 360° unbounded scenes. We present AuraFusion360, a novel reference-based method that enables high-quality object removal and hole filling in 3D scenes represented by Gaussian Splatting. Our approach introduces (1) depth-aware unseen mask generation for accurate occlusion identification, (2) Adaptive Guided Depth Diffusion, a zero-shot method for accurate initial point placement without requiring additional training, and (3) SDEdit-based detail enhancement for multi-view coherence. We also introduce 360-USID, the first comprehensive dataset for 360° unbounded scene inpainting with ground truth.
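</span> <p class="is-size-7"> One plausible reading of the depth-aware unseen-mask idea above, offered purely as an illustration and not as the paper's actual procedure: render depth with and without the removed object, and flag pixels whose depth increases as newly revealed. </p> <pre><code>import numpy as np

def unseen_mask(depth_with_object, depth_removed, tau=1e-2):
    """Illustrative guess at a depth-aware unseen mask: pixels whose depth
    grows after object removal were occluded before, i.e. newly 'unseen'."""
    return (depth_removed - depth_with_object) > tau
</code></pre> <span class="abstract-full has-text-grey-dark mathjax" style="display: none;">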
Extensive experiments demonstrate that AuraFusion360 significantly outperforms existing methods, achieving superior perceptual quality while maintaining geometric accuracy across dramatic viewpoint changes. See our project page for video results and the dataset at https://kkennethwu.github.io/aurafusion360/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05176v1-abstract-full').style.display = 'none'; document.getElementById('2502.05176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://kkennethwu.github.io/aurafusion360/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04734">arXiv:2502.04734</a> <span> [<a href="https://arxiv.org/pdf/2502.04734">pdf</a>, <a href="https://arxiv.org/format/2502.04734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> SC-OmniGS: Self-Calibrating Omnidirectional Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huajian Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingshu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Longwei Li</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hui Cheng</a>, <a href="/search/cs?searchtype=author&query=Braud%2C+T">Tristan Braud</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yajie Zhao</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+S">Sai-Kit Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04734v1-abstract-short" style="display: inline;"> 360-degree cameras streamline data collection for radiance field 3D reconstruction by capturing comprehensive scene data. However, traditional radiance field methods do not address the specific challenges inherent to 360-degree images. We present SC-OmniGS, a novel self-calibrating omnidirectional Gaussian splatting system for fast and accurate omnidirectional radiance field reconstruction using 3… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04734v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04734v1-abstract-full" style="display: none;"> 360-degree cameras streamline data collection for radiance field 3D reconstruction by capturing comprehensive scene data. However, traditional radiance field methods do not address the specific challenges inherent to 360-degree images. 
We present SC-OmniGS, a novel self-calibrating omnidirectional Gaussian splatting system for fast and accurate omnidirectional radiance field reconstruction using 360-degree images. Rather than converting 360-degree images to cube maps and performing perspective image calibration, we treat 360-degree images as a whole sphere and derive a mathematical framework that enables direct omnidirectional camera pose calibration accompanied by optimization of the 3D Gaussians. Furthermore, we introduce a differentiable omnidirectional camera model that rectifies the distortion of real-world data to improve performance. Overall, the omnidirectional camera intrinsic model, extrinsic poses, and 3D Gaussians are jointly optimized by minimizing a weighted spherical photometric loss. Extensive experiments demonstrate that SC-OmniGS recovers a high-quality radiance field from noisy camera poses, or even no pose prior, in challenging scenarios characterized by wide baselines and non-object-centric configurations. The noticeable performance gain on the real-world dataset captured by consumer-grade omnidirectional cameras verifies the effectiveness of our general omnidirectional camera model in reducing the distortion of 360-degree images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04734v1-abstract-full').style.display = 'none'; document.getElementById('2502.04734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2025, Project Page: http://www.chenyingshu.com/sc-omnigs/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04692">arXiv:2502.04692</a> <span> [<a href="https://arxiv.org/pdf/2502.04692">pdf</a>, <a href="https://arxiv.org/ps/2502.04692">ps</a>, <a href="https://arxiv.org/format/2502.04692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> STRIDE: Automating Reward Design, Deep Reinforcement Learning Training and Feedback Optimization in Humanoid Robotics Locomotion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenwei Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jinxiong Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunxin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Luhui Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04692v3-abstract-short" style="display: inline;"> Humanoid robotics presents significant challenges in artificial intelligence, requiring precise coordination and control of
high-degree-of-freedom systems. Designing effective reward functions for deep reinforcement learning (DRL) in this domain remains a critical bottleneck, demanding extensive manual effort, domain expertise, and iterative refinement. To overcome these challenges, we introduce S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04692v3-abstract-full').style.display = 'inline'; document.getElementById('2502.04692v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04692v3-abstract-full" style="display: none;"> Humanoid robotics presents significant challenges in artificial intelligence, requiring precise coordination and control of high-degree-of-freedom systems. Designing effective reward functions for deep reinforcement learning (DRL) in this domain remains a critical bottleneck, demanding extensive manual effort, domain expertise, and iterative refinement. To overcome these challenges, we introduce STRIDE, a novel framework built on agentic engineering to automate reward design, DRL training, and feedback optimization for humanoid robot locomotion tasks. By combining the structured principles of agentic engineering with large language models (LLMs) for code-writing, zero-shot generation, and in-context optimization, STRIDE generates, evaluates, and iteratively refines reward functions without relying on task-specific prompts or templates. Across diverse environments featuring humanoid robot morphologies, STRIDE outperforms the state-of-the-art reward design framework EUREKA, achieving an average improvement of around 250% in efficiency and task performance. Using STRIDE-generated rewards, simulated humanoid robots achieve sprint-level locomotion across complex terrains, highlighting its ability to advance DRL workflows and humanoid robotics research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04692v3-abstract-full').style.display = 'none'; document.getElementById('2502.04692v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
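</p> <p class="is-size-7"> A heavily simplified sketch of the generate-evaluate-refine loop the abstract describes; <code>llm</code> and <code>env.train_and_eval</code> are hypothetical callables standing in for STRIDE's agentic pipeline. </p> <pre><code>def refine_reward(llm, env, prompt, iters=5):
    """Toy loop: generate reward code, train/evaluate, feed the score back."""
    best_code, best_score = None, float("-inf")
    for _ in range(iters):
        code = llm(prompt)                 # zero-shot reward-function code
        score = env.train_and_eval(code)   # DRL training plus task metric
        if score > best_score:
            best_code, best_score = code, score
        # In-context optimization: append feedback so the next draft improves.
        prompt += f"\n# feedback: score={score:.3f}; improve the reward.\n"
    return best_code
</code></pre> <p class="is-size-7">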
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04575">arXiv:2502.04575</a> <span> [<a href="https://arxiv.org/pdf/2502.04575">pdf</a>, <a href="https://arxiv.org/ps/2502.04575">ps</a>, <a href="https://arxiv.org/format/2502.04575">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> </div> </div> <p class="title is-5 mathjax"> Complexity Analysis of Normalizing Constant Estimation: from Jarzynski Equality to Annealed Importance Sampling and beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+W">Wei Guo</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+M">Molei Tao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongxin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04575v1-abstract-short" style="display: inline;"> Given an unnormalized probability density $\pi\propto\mathrm{e}^{-V}$, estimating its normalizing constant $Z=\int_{\mathbb{R}^d}\mathrm{e}^{-V(x)}\mathrm{d}x$ or free energy $F=-\log Z$ is a crucial problem in Bayesian statistics, statistical mechanics, and machine learning. It is challenging especially in high dimensions or when $\pi$ is multimodal. To mitigate the high variance of conventional impo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04575v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04575v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04575v1-abstract-full" style="display: none;"> Given an unnormalized probability density $\pi\propto\mathrm{e}^{-V}$, estimating its normalizing constant $Z=\int_{\mathbb{R}^d}\mathrm{e}^{-V(x)}\mathrm{d}x$ or free energy $F=-\log Z$ is a crucial problem in Bayesian statistics, statistical mechanics, and machine learning. It is challenging especially in high dimensions or when $\pi$ is multimodal. To mitigate the high variance of conventional importance sampling estimators, annealing-based methods such as Jarzynski equality and annealed importance sampling are commonly adopted, yet their quantitative complexity guarantees remain largely unexplored. We take a first step toward a non-asymptotic analysis of annealed importance sampling. In particular, we derive an oracle complexity of $\widetilde{O}\left(\frac{d\beta^2{\mathcal{A}}^2}{\varepsilon^4}\right)$ for estimating $Z$ within $\varepsilon$ relative error with high probability, where $\beta$ is the smoothness of $V$ and $\mathcal{A}$ denotes the action of a curve of probability measures interpolating $\pi$ and a tractable reference distribution. Our analysis, leveraging Girsanov theorem and optimal transport, does not explicitly require isoperimetric assumptions on the target distribution.
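</span> <p class="is-size-7"> For orientation, a bare-bones annealed importance sampling estimator of $Z$ can be written as follows, using a standard Gaussian reference and unadjusted Langevin transitions; these are illustrative choices, not the setup analyzed in the paper. </p> <pre><code>import numpy as np

def ais_log_Z(V, grad_V, d, n=1000, K=100, h=1e-2):
    """Annealed importance sampling estimate of log Z, Z = integral of exp(-V).

    V, grad_V : potential and its gradient, vectorized over rows of x.
    Anneals exp(-V_b), V_b = (1-b)*V0 + b*V, from N(0, I) to the target.
    """
    betas = np.linspace(0.0, 1.0, K + 1)
    x = np.random.randn(n, d)                    # samples from the reference
    logw = np.zeros(n)
    V0 = lambda z: 0.5 * np.sum(z ** 2, axis=1)  # reference potential
    for b_prev, b in zip(betas[:-1], betas[1:]):
        # Importance-weight increment as the anneal moves b_prev to b.
        logw += ((1 - b_prev) * V0(x) + b_prev * V(x)) \
              - ((1 - b) * V0(x) + b * V(x))
        # Unadjusted Langevin step targeting exp(-V_b).
        g = (1 - b) * x + b * grad_V(x)
        x = x - h * g + np.sqrt(2 * h) * np.random.randn(n, d)
    # Z = Z0 * E[exp(log w)], with Z0 = (2*pi)^(d/2) for the Gaussian reference.
    return 0.5 * d * np.log(2 * np.pi) + np.log(np.mean(np.exp(logw)))
</code></pre> <span class="abstract-full has-text-grey-dark mathjax" style="display: none;">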
Finally, to tackle the large action of the widely used geometric interpolation of probability distributions, we propose a new normalizing constant estimation algorithm based on reverse diffusion samplers and establish a framework for analyzing its complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04575v1-abstract-full').style.display = 'none'; document.getElementById('2502.04575v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=50" class="pagination-next">Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=50" class="pagination-link " aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=100" class="pagination-link " aria-label="Page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=150" class="pagination-link " aria-label="Page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+Y&start=200" class="pagination-link " aria-label="Page 5">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> </div> </main>
</body> </html>