Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 337 results for author: <span class="mathjax">Yin, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yin%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yin, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yin%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yin, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07343">arXiv:2502.07343</a> <span> [<a href="https://arxiv.org/pdf/2502.07343">pdf</a>, <a href="https://arxiv.org/format/2502.07343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> DEG: Efficient Hybrid Vector Search Using the Dynamic Edge Navigation Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziqi Yin</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jianyang Gao</a>, <a href="/search/cs?searchtype=author&query=Balsebre%2C+P">Pasquale Balsebre</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+G">Gao Cong</a>, <a href="/search/cs?searchtype=author&query=Long%2C+C">Cheng Long</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07343v1-abstract-short" style="display: inline;"> Bimodal data, such as image-text pairs, has become increasingly prevalent in the digital era. The Hybrid Vector Query (HVQ) is an effective approach for querying such data and has recently garnered considerable attention from researchers. It calculates similarity scores for objects represented by two vectors using a weighted sum of each individual vector's similarity, with a query-specific paramet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07343v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07343v1-abstract-full" style="display: none;"> Bimodal data, such as image-text pairs, has become increasingly prevalent in the digital era. The Hybrid Vector Query (HVQ) is an effective approach for querying such data and has recently garnered considerable attention from researchers. It calculates similarity scores for objects represented by two vectors using a weighted sum of each individual vector's similarity, with a query-specific parameter $伪$ to determine the weight. Existing methods for HVQ typically construct Approximate Nearest Neighbors Search (ANNS) indexes with a fixed $伪$ value. This leads to significant performance degradation when the query's $伪$ dynamically changes based on the different scenarios and needs. In this study, we introduce the Dynamic Edge Navigation Graph (DEG), a graph-based ANNS index that maintains efficiency and accuracy with changing $伪$ values. It includes three novel components: (1) a greedy Pareto frontier search algorithm to compute a candidate neighbor set for each node, which comprises the node's approximate nearest neighbors for all possible $伪$ values; (2) a dynamic edge pruning strategy to determine the final edges from the candidate set and assign each edge an active range. This active range enables the dynamic use of the Relative Neighborhood Graph's pruning strategy based on the query's $伪$ values, skipping redundant edges at query time and achieving a better accuracy-efficiency trade-off; and (3) an edge seed method that accelerates the querying process. Extensive experiments on real-world datasets show that DEG demonstrates superior performance compared to existing methods under varying $伪$ values. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07343v1-abstract-full').style.display = 'none'; document.getElementById('2502.07343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by sigmod 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06868">arXiv:2502.06868</a> <span> [<a href="https://arxiv.org/pdf/2502.06868">pdf</a>, <a href="https://arxiv.org/format/2502.06868">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Related Knowledge Perturbation Matters: Rethinking Multiple Pieces of Knowledge Editing in Same-Subject </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+Z">Zenghao Duan</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+W">Wenbin Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhiyi Yin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yinghan Shen</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+S">Shaoling Jing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Huawei Shen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06868v1-abstract-short" style="display: inline;"> Knowledge editing has become a promising approach for efficiently and precisely updating knowledge embedded in large language models (LLMs). In this work, we focus on Same-Subject Editing, which involves modifying multiple attributes of a single entity to ensure comprehensive and consistent updates to entity-centric knowledge. Through preliminary observation, we identify a significant challenge: C… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06868v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06868v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06868v1-abstract-full" style="display: none;"> Knowledge editing has become a promising approach for efficiently and precisely updating knowledge embedded in large language models (LLMs). In this work, we focus on Same-Subject Editing, which involves modifying multiple attributes of a single entity to ensure comprehensive and consistent updates to entity-centric knowledge. Through preliminary observation, we identify a significant challenge: Current state-of-the-art editing methods struggle when tasked with editing multiple related knowledge pieces for the same subject. To address the lack of relevant editing data for identical subjects in traditional benchmarks, we introduce the $\text{S}^2\text{RKE}$(Same-Subject Related Knowledge Editing) benchmark. Our extensive experiments reveal that only mainstream locate-then-edit methods, such as ROME and MEMIT, exhibit "related knowledge perturbation," where subsequent edits interfere with earlier ones. Further analysis reveals that these methods over-rely on subject information, neglecting other critical factors, resulting in reduced editing effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06868v1-abstract-full').style.display = 'none'; document.getElementById('2502.06868v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05690">arXiv:2502.05690</a> <span> [<a href="https://arxiv.org/pdf/2502.05690">pdf</a>, <a href="https://arxiv.org/format/2502.05690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> </div> </div> <p class="title is-5 mathjax"> Managing Geological Uncertainty in Critical Mineral Supply Chains: A POMDP Approach with Application to U.S. Lithium Resources </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arief%2C+M">Mansur Arief</a>, <a href="/search/cs?searchtype=author&query=Alonso%2C+Y">Yasmine Alonso</a>, <a href="/search/cs?searchtype=author&query=Oshiro%2C+C">CJ Oshiro</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">William Xu</a>, <a href="/search/cs?searchtype=author&query=Corso%2C+A">Anthony Corso</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D+Z">David Zhen Yin</a>, <a href="/search/cs?searchtype=author&query=Caers%2C+J+K">Jef K. Caers</a>, <a href="/search/cs?searchtype=author&query=Kochenderfer%2C+M+J">Mykel J. Kochenderfer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05690v1-abstract-short" style="display: inline;"> The world is entering an unprecedented period of critical mineral demand, driven by the global transition to renewable energy technologies and electric vehicles. This transition presents unique challenges in mineral resource development, particularly due to geological uncertainty-a key characteristic that traditional supply chain optimization approaches do not adequately address. To tackle this ch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05690v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05690v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05690v1-abstract-full" style="display: none;"> The world is entering an unprecedented period of critical mineral demand, driven by the global transition to renewable energy technologies and electric vehicles. This transition presents unique challenges in mineral resource development, particularly due to geological uncertainty-a key characteristic that traditional supply chain optimization approaches do not adequately address. To tackle this challenge, we propose a novel application of Partially Observable Markov Decision Processes (POMDPs) that optimizes critical mineral sourcing decisions while explicitly accounting for the dynamic nature of geological uncertainty. Through a case study of the U.S. lithium supply chain, we demonstrate that POMDP-based policies achieve superior outcomes compared to traditional approaches, especially when initial reserve estimates are imperfect. Our framework provides quantitative insights for balancing domestic resource development with international supply diversification, offering policymakers a systematic approach to strategic decision-making in critical mineral supply chains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05690v1-abstract-full').style.display = 'none'; document.getElementById('2502.05690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05482">arXiv:2502.05482</a> <span> [<a href="https://arxiv.org/pdf/2502.05482">pdf</a>, <a href="https://arxiv.org/format/2502.05482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robustifying Fourier Features Embeddings for Implicit Neural Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+M">Mingze Ma</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qingtian Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yifan Zhan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhengwei Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongjun Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yinqiang Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05482v1-abstract-short" style="display: inline;"> Implicit Neural Representations (INRs) employ neural networks to represent continuous functions by mapping coordinates to the corresponding values of the target function, with applications e.g., inverse graphics. However, INRs face a challenge known as spectral bias when dealing with scenes containing varying frequencies. To overcome spectral bias, the most common approach is the Fourier features-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05482v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05482v1-abstract-full" style="display: none;"> Implicit Neural Representations (INRs) employ neural networks to represent continuous functions by mapping coordinates to the corresponding values of the target function, with applications e.g., inverse graphics. However, INRs face a challenge known as spectral bias when dealing with scenes containing varying frequencies. To overcome spectral bias, the most common approach is the Fourier features-based methods such as positional encoding. However, Fourier features-based methods will introduce noise to output, which degrades their performances when applied to downstream tasks. In response, this paper initially hypothesizes that combining multi-layer perceptrons (MLPs) with Fourier feature embeddings mutually enhances their strengths, yet simultaneously introduces limitations inherent in Fourier feature embeddings. By presenting a simple theorem, we validate our hypothesis, which serves as a foundation for the design of our solution. Leveraging these insights, we propose the use of multi-layer perceptrons (MLPs) without additive <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05482v1-abstract-full').style.display = 'none'; document.getElementById('2502.05482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04307">arXiv:2502.04307</a> <span> [<a href="https://arxiv.org/pdf/2502.04307">pdf</a>, <a href="https://arxiv.org/format/2502.04307">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> DexterityGen: Foundation Controller for Unprecedented Dexterity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhao-Heng Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Changhao Wang</a>, <a href="/search/cs?searchtype=author&query=Pineda%2C+L">Luis Pineda</a>, <a href="/search/cs?searchtype=author&query=Hogan%2C+F">Francois Hogan</a>, <a href="/search/cs?searchtype=author&query=Bodduluri%2C+K">Krishna Bodduluri</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+A">Akash Sharma</a>, <a href="/search/cs?searchtype=author&query=Lancaster%2C+P">Patrick Lancaster</a>, <a href="/search/cs?searchtype=author&query=Prasad%2C+I">Ishita Prasad</a>, <a href="/search/cs?searchtype=author&query=Kalakrishnan%2C+M">Mrinal Kalakrishnan</a>, <a href="/search/cs?searchtype=author&query=Malik%2C+J">Jitendra Malik</a>, <a href="/search/cs?searchtype=author&query=Lambeta%2C+M">Mike Lambeta</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tingfan Wu</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Mukadam%2C+M">Mustafa Mukadam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04307v1-abstract-short" style="display: inline;"> Teaching robots dexterous manipulation skills, such as tool use, presents a significant challenge. Current approaches can be broadly categorized into two strategies: human teleoperation (for imitation learning) and sim-to-real reinforcement learning. The first approach is difficult as it is hard for humans to produce safe and dexterous motions on a different embodiment without touch feedback. The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04307v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04307v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04307v1-abstract-full" style="display: none;"> Teaching robots dexterous manipulation skills, such as tool use, presents a significant challenge. Current approaches can be broadly categorized into two strategies: human teleoperation (for imitation learning) and sim-to-real reinforcement learning. The first approach is difficult as it is hard for humans to produce safe and dexterous motions on a different embodiment without touch feedback. The second RL-based approach struggles with the domain gap and involves highly task-specific reward engineering on complex tasks. Our key insight is that RL is effective at learning low-level motion primitives, while humans excel at providing coarse motion commands for complex, long-horizon tasks. Therefore, the optimal solution might be a combination of both approaches. In this paper, we introduce DexterityGen (DexGen), which uses RL to pretrain large-scale dexterous motion primitives, such as in-hand rotation or translation. We then leverage this learned dataset to train a dexterous foundational controller. In the real world, we use human teleoperation as a prompt to the controller to produce highly dexterous behavior. We evaluate the effectiveness of DexGen in both simulation and real world, demonstrating that it is a general-purpose controller that can realize input dexterous manipulation commands and significantly improves stability by 10-100x measured as duration of holding objects across diverse tasks. Notably, with DexGen we demonstrate unprecedented dexterous skills including diverse object reorientation and dexterous tool use such as pen, syringe, and screwdriver for the first time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04307v1-abstract-full').style.display = 'none'; document.getElementById('2502.04307v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project: https://zhaohengyin.github.io/dexteritygen</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00653">arXiv:2502.00653</a> <span> [<a href="https://arxiv.org/pdf/2502.00653">pdf</a>, <a href="https://arxiv.org/format/2502.00653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Multimodal Large Language Models Against Jailbreak Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziyi Yin</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuanpu Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Han Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinghui Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fenhlong Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00653v1-abstract-short" style="display: inline;"> While multimodal large language models (MLLMs) have achieved remarkable success in recent advancements, their susceptibility to jailbreak attacks has come to light. In such attacks, adversaries exploit carefully crafted prompts to coerce models into generating harmful or undesirable content. Existing defense mechanisms often rely on external inference steps or safety alignment training, both of wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00653v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00653v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00653v1-abstract-full" style="display: none;"> While multimodal large language models (MLLMs) have achieved remarkable success in recent advancements, their susceptibility to jailbreak attacks has come to light. In such attacks, adversaries exploit carefully crafted prompts to coerce models into generating harmful or undesirable content. Existing defense mechanisms often rely on external inference steps or safety alignment training, both of which are less effective and impractical when facing sophisticated adversarial perturbations in white-box scenarios. To address these challenges and bolster MLLM robustness, we introduce SafeMLLM by adopting an adversarial training framework that alternates between an attack step for generating adversarial noise and a model updating step. At the attack step, SafeMLLM generates adversarial perturbations through a newly proposed contrastive embedding attack (CoE-Attack), which optimizes token embeddings under a contrastive objective. SafeMLLM then updates model parameters to neutralize the perturbation effects while preserving model utility on benign inputs. We evaluate SafeMLLM across six MLLMs and six jailbreak methods spanning multiple modalities. Experimental results show that SafeMLLM effectively defends against diverse attacks, maintaining robust performance and utilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00653v1-abstract-full').style.display = 'none'; document.getElementById('2502.00653v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15581">arXiv:2501.15581</a> <span> [<a href="https://arxiv.org/pdf/2501.15581">pdf</a>, <a href="https://arxiv.org/format/2501.15581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Error Classification of Large Language Models on Math Word Problems: A Dynamically Adaptive Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuhong Sun</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhangyue Yin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hui Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15581v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable capabilities across various domains. Math Word Problems (MWPs) serve as a crucial benchmark for evaluating LLMs' reasoning abilities. While most research primarily focuses on improving accuracy, it often neglects understanding and addressing the underlying patterns of errors. Current error classification methods rely on static and predefine… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15581v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15581v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15581v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable capabilities across various domains. Math Word Problems (MWPs) serve as a crucial benchmark for evaluating LLMs' reasoning abilities. While most research primarily focuses on improving accuracy, it often neglects understanding and addressing the underlying patterns of errors. Current error classification methods rely on static and predefined categories, which limit their ability to capture the full spectrum of error patterns in mathematical reasoning. To enable systematic error analysis, we collect error samples from 15 different LLMs of varying sizes across four distinct MWP datasets using multiple sampling strategies. Based on this extensive collection, we introduce MWPES-300K, a comprehensive dataset containing 304,865 error samples that cover diverse error patterns and reasoning paths. To reduce human bias and enable fine-grained analysis of error patterns, we propose a novel framework for automated dynamic error classification in mathematical reasoning. Experimental results demonstrate that dataset characteristics significantly shape error patterns, which evolve from basic to complex manifestations as model capabilities increase. With deeper insights into error patterns, we propose error-aware prompting that incorporates common error patterns as explicit guidance, leading to significant improvements in mathematical reasoning performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15581v1-abstract-full').style.display = 'none'; document.getElementById('2501.15581v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12119">arXiv:2501.12119</a> <span> [<a href="https://arxiv.org/pdf/2501.12119">pdf</a>, <a href="https://arxiv.org/format/2501.12119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ENTIRE: Learning-based Volume Rendering Time Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zikai Yin</a>, <a href="/search/cs?searchtype=author&query=Gadirov%2C+H">Hamid Gadirov</a>, <a href="/search/cs?searchtype=author&query=Kosinka%2C+J">Jiri Kosinka</a>, <a href="/search/cs?searchtype=author&query=Frey%2C+S">Steffen Frey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12119v1-abstract-short" style="display: inline;"> We present ENTIRE, a novel approach for volume rendering time prediction. Time-dependent volume data from simulations or experiments typically comprise complex deforming structures across hundreds or thousands of time steps, which in addition to the camera configuration has a significant impact on rendering performance. We first extract a feature vector from a volume that captures its structure th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12119v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12119v1-abstract-full" style="display: none;"> We present ENTIRE, a novel approach for volume rendering time prediction. Time-dependent volume data from simulations or experiments typically comprise complex deforming structures across hundreds or thousands of time steps, which in addition to the camera configuration has a significant impact on rendering performance. We first extract a feature vector from a volume that captures its structure that is relevant for rendering time performance. Then we combine this feature vector with further relevant parameters (e.g. camera setup), and with this perform the final prediction. Our experiments conducted on various datasets demonstrate that our model is capable of efficiently achieving high prediction accuracy with fast response rates. We showcase ENTIRE's capability of enabling dynamic parameter adaptation for stable frame rates and load balancing in two case studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12119v1-abstract-full').style.display = 'none'; document.getElementById('2501.12119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07397">arXiv:2501.07397</a> <span> [<a href="https://arxiv.org/pdf/2501.07397">pdf</a>, <a href="https://arxiv.org/format/2501.07397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VDOR: A Video-based Dataset for Object Removal via Sequence Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+R">Runpu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhonghao Yan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zijin Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xueyi Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kongming Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhanyu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07397v2-abstract-short" style="display: inline;"> Object removal, as a sub-task of image inpainting, has garnered significant attention in recent years. Existing datasets related to object removal serve a valuable foundation for model validation and optimization. However, they mainly rely on inpainting techniques to generate pseudo-removed results, leading to distribution gaps between synthetic and real-world data. While some real-world datasets… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07397v2-abstract-full').style.display = 'inline'; document.getElementById('2501.07397v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07397v2-abstract-full" style="display: none;"> Object removal, as a sub-task of image inpainting, has garnered significant attention in recent years. Existing datasets related to object removal serve a valuable foundation for model validation and optimization. However, they mainly rely on inpainting techniques to generate pseudo-removed results, leading to distribution gaps between synthetic and real-world data. While some real-world datasets mitigate these issues, they face challenges such as limited scalability, high annotation costs, and unrealistic representations of lighting and shadows. To address these limitations, we propose a novel video-based annotation pipeline for constructing a realistic illumination-aware object removal dataset. Leveraging this pipeline, we introduce VDOR, a dataset specifically designed for object removal tasks, which comprises triplets of original frame images with objects, background images without objects, and corresponding masks. By leveraging continuous real-world video frames, we minimize distribution gaps and accurately capture realistic lighting and shadow variations, ensuring close alignment with real-world scenarios. Our approach significantly reduces annotation effort while providing a robust foundation for advancing object removal research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07397v2-abstract-full').style.display = 'none'; document.getElementById('2501.07397v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06692">arXiv:2501.06692</a> <span> [<a href="https://arxiv.org/pdf/2501.06692">pdf</a>, <a href="https://arxiv.org/format/2501.06692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PGP-SAM: Prototype-Guided Prompt Learning for Efficient Few-Shot Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhonghao Yan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zijin Yin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tianyu Lin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xiangzhu Zeng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kongming Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhanyu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06692v1-abstract-short" style="display: inline;"> The Segment Anything Model (SAM) has demonstrated strong and versatile segmentation capabilities, along with intuitive prompt-based interactions. However, customizing SAM for medical image segmentation requires massive amounts of pixel-level annotations and precise point- or box-based prompt designs. To address these challenges, we introduce PGP-SAM, a novel prototype-based few-shot tuning approac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06692v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06692v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06692v1-abstract-full" style="display: none;"> The Segment Anything Model (SAM) has demonstrated strong and versatile segmentation capabilities, along with intuitive prompt-based interactions. However, customizing SAM for medical image segmentation requires massive amounts of pixel-level annotations and precise point- or box-based prompt designs. To address these challenges, we introduce PGP-SAM, a novel prototype-based few-shot tuning approach that uses limited samples to replace tedious manual prompts. Our key idea is to leverage inter- and intra-class prototypes to capture class-specific knowledge and relationships. We propose two main components: (1) a plug-and-play contextual modulation module that integrates multi-scale information, and (2) a class-guided cross-attention mechanism that fuses prototypes and features for automatic prompt generation. Experiments on a public multi-organ dataset and a private ventricle dataset demonstrate that PGP-SAM achieves superior mean Dice scores compared with existing prompt-free SAM variants, while using only 10\% of the 2D slices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06692v1-abstract-full').style.display = 'none'; document.getElementById('2501.06692v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, Accepted at ISBI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01973">arXiv:2501.01973</a> <span> [<a href="https://arxiv.org/pdf/2501.01973">pdf</a>, <a href="https://arxiv.org/format/2501.01973">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> INFELM: In-depth Fairness Evaluation of Large Text-To-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+D">Di Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xing Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Yap%2C+J+Q">Jia Qing Yap</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+A">Andrea Wong</a>, <a href="/search/cs?searchtype=author&query=Crespo%2C+A">Adriana Crespo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qi Lin</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhiyuan Yin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Qiang Yan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+R">Ryan Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01973v3-abstract-short" style="display: inline;"> The rapid development of large language models (LLMs) and large vision models (LVMs) have propelled the evolution of multi-modal AI systems, which have demonstrated the remarkable potential for industrial applications by emulating human-like cognition. However, they also pose significant ethical challenges, including amplifying harmful content and reinforcing societal biases. For instance, biases… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01973v3-abstract-full').style.display = 'inline'; document.getElementById('2501.01973v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01973v3-abstract-full" style="display: none;"> The rapid development of large language models (LLMs) and large vision models (LVMs) have propelled the evolution of multi-modal AI systems, which have demonstrated the remarkable potential for industrial applications by emulating human-like cognition. However, they also pose significant ethical challenges, including amplifying harmful content and reinforcing societal biases. For instance, biases in some industrial image generation models highlighted the urgent need for robust fairness assessments. Most existing evaluation frameworks focus on the comprehensiveness of various aspects of the models, but they exhibit critical limitations, including insufficient attention to content generation alignment and social bias-sensitive domains. More importantly, their reliance on pixel-detection techniques is prone to inaccuracies. To address these issues, this paper presents INFELM, an in-depth fairness evaluation on widely-used text-to-image models. Our key contributions are: (1) an advanced skintone classifier incorporating facial topology and refined skin pixel representation to enhance classification precision by at least 16.04%, (2) a bias-sensitive content alignment measurement for understanding societal impacts, (3) a generalizable representation bias evaluation for diverse demographic groups, and (4) extensive experiments analyzing large-scale text-to-image model outputs across six social-bias-sensitive domains. We find that existing models in the study generally do not meet the empirical fairness criteria, and representation bias is generally more pronounced than alignment errors. INFELM establishes a robust benchmark for fairness assessment, supporting the development of multi-modal AI systems that align with ethical and human-centric principles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01973v3-abstract-full').style.display = 'none'; document.getElementById('2501.01973v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Di Jin and Xing Liu contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00220">arXiv:2501.00220</a> <span> [<a href="https://arxiv.org/pdf/2501.00220">pdf</a>, <a href="https://arxiv.org/format/2501.00220">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-72335-3_8">10.1007/978-3-031-72335-3_8 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DecoratingFusion: A LiDAR-Camera Fusion Network with the Combination of Point-level and Feature-level Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zixuan Yin</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Han Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ningzhong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Huiyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiaquan Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00220v1-abstract-short" style="display: inline;"> Lidars and cameras play essential roles in autonomous driving, offering complementary information for 3D detection. The state-of-the-art fusion methods integrate them at the feature level, but they mostly rely on the learned soft association between point clouds and images, which lacks interpretability and neglects the hard association between them. In this paper, we combine feature-level fusion w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00220v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00220v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00220v1-abstract-full" style="display: none;"> Lidars and cameras play essential roles in autonomous driving, offering complementary information for 3D detection. The state-of-the-art fusion methods integrate them at the feature level, but they mostly rely on the learned soft association between point clouds and images, which lacks interpretability and neglects the hard association between them. In this paper, we combine feature-level fusion with point-level fusion, using hard association established by the calibration matrices to guide the generation of object queries. Specifically, in the early fusion stage, we use the 2D CNN features of images to decorate the point cloud data, and employ two independent sparse convolutions to extract the decorated point cloud features. In the mid-level fusion stage, we initialize the queries with a center heatmap and embed the predicted class labels as auxiliary information into the queries, making the initial positions closer to the actual centers of the targets. Extensive experiments conducted on two popular datasets, i.e. KITTI, Waymo, demonstrate the superiority of DecoratingFusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00220v1-abstract-full').style.display = 'none'; document.getElementById('2501.00220v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 2 figures. accepted by ICANN2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19654">arXiv:2412.19654</a> <span> [<a href="https://arxiv.org/pdf/2412.19654">pdf</a>, <a href="https://arxiv.org/format/2412.19654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Asymmetrical Reciprocity-based Federated Learning for Resolving Disparities in Medical Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziyi Yin</a>, <a href="/search/cs?searchtype=author&query=You%2C+Q">Quanzeng You</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+L">Lingjuan Lyu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fenglong Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19654v1-abstract-short" style="display: inline;"> Geographic health disparities pose a pressing global challenge, particularly in underserved regions of low- and middle-income nations. Addressing this issue requires a collaborative approach to enhance healthcare quality, leveraging support from medically more developed areas. Federated learning emerges as a promising tool for this purpose. However, the scarcity of medical data and limited computa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19654v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19654v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19654v1-abstract-full" style="display: none;"> Geographic health disparities pose a pressing global challenge, particularly in underserved regions of low- and middle-income nations. Addressing this issue requires a collaborative approach to enhance healthcare quality, leveraging support from medically more developed areas. Federated learning emerges as a promising tool for this purpose. However, the scarcity of medical data and limited computation resources in underserved regions make collaborative training of powerful machine learning models challenging. Furthermore, there exists an asymmetrical reciprocity between underserved and developed regions. To overcome these challenges, we propose a novel cross-silo federated learning framework, named FedHelp, aimed at alleviating geographic health disparities and fortifying the diagnostic capabilities of underserved regions. Specifically, FedHelp leverages foundational model knowledge via one-time API access to guide the learning process of underserved small clients, addressing the challenge of insufficient data. Additionally, we introduce a novel asymmetric dual knowledge distillation module to manage the issue of asymmetric reciprocity, facilitating the exchange of necessary knowledge between developed large clients and underserved small clients. We validate the effectiveness and utility of FedHelp through extensive experiments on both medical image classification and segmentation tasks. The experimental results demonstrate significant performance improvement compared to state-of-the-art baselines, particularly benefiting clients in underserved regions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19654v1-abstract-full').style.display = 'none'; document.getElementById('2412.19654v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Jiaqi Wang and Ziyi Yin equally contributed to this paper. This paper has been accepted by KDD 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18194">arXiv:2412.18194</a> <span> [<a href="https://arxiv.org/pdf/2412.18194">pdf</a>, <a href="https://arxiv.org/format/2412.18194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiduo Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peiju Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaopeng Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qinghui Gao</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zhaoye Fei</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhangyue Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18194v1-abstract-short" style="display: inline;"> General-purposed embodied agents are designed to understand the users' natural instructions or intentions and act precisely to complete universal tasks. Recently, methods based on foundation models especially Vision-Language-Action models (VLAs) have shown a substantial potential to solve language-conditioned manipulation (LCM) tasks well. However, existing benchmarks do not adequately meet the ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18194v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18194v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18194v1-abstract-full" style="display: none;"> General-purposed embodied agents are designed to understand the users' natural instructions or intentions and act precisely to complete universal tasks. Recently, methods based on foundation models especially Vision-Language-Action models (VLAs) have shown a substantial potential to solve language-conditioned manipulation (LCM) tasks well. However, existing benchmarks do not adequately meet the needs of VLAs and relative algorithms. To better define such general-purpose tasks in the context of LLMs and advance the research in VLAs, we present VLABench, an open-source benchmark for evaluating universal LCM task learning. VLABench provides 100 carefully designed categories of tasks, with strong randomization in each category of task and a total of 2000+ objects. VLABench stands out from previous benchmarks in four key aspects: 1) tasks requiring world knowledge and common sense transfer, 2) natural language instructions with implicit human intentions rather than templates, 3) long-horizon tasks demanding multi-step reasoning, and 4) evaluation of both action policies and language model capabilities. The benchmark assesses multiple competencies including understanding of mesh\&texture, spatial relationship, semantic instruction, physical laws, knowledge transfer and reasoning, etc. To support the downstream finetuning, we provide high-quality training data collected via an automated framework incorporating heuristic skills and prior information. The experimental results indicate that both the current state-of-the-art pretrained VLAs and the workflow based on VLMs face challenges in our tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18194v1-abstract-full').style.display = 'none'; document.getElementById('2412.18194v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14505">arXiv:2412.14505</a> <span> [<a href="https://arxiv.org/pdf/2412.14505">pdf</a>, <a href="https://arxiv.org/ps/2412.14505">ps</a>, <a href="https://arxiv.org/format/2412.14505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A hybrid framework for effective and efficient machine unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingxin Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yizhen Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Ning Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhigang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaodong Wang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+H">Haipeng Qu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jia Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+S">Shen Su</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhichao Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14505v1-abstract-short" style="display: inline;"> Recently machine unlearning (MU) is proposed to remove the imprints of revoked samples from the already trained model parameters, to solve users' privacy concern. Different from the runtime expensive retraining from scratch, there exist two research lines, exact MU and approximate MU with different favorites in terms of accuracy and efficiency. In this paper, we present a novel hybrid strategy on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14505v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14505v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14505v1-abstract-full" style="display: none;"> Recently machine unlearning (MU) is proposed to remove the imprints of revoked samples from the already trained model parameters, to solve users' privacy concern. Different from the runtime expensive retraining from scratch, there exist two research lines, exact MU and approximate MU with different favorites in terms of accuracy and efficiency. In this paper, we present a novel hybrid strategy on top of them to achieve an overall success. It implements the unlearning operation with an acceptable computation cost, while simultaneously improving the accuracy as much as possible. Specifically, it runs reasonable unlearning techniques by estimating the retraining workloads caused by revocations. If the workload is lightweight, it performs retraining to derive the model parameters consistent with the accurate ones retrained from scratch. Otherwise, it outputs the unlearned model by directly modifying the current parameters, for better efficiency. In particular, to improve the accuracy in the latter case, we propose an optimized version to amend the output model with lightweight runtime penalty. We particularly study the boundary of two approaches in our frameworks to adaptively make the smart selection. Extensive experiments on real datasets validate that our proposals can improve the unlearning efficiency by 1.5$\times$ to 8$\times$ while achieving comparable accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14505v1-abstract-full').style.display = 'none'; document.getElementById('2412.14505v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 5 figures, accepted by CSE2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14135">arXiv:2412.14135</a> <span> [<a href="https://arxiv.org/pdf/2412.14135">pdf</a>, <a href="https://arxiv.org/format/2412.14135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling of Search and Learning: A Roadmap to Reproduce o1 from Reinforcement Learning Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Q">Qinyuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhangyue Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shimin Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yunhua Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14135v1-abstract-short" style="display: inline;"> OpenAI o1 represents a significant milestone in Artificial Inteiligence, which achieves expert-level performances on many challanging tasks that require strong reasoning ability.OpenAI has claimed that the main techinique behinds o1 is the reinforcement learining. Recent works use alternative approaches like knowledge distillation to imitate o1's reasoning style, but their effectiveness is limited… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14135v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14135v1-abstract-full" style="display: none;"> OpenAI o1 represents a significant milestone in Artificial Inteiligence, which achieves expert-level performances on many challanging tasks that require strong reasoning ability.OpenAI has claimed that the main techinique behinds o1 is the reinforcement learining. Recent works use alternative approaches like knowledge distillation to imitate o1's reasoning style, but their effectiveness is limited by the capability ceiling of the teacher model. Therefore, this paper analyzes the roadmap to achieving o1 from the perspective of reinforcement learning, focusing on four key components: policy initialization, reward design, search, and learning. Policy initialization enables models to develop human-like reasoning behaviors, equipping them with the ability to effectively explore solution spaces for complex problems. Reward design provides dense and effective signals via reward shaping or reward modeling, which is the guidance for both search and learning. Search plays a crucial role in generating high-quality solutions during both training and testing phases, which can produce better solutions with more computation. Learning utilizes the data generated by search for improving policy, which can achieve the better performance with more parameters and more searched data. Existing open-source projects that attempt to reproduce o1 can be seem as a part or a variant of our roadmap. Collectively, these components underscore how learning and search drive o1's advancement, making meaningful contributions to the development of LLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14135v1-abstract-full').style.display = 'none'; document.getElementById('2412.14135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13388">arXiv:2412.13388</a> <span> [<a href="https://arxiv.org/pdf/2412.13388">pdf</a>, <a href="https://arxiv.org/format/2412.13388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> Catalysts of Conversation: Examining Interaction Dynamics Between Topic Initiators and Commentors in Alzheimer's Disease Online Communities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+C">Congning Ni</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qingxia Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Lijun Song</a>, <a href="/search/cs?searchtype=author&query=Commiskey%2C+P">Patricia Commiskey</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Q">Qingyuan Song</a>, <a href="/search/cs?searchtype=author&query=Malin%2C+B+A">Bradley A. Malin</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhijun Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13388v1-abstract-short" style="display: inline;"> Informal caregivers (e.g.,family members or friends) of people living with Alzheimers Disease and Related Dementias (ADRD) face substantial challenges and often seek informational or emotional support through online communities. Understanding the factors that drive engagement within these platforms is crucial, as it can enhance their long-term value for caregivers by ensuring that these communitie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13388v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13388v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13388v1-abstract-full" style="display: none;"> Informal caregivers (e.g.,family members or friends) of people living with Alzheimers Disease and Related Dementias (ADRD) face substantial challenges and often seek informational or emotional support through online communities. Understanding the factors that drive engagement within these platforms is crucial, as it can enhance their long-term value for caregivers by ensuring that these communities effectively meet their needs. This study investigated the user interaction dynamics within two large, popular ADRD communities, TalkingPoint and ALZConnected, focusing on topic initiator engagement, initial post content, and the linguistic patterns of comments at the thread level. Using analytical methods such as propensity score matching, topic modeling, and predictive modeling, we found that active topic initiator engagement drives higher comment volumes, and reciprocal replies from topic initiators encourage further commentor engagement at the community level. Practical caregiving topics prompt more re-engagement of topic initiators, while emotional support topics attract more comments from other commentors. Additionally, the linguistic complexity and emotional tone of a comment influence its likelihood of receiving replies from topic initiators. These findings highlight the importance of fostering active and reciprocal engagement and providing effective strategies to enhance sustainability in ADRD caregiving and broader health-related online communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13388v1-abstract-full').style.display = 'none'; document.getElementById('2412.13388v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 11 figures (6 in main text and 5 in the appendix). The paper includes statistical analyses, structural topic modeling, and predictive modeling to examine user engagement dynamics in Alzheimers Disease online communities. Submitted for consideration to The Web Conference 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12562">arXiv:2412.12562</a> <span> [<a href="https://arxiv.org/pdf/2412.12562">pdf</a>, <a href="https://arxiv.org/format/2412.12562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Oriented Object Detection with Enhanced Small Object Recognition in Aerial Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhifei Shi</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zongyao Yin</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+S">Sheng Chang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xiao Yi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xianchuan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12562v1-abstract-short" style="display: inline;"> Achieving a balance between computational efficiency and detection accuracy in the realm of rotated bounding box object detection within aerial imagery is a significant challenge. While prior research has aimed at creating lightweight models that enhance computational performance and feature extraction, there remains a gap in the performance of these networks when it comes to the detection of smal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12562v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12562v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12562v1-abstract-full" style="display: none;"> Achieving a balance between computational efficiency and detection accuracy in the realm of rotated bounding box object detection within aerial imagery is a significant challenge. While prior research has aimed at creating lightweight models that enhance computational performance and feature extraction, there remains a gap in the performance of these networks when it comes to the detection of small and multi-scale objects in remote sensing (RS) imagery. To address these challenges, we present a novel enhancement to the YOLOv8 model, tailored for oriented object detection tasks and optimized for environments with limited computational resources. Our model features a wavelet transform-based C2f module for capturing associative features and an Adaptive Scale Feature Pyramid (ASFP) module that leverages P2 layer details. Additionally, the incorporation of GhostDynamicConv significantly contributes to the model's lightweight nature, ensuring high efficiency in aerial imagery analysis. Featuring a parameter count of 21.6M, our approach provides a more efficient architectural design than DecoupleNet, which has 23.3M parameters, all while maintaining detection accuracy. On the DOTAv1.0 dataset, our model demonstrates a mean Average Precision (mAP) that is competitive with leading methods such as DecoupleNet. The model's efficiency, combined with its reduced parameter count, makes it a strong candidate for aerial object detection, particularly in resource-constrained environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12562v1-abstract-full').style.display = 'none'; document.getElementById('2412.12562v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09919">arXiv:2412.09919</a> <span> [<a href="https://arxiv.org/pdf/2412.09919">pdf</a>, <a href="https://arxiv.org/ps/2412.09919">ps</a>, <a href="https://arxiv.org/format/2412.09919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> B-VLLM: A Vision Large Language Model with Balanced Spatio-Temporal Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhuqiang Lu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=He%2C+M">Mengwei He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhihui Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zicheng Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kun Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09919v1-abstract-short" style="display: inline;"> Recently, Vision Large Language Models (VLLMs) integrated with vision encoders have shown promising performance in vision understanding. The key of VLLMs is to encode visual content into sequences of visual tokens, enabling VLLMs to simultaneously process both visual and textual content. However, understanding videos, especially long videos, remain a challenge to VLLMs as the number of visual toke… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09919v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09919v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09919v1-abstract-full" style="display: none;"> Recently, Vision Large Language Models (VLLMs) integrated with vision encoders have shown promising performance in vision understanding. The key of VLLMs is to encode visual content into sequences of visual tokens, enabling VLLMs to simultaneously process both visual and textual content. However, understanding videos, especially long videos, remain a challenge to VLLMs as the number of visual tokens grows rapidly when encoding videos, resulting in the risk of exceeding the context window of VLLMs and introducing heavy computation burden. To restrict the number of visual tokens, existing VLLMs either: (1) uniformly downsample videos into a fixed number of frames or (2) reducing the number of visual tokens encoded from each frame. We argue the former solution neglects the rich temporal cue in videos and the later overlooks the spatial details in each frame. In this work, we present Balanced-VLLM (B-VLLM): a novel VLLM framework that aims to effectively leverage task relevant spatio-temporal cues while restricting the number of visual tokens under the VLLM context window length. At the core of our method, we devise a text-conditioned adaptive frame selection module to identify frames relevant to the visual understanding task. The selected frames are then de-duplicated using a temporal frame token merging technique. The visual tokens of the selected frames are processed through a spatial token sampling module and an optional spatial token merging strategy to achieve precise control over the token count. Experimental results show that B-VLLM is effective in balancing the number of frames and visual tokens in video understanding, yielding superior performance on various video understanding benchmarks. Our code is available at https://github.com/zhuqiangLu/B-VLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09919v1-abstract-full').style.display = 'none'; document.getElementById('2412.09919v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09661">arXiv:2412.09661</a> <span> [<a href="https://arxiv.org/pdf/2412.09661">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Language model driven: a PROTAC generation pipeline with dual constraints of structure and property </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jinsong Shao</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Q">Qineng Gong</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zeyu Yin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yajie Hao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Linlin Jiang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+M">Min Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinlong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fubo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Li Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09661v1-abstract-short" style="display: inline;"> The imperfect modeling of ternary complexes has limited the application of computer-aided drug discovery tools in PROTAC research and development. In this study, an AI-assisted approach for PROTAC molecule design pipeline named LM-PROTAC was developed, which stands for language model driven Proteolysis Targeting Chimera, by embedding a transformer-based generative model with dual constraints on st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09661v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09661v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09661v1-abstract-full" style="display: none;"> The imperfect modeling of ternary complexes has limited the application of computer-aided drug discovery tools in PROTAC research and development. In this study, an AI-assisted approach for PROTAC molecule design pipeline named LM-PROTAC was developed, which stands for language model driven Proteolysis Targeting Chimera, by embedding a transformer-based generative model with dual constraints on structure and properties, referred to as the DCT. This study utilized the fragmentation representation of molecules and developed a language model driven pipeline. Firstly, a language model driven affinity model for protein compounds to screen molecular fragments with high affinity for the target protein. Secondly, structural and physicochemical properties of these fragments were constrained during the generation process to meet specific scenario requirements. Finally, a two-round screening of the preliminary generated molecules using a multidimensional property prediction model to generate a batch of PROTAC molecules capable of degrading disease-relevant target proteins for validation in vitro experiments, thus achieving a complete solution for AI-assisted PROTAC drug generation. Taking the tumor key target Wnt3a as an example, the LM-PROTAC pipeline successfully generated PROTAC molecules capable of inhibiting Wnt3a. The results show that DCT can efficiently generate PROTAC that targets and hydrolyses Wnt3a. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09661v1-abstract-full').style.display = 'none'; document.getElementById('2412.09661v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">61 pages,12 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; D.3.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09276">arXiv:2412.09276</a> <span> [<a href="https://arxiv.org/pdf/2412.09276">pdf</a>, <a href="https://arxiv.org/format/2412.09276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-Video Multi-Grained Integration for Video Moment Montage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhihui Yin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Ye Ma</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xipeng Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Quan Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09276v1-abstract-short" style="display: inline;"> The proliferation of online short video platforms has driven a surge in user demand for short video editing. However, manually selecting, cropping, and assembling raw footage into a coherent, high-quality video remains laborious and time-consuming. To accelerate this process, we focus on a user-friendly new task called Video Moment Montage (VMM), which aims to accurately locate the corresponding v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09276v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09276v1-abstract-full" style="display: none;"> The proliferation of online short video platforms has driven a surge in user demand for short video editing. However, manually selecting, cropping, and assembling raw footage into a coherent, high-quality video remains laborious and time-consuming. To accelerate this process, we focus on a user-friendly new task called Video Moment Montage (VMM), which aims to accurately locate the corresponding video segments based on a pre-provided narration text and then arrange these video clips to create a complete video that aligns with the corresponding descriptions. The challenge lies in extracting precise temporal segments while ensuring intra-sentence and inter-sentence context consistency, as a single script sentence may require trimming and assembling multiple video clips. To address this problem, we present a novel \textit{Text-Video Multi-Grained Integration} method (TV-MGI) that efficiently fuses text features from the script with both shot-level and frame-level video features, which enables the global and fine-grained alignment between the video content and the corresponding textual descriptions in the script. To facilitate further research in this area, we introduce the Multiple Sentences with Shots Dataset (MSSD), a large-scale dataset designed explicitly for the VMM task. We conduct extensive experiments on the MSSD dataset to demonstrate the effectiveness of our framework compared to baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09276v1-abstract-full').style.display = 'none'; document.getElementById('2412.09276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07202">arXiv:2412.07202</a> <span> [<a href="https://arxiv.org/pdf/2412.07202">pdf</a>, <a href="https://arxiv.org/format/2412.07202">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> BrokerChain: A Blockchain Sharding Protocol by Exploiting Broker Accounts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huawei Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhaokang Yin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qinde Chen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+G">Guang Ye</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiaowen Peng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yue Lin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zibin Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07202v1-abstract-short" style="display: inline;"> State-of-the-art blockchain sharding solutions such as Monoxide, can cause severely imbalanced distribution of transaction (TX) workloads across all blockchain shards due to the deployment policy of their accounts. Imbalanced TX distributions then produce hot shards, in which the cross-shard TXs may experience an unlimited confirmation latency. Thus, how to address the hot-shard issue and how to r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07202v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07202v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07202v1-abstract-full" style="display: none;"> State-of-the-art blockchain sharding solutions such as Monoxide, can cause severely imbalanced distribution of transaction (TX) workloads across all blockchain shards due to the deployment policy of their accounts. Imbalanced TX distributions then produce hot shards, in which the cross-shard TXs may experience an unlimited confirmation latency. Thus, how to address the hot-shard issue and how to reduce crossshard TXs become significant challenges of blockchain sharding. Through reviewing the related studies, we find that a crossshard TX protocol that can achieve workload balance among all shards and simultaneously reduce the quantity of crossshard TXs is still absent from the literature. To this end, we propose BrokerChain, which is a cross-shard blockchain protocol dedicated to account-based state sharding. Essentially, BrokerChain exploits fine-grained state partition and account segmentation. We also elaborate on how BrokerChain handles cross-shard TXs through broker accounts. The security issues and other properties of BrokerChain are analyzed rigorously. Finally, we conduct comprehensive evaluations using an opensource blockchain sharding prototype named BlockEmulator. The evaluation results show that BrokerChain outperforms other baselines in terms of transaction throughput, transaction confirmation latency, the queue size of the transaction pool, and workload balance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07202v1-abstract-full').style.display = 'none'; document.getElementById('2412.07202v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01708">arXiv:2412.01708</a> <span> [<a href="https://arxiv.org/pdf/2412.01708">pdf</a>, <a href="https://arxiv.org/format/2412.01708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Are We There Yet? Revealing the Risks of Utilizing Large Language Models in Scholarly Peer Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+R">Rui Ye</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+X">Xianghe Pang</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+J">Jingyi Chai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaao Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+Z">Zhen Xiang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaowen Dong</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01708v1-abstract-short" style="display: inline;"> Scholarly peer review is a cornerstone of scientific advancement, but the system is under strain due to increasing manuscript submissions and the labor-intensive nature of the process. Recent advancements in large language models (LLMs) have led to their integration into peer review, with promising results such as substantial overlaps between LLM- and human-generated reviews. However, the unchecke… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01708v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01708v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01708v1-abstract-full" style="display: none;"> Scholarly peer review is a cornerstone of scientific advancement, but the system is under strain due to increasing manuscript submissions and the labor-intensive nature of the process. Recent advancements in large language models (LLMs) have led to their integration into peer review, with promising results such as substantial overlaps between LLM- and human-generated reviews. However, the unchecked adoption of LLMs poses significant risks to the integrity of the peer review system. In this study, we comprehensively analyze the vulnerabilities of LLM-generated reviews by focusing on manipulation and inherent flaws. Our experiments show that injecting covert deliberate content into manuscripts allows authors to explicitly manipulate LLM reviews, leading to inflated ratings and reduced alignment with human reviews. In a simulation, we find that manipulating 5% of the reviews could potentially cause 12% of the papers to lose their position in the top 30% rankings. Implicit manipulation, where authors strategically highlight minor limitations in their papers, further demonstrates LLMs' susceptibility compared to human reviewers, with a 4.5 times higher consistency with disclosed limitations. Additionally, LLMs exhibit inherent flaws, such as potentially assigning higher ratings to incomplete papers compared to full papers and favoring well-known authors in single-blind review process. These findings highlight the risks of over-reliance on LLMs in peer review, underscoring that we are not yet ready for widespread adoption and emphasizing the need for robust safeguards. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01708v1-abstract-full').style.display = 'none'; document.getElementById('2412.01708v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 24 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19946">arXiv:2411.19946</a> <span> [<a href="https://arxiv.org/pdf/2411.19946">pdf</a>, <a href="https://arxiv.org/format/2411.19946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DELT: A Simple Diversity-driven EarlyLate Training for Dataset Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhiqiang Shen</a>, <a href="/search/cs?searchtype=author&query=Sherif%2C+A">Ammar Sherif</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zeyuan Yin</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+S">Shitong Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19946v1-abstract-short" style="display: inline;"> Recent advances in dataset distillation have led to solutions in two main directions. The conventional batch-to-batch matching mechanism is ideal for small-scale datasets and includes bi-level optimization methods on models and syntheses, such as FRePo, RCIG, and RaT-BPTT, as well as other methods like distribution matching, gradient matching, and weight trajectory matching. Conversely, batch-to-g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19946v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19946v1-abstract-full" style="display: none;"> Recent advances in dataset distillation have led to solutions in two main directions. The conventional batch-to-batch matching mechanism is ideal for small-scale datasets and includes bi-level optimization methods on models and syntheses, such as FRePo, RCIG, and RaT-BPTT, as well as other methods like distribution matching, gradient matching, and weight trajectory matching. Conversely, batch-to-global matching typifies decoupled methods, which are particularly advantageous for large-scale datasets. This approach has garnered substantial interest within the community, as seen in SRe$^2$L, G-VBSM, WMDD, and CDA. A primary challenge with the second approach is the lack of diversity among syntheses within each class since samples are optimized independently and the same global supervision signals are reused across different synthetic images. In this study, we propose a new Diversity-driven EarlyLate Training (DELT) scheme to enhance the diversity of images in batch-to-global matching with less computation. Our approach is conceptually simple yet effective, it partitions predefined IPC samples into smaller subtasks and employs local optimizations to distill each subset into distributions from distinct phases, reducing the uniformity induced by the unified optimization process. These distilled images from the subtasks demonstrate effective generalization when applied to the entire task. We conduct extensive experiments on CIFAR, Tiny-ImageNet, ImageNet-1K, and its sub-datasets. Our approach outperforms the previous state-of-the-art by 2$\sim$5% on average across different datasets and IPCs (images per class), increasing diversity per class by more than 5% while reducing synthesis time by up to 39.3% for enhancing the training efficiency. Code is available at: https://github.com/VILA-Lab/DELT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19946v1-abstract-full').style.display = 'none'; document.getElementById('2411.19946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17052">arXiv:2411.17052</a> <span> [<a href="https://arxiv.org/pdf/2411.17052">pdf</a>, <a href="https://arxiv.org/format/2411.17052">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Programming-Based Offline Redundancy Resolution of Redundant Manipulators Along Prescribed Paths with Real-Time Adjustment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhihang Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fa Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqian Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianmin Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jiyong Tan</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+D">Dexing Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17052v1-abstract-short" style="display: inline;"> Traditional offline redundancy resolution of trajectories for redundant manipulators involves computing inverse kinematic solutions for Cartesian space paths, constraining the manipulator to a fixed path without real-time adjustments. Online redundancy resolution can achieve real-time adjustment of paths, but it cannot consider subsequent path points, leading to the possibility of the manipulator… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17052v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17052v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17052v1-abstract-full" style="display: none;"> Traditional offline redundancy resolution of trajectories for redundant manipulators involves computing inverse kinematic solutions for Cartesian space paths, constraining the manipulator to a fixed path without real-time adjustments. Online redundancy resolution can achieve real-time adjustment of paths, but it cannot consider subsequent path points, leading to the possibility of the manipulator being forced to stop mid-motion due to joint constraints. To address this, this paper introduces a dynamic programming-based offline redundancy resolution for redundant manipulators along prescribed paths with real-time adjustment. The proposed method allows the manipulator to move along a prescribed path while implementing real-time adjustment along the normal to the path. Using Dynamic Programming, the proposed approach computes a global maximum for the variation of adjustment coefficients. As long as the coefficient variation between adjacent sampling path points does not exceed this limit, the algorithm provides the next path point's joint angles based on the current joint angles, enabling the end-effector to achieve the adjusted Cartesian pose. The main innovation of this paper lies in augmenting traditional offline optimal planning with real-time adjustment capabilities, achieving a fusion of offline planning and online planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17052v1-abstract-full').style.display = 'none'; document.getElementById('2411.17052v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17034">arXiv:2411.17034</a> <span> [<a href="https://arxiv.org/pdf/2411.17034">pdf</a>, <a href="https://arxiv.org/format/2411.17034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Programming-Based Redundancy Resolution for Path Planning of Redundant Manipulators Considering Breakpoints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhihang Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fa Wu</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+R">Ruofan Bian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqian Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianmin Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jiyong Tan</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+D">Dexing Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17034v1-abstract-short" style="display: inline;"> This paper proposes a redundancy resolution algorithm for a redundant manipulator based on dynamic programming. This algorithm can compute the desired joint angles at each point on a pre-planned discrete path in Cartesian space, while ensuring that the angles, velocities, and accelerations of each joint do not exceed the manipulator's constraints. We obtain the analytical solution to the inverse k… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17034v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17034v1-abstract-full" style="display: none;"> This paper proposes a redundancy resolution algorithm for a redundant manipulator based on dynamic programming. This algorithm can compute the desired joint angles at each point on a pre-planned discrete path in Cartesian space, while ensuring that the angles, velocities, and accelerations of each joint do not exceed the manipulator's constraints. We obtain the analytical solution to the inverse kinematics problem of the manipulator using a parameterization method, transforming the redundancy resolution problem into an optimization problem of determining the parameters at each path point. The constraints on joint velocity and acceleration serve as constraints for the optimization problem. Then all feasible inverse kinematic solutions for each pose under the joint angle constraints of the manipulator are obtained through parameterization methods, and the globally optimal solution to this problem is obtained through the dynamic programming algorithm. On the other hand, if a feasible joint-space path satisfying the constraints does not exist, the proposed algorithm can compute the minimum number of breakpoints required for the path and partition the path with as few breakpoints as possible to facilitate the manipulator's operation along the path. The algorithm can also determine the optimal selection of breakpoints to minimize the global cost function, rather than simply interrupting when the manipulator is unable to continue operating. The proposed algorithm is tested using a manipulator produced by a certain manufacturer, demonstrating the effectiveness of the algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17034v1-abstract-full').style.display = 'none'; document.getElementById('2411.17034v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13819">arXiv:2411.13819</a> <span> [<a href="https://arxiv.org/pdf/2411.13819">pdf</a>, <a href="https://arxiv.org/format/2411.13819">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Robust Steganography with Boundary-Preserving Overflow Alleviation and Adaptive Error Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhenlin Luo</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhaoxia Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13819v1-abstract-short" style="display: inline;"> With the rapid evolution of the Internet, the vast amount of data has created opportunities for fostering the development of steganographic techniques. However, traditional steganographic techniques encounter challenges due to distortions in online social networks, such as JPEG recompression. Presently, research into the lossy operations of spatial truncation in JPEG recompression remains limited.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13819v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13819v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13819v1-abstract-full" style="display: none;"> With the rapid evolution of the Internet, the vast amount of data has created opportunities for fostering the development of steganographic techniques. However, traditional steganographic techniques encounter challenges due to distortions in online social networks, such as JPEG recompression. Presently, research into the lossy operations of spatial truncation in JPEG recompression remains limited. Existing methods aim to ensure the stability of the quantized coefficients by reducing the effects of spatial truncation. Nevertheless, these approaches may induce notable alterations to image pixels, potentially compromising anti-steganalysis performance. In this study, we analyzed the overflow characteristics of spatial blocks and observed that pixel values at the boundaries of spatial blocks are more prone to overflow. Building upon this observation, we proposed a preprocessing method that performs overflow removal operations based on the actual overflow conditions of spatial blocks. After preprocessing, our algorithm enhances coefficient stability while minimizing modifications to spatial block boundaries, favoring image quality preservation. Subsequently, we employed adaptive error correction coding to reduce coding redundancy, thereby augmenting robustness and mitigating its impact on anti-steganalysis performance. The experimental results indicate that the proposed method possesses a strong embedding capacity, maintaining a high level of robustness while enhancing security. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13819v1-abstract-full').style.display = 'none'; document.getElementById('2411.13819v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13715">arXiv:2411.13715</a> <span> [<a href="https://arxiv.org/pdf/2411.13715">pdf</a>, <a href="https://arxiv.org/format/2411.13715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SimPhony: A Device-Circuit-Architecture Cross-Layer Modeling and Simulation Framework for Heterogeneous Electronic-Photonic AI System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&query=Begovic%2C+A">Amir Begovic</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rena Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13715v1-abstract-short" style="display: inline;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13715v1-abstract-full" style="display: none;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accurate, fast, and easy-to-use EPIC AI system simulation framework significantly limits the exploration of hardware innovations and system evaluations on common benchmarks. To address this gap, we propose SimPhony, a cross-layer modeling and simulation framework for heterogeneous electronic-photonic AI systems. SimPhony offers a platform that enables (1) generic, extensible hardware topology representation that supports heterogeneous multi-core architectures with diverse photonic tensor core designs; (2) optics-specific dataflow modeling with unique multi-dimensional parallelism and reuse beyond spatial/temporal dimensions; (3) data-aware energy modeling with realistic device responses, layout-aware area estimation, link budget analysis, and bandwidth-adaptive memory modeling; and (4) seamless integration with model training framework for hardware/software co-simulation. By providing a unified, versatile, and high-fidelity simulation platform, SimPhony enables researchers to innovate and evaluate EPIC AI hardware across multiple domains, facilitating the next leap in emerging AI hardware. We open-source our codes at https://github.com/ScopeX-ASU/SimPhony <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'none'; document.getElementById('2411.13715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11581">arXiv:2411.11581</a> <span> [<a href="https://arxiv.org/pdf/2411.11581">pdf</a>, <a href="https://arxiv.org/format/2411.11581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OASIS: Open Agent Social Interaction Simulations with One Million Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zaibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zirui Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuxian Jiang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Ziyue Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zijian Ling</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinsong Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Martz Ma</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bowen Dong</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prateek Gupta</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chaochao Lu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11581v4-abstract-short" style="display: inline;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a parti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'inline'; document.getElementById('2411.11581v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11581v4-abstract-full" style="display: none;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (i.e., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can be easily extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that the larger agent group scale leads to more enhanced group dynamics and more diverse and helpful agents' opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'none'; document.getElementById('2411.11581v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09914">arXiv:2411.09914</a> <span> [<a href="https://arxiv.org/pdf/2411.09914">pdf</a>, <a href="https://arxiv.org/format/2411.09914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> mmSpyVR: Exploiting mmWave Radar for Penetrating Obstacles to Uncover Privacy Vulnerability of Virtual Reality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mei%2C+L">Luoyu Mei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhimeng Yin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qingchuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenchao Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Kangjie Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tian He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09914v1-abstract-short" style="display: inline;"> Virtual reality (VR), while enhancing user experiences, introduces significant privacy risks. This paper reveals a novel vulnerability in VR systems that allows attackers to capture VR privacy through obstacles utilizing millimeter-wave (mmWave) signals without physical intrusion and virtual connection with the VR devices. We propose mmSpyVR, a novel attack on VR user's privacy via mmWave radar. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09914v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09914v1-abstract-full" style="display: none;"> Virtual reality (VR), while enhancing user experiences, introduces significant privacy risks. This paper reveals a novel vulnerability in VR systems that allows attackers to capture VR privacy through obstacles utilizing millimeter-wave (mmWave) signals without physical intrusion and virtual connection with the VR devices. We propose mmSpyVR, a novel attack on VR user's privacy via mmWave radar. The mmSpyVR framework encompasses two main parts: (i) A transfer learning-based feature extraction model to achieve VR feature extraction from mmWave signal. (ii) An attention-based VR privacy spying module to spy VR privacy information from the extracted feature. The mmSpyVR demonstrates the capability to extract critical VR privacy from the mmWave signals that have penetrated through obstacles. We evaluate mmSpyVR through IRB-approved user studies. Across 22 participants engaged in four experimental scenes utilizing VR devices from three different manufacturers, our system achieves an application recognition accuracy of 98.5\% and keystroke recognition accuracy of 92.6\%. This newly discovered vulnerability has implications across various domains, such as cybersecurity, privacy protection, and VR technology development. We also engage with VR manufacturer Meta to discuss and explore potential mitigation strategies. Data and code are publicly available for scrutiny and research at https://github.com/luoyumei1-a/mmSpyVR/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09914v1-abstract-full').style.display = 'none'; document.getElementById('2411.09914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05748">arXiv:2411.05748</a> <span> [<a href="https://arxiv.org/pdf/2411.05748">pdf</a>, <a href="https://arxiv.org/format/2411.05748">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Multi-Dimensional Reconfigurable, Physically Composable Hybrid Diffractive Optical Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yu Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05748v1-abstract-short" style="display: inline;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05748v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05748v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05748v1-abstract-full" style="display: none;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To overcome this challenge, we introduce, for the first time, a multi-dimensional reconfigurable hybrid diffractive ONN system (MDR-HDONN), a physically composable architecture that unlocks a new degree of freedom and unprecedented versatility in DONNs. By leveraging full-system learnability, MDR-HDONN repurposes fixed fabricated optical hardware, achieving exponentially expanded functionality and superior task adaptability through the differentiable learning of system variables. Furthermore, MDR-HDONN adopts a hybrid optical/photonic design, combining the reconfigurability of integrated photonics with the ultra-parallelism of free-space diffractive systems. Extensive evaluations demonstrate that MDR-HDONN has digital-comparable accuracy on various task adaptations with 74x faster speed and 194x lower energy. Compared to prior DONNs, MDR-HDONN shows exponentially larger functional space with 5x faster training speed, paving the way for a new paradigm of versatile, composable, hybrid optical/photonic AI computing. We will open-source our codes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05748v1-abstract-full').style.display = 'none'; document.getElementById('2411.05748v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05636">arXiv:2411.05636</a> <span> [<a href="https://arxiv.org/pdf/2411.05636">pdf</a>, <a href="https://arxiv.org/format/2411.05636">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Video RWKV:Video Action Recognition Based RWKV </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhuowen Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengru Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xingbo Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05636v1-abstract-short" style="display: inline;"> To address the challenges of high computational costs and long-distance dependencies in exist ing video understanding methods, such as CNNs and Transformers, this work introduces RWKV to the video domain in a novel way. We propose a LSTM CrossRWKV (LCR) framework, designed for spatiotemporal representation learning to tackle the video understanding task. Specifically, the proposed linear complexit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05636v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05636v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05636v1-abstract-full" style="display: none;"> To address the challenges of high computational costs and long-distance dependencies in exist ing video understanding methods, such as CNNs and Transformers, this work introduces RWKV to the video domain in a novel way. We propose a LSTM CrossRWKV (LCR) framework, designed for spatiotemporal representation learning to tackle the video understanding task. Specifically, the proposed linear complexity LCR incorporates a novel Cross RWKV gate to facilitate interaction be tween current frame edge information and past features, enhancing the focus on the subject through edge features and globally aggregating inter-frame features over time. LCR stores long-term mem ory for video processing through an enhanced LSTM recurrent execution mechanism. By leveraging the Cross RWKV gate and recurrent execution, LCR effectively captures both spatial and temporal features. Additionally, the edge information serves as a forgetting gate for LSTM, guiding long-term memory management.Tube masking strategy reduces redundant information in food and reduces overfitting.These advantages enable LSTM CrossRWKV to set a new benchmark in video under standing, offering a scalable and efficient solution for comprehensive video analysis. All code and models are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05636v1-abstract-full').style.display = 'none'; document.getElementById('2411.05636v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04992">arXiv:2411.04992</a> <span> [<a href="https://arxiv.org/pdf/2411.04992">pdf</a>, <a href="https://arxiv.org/format/2411.04992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Which bits went where? Past and future transfer entropy decomposition with the information bottleneck </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Murphy%2C+K+A">Kieran A. Murphy</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhuowen Yin</a>, <a href="/search/cs?searchtype=author&query=Bassett%2C+D+S">Dani S. Bassett</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04992v1-abstract-short" style="display: inline;"> Whether the system under study is a shoal of fish, a collection of neurons, or a set of interacting atmospheric and oceanic processes, transfer entropy measures the flow of information between time series and can detect possible causal relationships. Much like mutual information, transfer entropy is generally reported as a single value summarizing an amount of shared variation, yet a more fine-gra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04992v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04992v1-abstract-full" style="display: none;"> Whether the system under study is a shoal of fish, a collection of neurons, or a set of interacting atmospheric and oceanic processes, transfer entropy measures the flow of information between time series and can detect possible causal relationships. Much like mutual information, transfer entropy is generally reported as a single value summarizing an amount of shared variation, yet a more fine-grained accounting might illuminate much about the processes under study. Here we propose to decompose transfer entropy and localize the bits of variation on both sides of information flow: that of the originating process's past and that of the receiving process's future. We employ the information bottleneck (IB) to compress the time series and identify the transferred entropy. We apply our method to decompose the transfer entropy in several synthetic recurrent processes and an experimental mouse dataset of concurrent behavioral and neural activity. Our approach highlights the nuanced dynamics within information flow, laying a foundation for future explorations into the intricate interplay of temporal processes in complex systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04992v1-abstract-full').style.display = 'none'; document.getElementById('2411.04992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 workshop "Machine learning and the physical sciences" Camera ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18072">arXiv:2410.18072</a> <span> [<a href="https://arxiv.org/pdf/2410.18072">pdf</a>, <a href="https://arxiv.org/format/2410.18072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WorldSimBench: Towards Video Generation Models as World Simulators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yiran Qin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhelun Shi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiwen Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xijun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+E">Enshen Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lijun Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xihui Liu</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+L">Lu Sheng</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruimao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18072v1-abstract-short" style="display: inline;"> Recent advancements in predictive models have demonstrated exceptional capabilities in predicting the future state of objects and scenes. However, the lack of categorization based on inherent characteristics continues to hinder the progress of predictive model development. Additionally, existing benchmarks are unable to effectively evaluate higher-capability, highly embodied predictive models from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18072v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18072v1-abstract-full" style="display: none;"> Recent advancements in predictive models have demonstrated exceptional capabilities in predicting the future state of objects and scenes. However, the lack of categorization based on inherent characteristics continues to hinder the progress of predictive model development. Additionally, existing benchmarks are unable to effectively evaluate higher-capability, highly embodied predictive models from an embodied perspective. In this work, we classify the functionalities of predictive models into a hierarchy and take the first step in evaluating World Simulators by proposing a dual evaluation framework called WorldSimBench. WorldSimBench includes Explicit Perceptual Evaluation and Implicit Manipulative Evaluation, encompassing human preference assessments from the visual perspective and action-level evaluations in embodied tasks, covering three representative embodied scenarios: Open-Ended Embodied Environment, Autonomous, Driving, and Robot Manipulation. In the Explicit Perceptual Evaluation, we introduce the HF-Embodied Dataset, a video assessment dataset based on fine-grained human feedback, which we use to train a Human Preference Evaluator that aligns with human perception and explicitly assesses the visual fidelity of World Simulators. In the Implicit Manipulative Evaluation, we assess the video-action consistency of World Simulators by evaluating whether the generated situation-aware video can be accurately translated into the correct control signals in dynamic environments. Our comprehensive evaluation offers key insights that can drive further innovation in video generation models, positioning World Simulators as a pivotal advancement toward embodied artificial intelligence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18072v1-abstract-full').style.display = 'none'; document.getElementById('2410.18072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16732">arXiv:2410.16732</a> <span> [<a href="https://arxiv.org/pdf/2410.16732">pdf</a>, <a href="https://arxiv.org/format/2410.16732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Polyp-E: Benchmarking the Robustness of Deep Segmentation Models via Polyp Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+R">Runpu Wei</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zijin Yin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kongming Liang</a>, <a href="/search/cs?searchtype=author&query=Min%2C+M">Min Min</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+C">Chengwei Pan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+G">Gang Yu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haonan Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhanyu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16732v1-abstract-short" style="display: inline;"> Automatic polyp segmentation is helpful to assist clinical diagnosis and treatment. In daily clinical practice, clinicians exhibit robustness in identifying polyps with both location and size variations. It is uncertain if deep segmentation models can achieve comparable robustness in automated colonoscopic analysis. To benchmark the model robustness, we focus on evaluating the robustness of segmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16732v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16732v1-abstract-full" style="display: none;"> Automatic polyp segmentation is helpful to assist clinical diagnosis and treatment. In daily clinical practice, clinicians exhibit robustness in identifying polyps with both location and size variations. It is uncertain if deep segmentation models can achieve comparable robustness in automated colonoscopic analysis. To benchmark the model robustness, we focus on evaluating the robustness of segmentation models on the polyps with various attributes (e.g. location and size) and healthy samples. Based on the Latent Diffusion Model, we perform attribute editing on real polyps and build a new dataset named Polyp-E. Our synthetic dataset boasts exceptional realism, to the extent that clinical experts find it challenging to discern them from real data. We evaluate several existing polyp segmentation models on the proposed benchmark. The results reveal most of the models are highly sensitive to attribute variations. As a novel data augmentation technique, the proposed editing pipeline can improve both in-distribution and out-of-distribution generalization ability. The code and datasets will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16732v1-abstract-full').style.display = 'none'; document.getElementById('2410.16732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16663">arXiv:2410.16663</a> <span> [<a href="https://arxiv.org/pdf/2410.16663">pdf</a>, <a href="https://arxiv.org/format/2410.16663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FastAttention: Extend FlashAttention2 to NPUs and Low-resource GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haoran Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xianzhi Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kang Zhao</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+L">Lu Hou</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Zongyuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Kamenev%2C+S">Stanislav Kamenev</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Han Bao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Ting Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Q">Qixin Chang</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+S">Siyue Sui</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Weihao Sun</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiaxin Hu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jun Yao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zekun Yin</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Cheng Qian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yinfei Pan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiguo Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16663v1-abstract-short" style="display: inline;"> FlashAttention series has been widely applied in the inference of large language models (LLMs). However, FlashAttention series only supports the high-level GPU architectures, e.g., Ampere and Hopper. At present, FlashAttention series is not easily transferrable to NPUs and low-resource GPUs. Moreover, FlashAttention series is inefficient for multi- NPUs or GPUs inference scenarios. In this work, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16663v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16663v1-abstract-full" style="display: none;"> FlashAttention series has been widely applied in the inference of large language models (LLMs). However, FlashAttention series only supports the high-level GPU architectures, e.g., Ampere and Hopper. At present, FlashAttention series is not easily transferrable to NPUs and low-resource GPUs. Moreover, FlashAttention series is inefficient for multi- NPUs or GPUs inference scenarios. In this work, we propose FastAttention which pioneers the adaptation of FlashAttention series for NPUs and low-resource GPUs to boost LLM inference efficiency. Specifically, we take Ascend NPUs and Volta-based GPUs as representatives for designing our FastAttention. We migrate FlashAttention series to Ascend NPUs by proposing a novel two-level tiling strategy for runtime speedup, tiling-mask strategy for memory saving and the tiling-AllReduce strategy for reducing communication overhead, respectively. Besides, we adapt FlashAttention for Volta-based GPUs by redesigning the operands layout in shared memory and introducing a simple yet effective CPU-GPU cooperative strategy for efficient memory utilization. On Ascend NPUs, our FastAttention can achieve a 10.7$\times$ speedup compared to the standard attention implementation. Llama-7B within FastAttention reaches up to 5.16$\times$ higher throughput than within the standard attention. On Volta architecture GPUs, FastAttention yields 1.43$\times$ speedup compared to its equivalents in \texttt{xformers}. Pangu-38B within FastAttention brings 1.46$\times$ end-to-end speedup using FasterTransformer. Coupled with the propose CPU-GPU cooperative strategy, FastAttention supports a maximal input length of 256K on 8 V100 GPUs. All the codes will be made available soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16663v1-abstract-full').style.display = 'none'; document.getElementById('2410.16663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16430">arXiv:2410.16430</a> <span> [<a href="https://arxiv.org/pdf/2410.16430">pdf</a>, <a href="https://arxiv.org/format/2410.16430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HaHeAE: Learning Generalisable Joint Representations of Human Hand and Head Movements in Extended Reality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhiming Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guanhua Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zheming Yin</a>, <a href="/search/cs?searchtype=author&query=Haeufle%2C+D">Daniel Haeufle</a>, <a href="/search/cs?searchtype=author&query=Schmitt%2C+S">Syn Schmitt</a>, <a href="/search/cs?searchtype=author&query=Bulling%2C+A">Andreas Bulling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16430v1-abstract-short" style="display: inline;"> Human hand and head movements are the most pervasive input modalities in extended reality (XR) and are significant for a wide range of applications. However, prior works on hand and head modelling in XR only explored a single modality or focused on specific applications. We present HaHeAE - a novel self-supervised method for learning generalisable joint representations of hand and head movements i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16430v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16430v1-abstract-full" style="display: none;"> Human hand and head movements are the most pervasive input modalities in extended reality (XR) and are significant for a wide range of applications. However, prior works on hand and head modelling in XR only explored a single modality or focused on specific applications. We present HaHeAE - a novel self-supervised method for learning generalisable joint representations of hand and head movements in XR. At the core of our method is an autoencoder (AE) that uses a graph convolutional network-based semantic encoder and a diffusion-based stochastic encoder to learn the joint semantic and stochastic representations of hand-head movements. It also features a diffusion-based decoder to reconstruct the original signals. Through extensive evaluations on three public XR datasets, we show that our method 1) significantly outperforms commonly used self-supervised methods by up to 74.0% in terms of reconstruction quality and is generalisable across users, activities, and XR environments, 2) enables new applications, including interpretable hand-head cluster identification and variable hand-head movement generation, and 3) can serve as an effective feature extractor for downstream tasks. Together, these results demonstrate the effectiveness of our method and underline the potential of self-supervised methods for jointly modelling hand-head behaviours in extended reality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16430v1-abstract-full').style.display = 'none'; document.getElementById('2410.16430v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14252">arXiv:2410.14252</a> <span> [<a href="https://arxiv.org/pdf/2410.14252">pdf</a>, <a href="https://arxiv.org/format/2410.14252">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Harmony: A Home Agent for Responsive Management and Action Optimization with a Locally Deployed Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziqi Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingxin Zhang</a>, <a href="/search/cs?searchtype=author&query=Kawahara%2C+D">Daisuke Kawahara</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14252v1-abstract-short" style="display: inline;"> Since the launch of GPT-3.5, intelligent home assistant technology based on large language models (LLMs) has made significant progress. These intelligent home assistant frameworks, such as those based on high-performance LLMs like GPT-4, have greatly expanded their functional range and application scenarios by computing on the cloud, enriching user experience and diversification. In order to optim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14252v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14252v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14252v1-abstract-full" style="display: none;"> Since the launch of GPT-3.5, intelligent home assistant technology based on large language models (LLMs) has made significant progress. These intelligent home assistant frameworks, such as those based on high-performance LLMs like GPT-4, have greatly expanded their functional range and application scenarios by computing on the cloud, enriching user experience and diversification. In order to optimize the privacy and economy of data processing while maintaining the powerful functions of LLMs, we propose Harmony, a smart home assistant framework that uses a locally deployable small-scale LLM. Based on Llama3-8b, an open LLM that can be easily deployed on a consumer-grade PC, Harmony does not send any data to the internet during operation, ensuring local computation and privacy secured. Harmony based on Llama3-8b achieved competitive performance on our benchmark tests with the framework used in related work with GPT-4. In addition to solving the issues mentioned above, Harmony can also take actions according to the user and home status, even if the user does not issue a command. For example, when the user wants to wake up later than normal on the weekend, Harmony would open the curtains only when the user gets up or prepare the room when the user comes home without requiring user commands. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14252v1-abstract-full').style.display = 'none'; document.getElementById('2410.14252v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10398">arXiv:2410.10398</a> <span> [<a href="https://arxiv.org/pdf/2410.10398">pdf</a>, <a href="https://arxiv.org/format/2410.10398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FairMindSim: Alignment of Behavior, Emotion, and Belief in Humans and LLM Agents Amid Ethical Dilemmas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+Y">Yu Lei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chengxing Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songjia Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhiyu Yin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Canyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhen Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10398v2-abstract-short" style="display: inline;"> AI alignment is a pivotal issue concerning AI control and safety. It should consider not only value-neutral human preferences but also moral and ethical considerations. In this study, we introduced FairMindSim, which simulates the moral dilemma through a series of unfair scenarios. We used LLM agents to simulate human behavior, ensuring alignment across various stages. To explore the various socio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10398v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10398v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10398v2-abstract-full" style="display: none;"> AI alignment is a pivotal issue concerning AI control and safety. It should consider not only value-neutral human preferences but also moral and ethical considerations. In this study, we introduced FairMindSim, which simulates the moral dilemma through a series of unfair scenarios. We used LLM agents to simulate human behavior, ensuring alignment across various stages. To explore the various socioeconomic motivations, which we refer to as beliefs, that drive both humans and LLM agents as bystanders to intervene in unjust situations involving others, and how these beliefs interact to influence individual behavior, we incorporated knowledge from relevant sociological fields and proposed the Belief-Reward Alignment Behavior Evolution Model (BREM) based on the recursive reward model (RRM). Our findings indicate that, behaviorally, GPT-4o exhibits a stronger sense of social justice, while humans display a richer range of emotions. Additionally, we discussed the potential impact of emotions on behavior. This study provides a theoretical foundation for applications in aligning LLMs with altruistic values. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10398v2-abstract-full').style.display = 'none'; document.getElementById('2410.10398v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09467">arXiv:2410.09467</a> <span> [<a href="https://arxiv.org/pdf/2410.09467">pdf</a>, <a href="https://arxiv.org/format/2410.09467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Single Image to 3D Generation using Gaussian Splatting and Hybrid Diffusion Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Basak%2C+H">Hritam Basak</a>, <a href="/search/cs?searchtype=author&query=Tabatabaee%2C+H">Hadi Tabatabaee</a>, <a href="/search/cs?searchtype=author&query=Gayaka%2C+S">Shreekant Gayaka</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming-Feng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+C">Cheng-Hao Kuo</a>, <a href="/search/cs?searchtype=author&query=Sen%2C+A">Arnie Sen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhaozheng Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09467v2-abstract-short" style="display: inline;"> 3D object generation from a single image involves estimating the full 3D geometry and texture of unseen views from an unposed RGB image captured in the wild. Accurately reconstructing an object's complete 3D structure and texture has numerous applications in real-world scenarios, including robotic manipulation, grasping, 3D scene understanding, and AR/VR. Recent advancements in 3D object generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09467v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09467v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09467v2-abstract-full" style="display: none;"> 3D object generation from a single image involves estimating the full 3D geometry and texture of unseen views from an unposed RGB image captured in the wild. Accurately reconstructing an object's complete 3D structure and texture has numerous applications in real-world scenarios, including robotic manipulation, grasping, 3D scene understanding, and AR/VR. Recent advancements in 3D object generation have introduced techniques that reconstruct an object's 3D shape and texture by optimizing the efficient representation of Gaussian Splatting, guided by pre-trained 2D or 3D diffusion models. However, a notable disparity exists between the training datasets of these models, leading to distinct differences in their outputs. While 2D models generate highly detailed visuals, they lack cross-view consistency in geometry and texture. In contrast, 3D models ensure consistency across different views but often result in overly smooth textures. We propose bridging the gap between 2D and 3D diffusion models to address this limitation by integrating a two-stage frequency-based distillation loss with Gaussian Splatting. Specifically, we leverage geometric priors in the low-frequency spectrum from a 3D diffusion model to maintain consistent geometry and use a 2D diffusion model to refine the fidelity and texture in the high-frequency spectrum of the generated 3D structure, resulting in more detailed and fine-grained outcomes. Our approach enhances geometric consistency and visual quality, outperforming the current SOTA. Additionally, we demonstrate the easy adaptability of our method for efficient object pose estimation and tracking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09467v2-abstract-full').style.display = 'none'; document.getElementById('2410.09467v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09403">arXiv:2410.09403</a> <span> [<a href="https://arxiv.org/pdf/2410.09403">pdf</a>, <a href="https://arxiv.org/format/2410.09403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Two Heads Are Better Than One: A Multi-Agent System Has the Potential to Improve Scientific Idea Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Haoyang Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Renqi Chen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xinzhe Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingzhe Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+N">Nanqing Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09403v1-abstract-short" style="display: inline;"> The rapid advancement of scientific progress requires innovative tools that can accelerate discovery. While recent AI methods, particularly large language models (LLMs), have shown promise in tasks such as hypothesis generation and experimental design, they fall short in replicating the collaborative nature of real-world scientific practices, where diverse teams of experts work together to tackle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09403v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09403v1-abstract-full" style="display: none;"> The rapid advancement of scientific progress requires innovative tools that can accelerate discovery. While recent AI methods, particularly large language models (LLMs), have shown promise in tasks such as hypothesis generation and experimental design, they fall short in replicating the collaborative nature of real-world scientific practices, where diverse teams of experts work together to tackle complex problems. To address the limitation, we propose an LLM-based multi-agent system, i.e., Virtual Scientists (VirSci), designed to mimic the teamwork inherent in scientific research. VirSci organizes a team of agents to collaboratively generate, evaluate, and refine research ideas. Through comprehensive experiments, we demonstrate that this multi-agent approach outperforms the state-of-the-art method in producing novel and impactful scientific ideas, showing potential in aligning with key insights in the Science of Science field. Our findings suggest that integrating collaborative agents can lead to more innovative scientific outputs, offering a robust system for autonomous scientific discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09403v1-abstract-full').style.display = 'none'; document.getElementById('2410.09403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03977">arXiv:2410.03977</a> <span> [<a href="https://arxiv.org/pdf/2410.03977">pdf</a>, <a href="https://arxiv.org/format/2410.03977">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning to Balance: Diverse Normalization for Cloth-Changing Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongjun Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhengwei Yin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xuan Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yinqiang Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03977v2-abstract-short" style="display: inline;"> Cloth-Changing Person Re-Identification (CC-ReID) involves recognizing individuals in images regardless of clothing status. In this paper, we empirically and experimentally demonstrate that completely eliminating or fully retaining clothing features is detrimental to the task. Existing work, either relying on clothing labels, silhouettes, or other auxiliary data, fundamentally aim to balance the l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03977v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03977v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03977v2-abstract-full" style="display: none;"> Cloth-Changing Person Re-Identification (CC-ReID) involves recognizing individuals in images regardless of clothing status. In this paper, we empirically and experimentally demonstrate that completely eliminating or fully retaining clothing features is detrimental to the task. Existing work, either relying on clothing labels, silhouettes, or other auxiliary data, fundamentally aim to balance the learning of clothing and identity features. However, we practically find that achieving this balance is challenging and nuanced. In this study, we introduce a novel module called Diverse Norm, which expands personal features into orthogonal spaces and employs channel attention to separate clothing and identity features. A sample re-weighting optimization strategy is also introduced to guarantee the opposite optimization direction. Diverse Norm presents a simple yet effective approach that does not require additional data. Furthermore, Diverse Norm can be seamlessly integrated ResNet50 and significantly outperforms the state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03977v2-abstract-full').style.display = 'none'; document.getElementById('2410.03977v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03525">arXiv:2410.03525</a> <span> [<a href="https://arxiv.org/pdf/2410.03525">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Artificial Human Lecturers: Initial Findings From Asia's First AI Lecturers in Class to Promote Innovation in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pang%2C+C+C">Ching Christie Pang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yawei Zhao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhizhuo Yin</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jia Sun</a>, <a href="/search/cs?searchtype=author&query=Mogavi%2C+R+H">Reza Hadi Mogavi</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+P">Pan Hui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03525v1-abstract-short" style="display: inline;"> In recent years, artificial intelligence (AI) has become increasingly integrated into education, reshaping traditional learning environments. Despite this, there has been limited investigation into fully operational artificial human lecturers. To the best of our knowledge, our paper presents the world's first study examining their deployment in a real-world educational setting. Specifically, we in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03525v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03525v1-abstract-full" style="display: none;"> In recent years, artificial intelligence (AI) has become increasingly integrated into education, reshaping traditional learning environments. Despite this, there has been limited investigation into fully operational artificial human lecturers. To the best of our knowledge, our paper presents the world's first study examining their deployment in a real-world educational setting. Specifically, we investigate the use of "digital teachers," AI-powered virtual lecturers, in a postgraduate course at the Hong Kong University of Science and Technology (HKUST). Our study explores how features such as appearance, non-verbal cues, voice, and verbal expression impact students' learning experiences. Findings suggest that students highly value naturalness, authenticity, and interactivity in digital teachers, highlighting areas for improvement, such as increased responsiveness, personalized avatars, and integration with larger learning platforms. We conclude that digital teachers have significant potential to enhance education by providing a more flexible, engaging, personalized, and accessible learning experience for students. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03525v1-abstract-full').style.display = 'none'; document.getElementById('2410.03525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 6 figures (10 sub-figures), 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02787">arXiv:2410.02787</a> <span> [<a href="https://arxiv.org/pdf/2410.02787">pdf</a>, <a href="https://arxiv.org/format/2410.02787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Navigation with VLM framework: Go to Any Language </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zecheng Yin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+C">Chonghao Cheng</a>, <a href="/search/cs?searchtype=author&query=Lizhen"> Lizhen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02787v1-abstract-short" style="display: inline;"> Navigating towards fully open language goals and exploring open scenes in a manner akin to human exploration have always posed significant challenges. Recently, Vision Large Language Models (VLMs) have demonstrated remarkable capabilities in reasoning with both language and visual data. While many works have focused on leveraging VLMs for navigation in open scenes and with open vocabularies, these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02787v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02787v1-abstract-full" style="display: none;"> Navigating towards fully open language goals and exploring open scenes in a manner akin to human exploration have always posed significant challenges. Recently, Vision Large Language Models (VLMs) have demonstrated remarkable capabilities in reasoning with both language and visual data. While many works have focused on leveraging VLMs for navigation in open scenes and with open vocabularies, these efforts often fall short of fully utilizing the potential of VLMs or require substantial computational resources. We introduce Navigation with VLM (NavVLM), a framework that harnesses equipment-level VLMs to enable agents to navigate towards any language goal specific or non-specific in open scenes, emulating human exploration behaviors without any prior training. The agent leverages the VLM as its cognitive core to perceive environmental information based on any language goal and constantly provides exploration guidance during navigation until it reaches the target location or area. Our framework not only achieves state-of-the-art performance in Success Rate (SR) and Success weighted by Path Length (SPL) in traditional specific goal settings but also extends the navigation capabilities to any open-set language goal. We evaluate NavVLM in richly detailed environments from the Matterport 3D (MP3D), Habitat Matterport 3D (HM3D), and Gibson datasets within the Habitat simulator. With the power of VLMs, navigation has entered a new era. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02787v1-abstract-full').style.display = 'none'; document.getElementById('2410.02787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01289">arXiv:2410.01289</a> <span> [<a href="https://arxiv.org/pdf/2410.01289">pdf</a>, <a href="https://arxiv.org/format/2410.01289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> The Unlikely Hero: Nonideality in Analog Photonic Neural Networks as Built-in Defender Against Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haotian Lu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Bhoumik%2C+P">Partho Bhoumik</a>, <a href="/search/cs?searchtype=author&query=Banerjee%2C+S">Sanmitra Banerjee</a>, <a href="/search/cs?searchtype=author&query=Chakrabarty%2C+K">Krishnendu Chakrabarty</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01289v1-abstract-short" style="display: inline;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01289v1-abstract-full" style="display: none;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware variations can be mitigated with robustness-driven optimization methods, malicious attacks on the hardware show distinct behaviors from noises, which requires a customized protection method tailored to optical analog hardware. In this work, we rethink the role of conventionally undesired non-idealities in photonic analog accelerators and claim their surprising effects on defending against adversarial weight attacks. Inspired by the protection effects from DNN quantization and pruning, we propose a synergistic defense framework tailored for optical analog hardware that proactively protects sensitive weights via pre-attack unary weight encoding and post-attack vulnerability-aware weight locking. Efficiency-reliability trade-offs are formulated as constrained optimization problems and efficiently solved offline without model re-training costs. Extensive evaluation of various DNN benchmarks with a multi-core photonic accelerator shows that our framework maintains near-ideal on-chip inference accuracy under adversarial bit-flip attacks with merely <3% memory overhead. Our codes are open-sourced at https://github.com/ScopeX-ASU/Unlikely_Hero. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'none'; document.getElementById('2410.01289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. Accepted to ACM/IEEE ASP-DAC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01218">arXiv:2410.01218</a> <span> [<a href="https://arxiv.org/pdf/2410.01218">pdf</a>, <a href="https://arxiv.org/format/2410.01218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> An uncertainty-aware Digital Shadow for underground multimodal CO2 storage monitoring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gahlot%2C+A+P">Abhinav Prakash Gahlot</a>, <a href="/search/cs?searchtype=author&query=Orozco%2C+R">Rafael Orozco</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziyi Yin</a>, <a href="/search/cs?searchtype=author&query=Herrmann%2C+F+J">Felix J. Herrmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01218v1-abstract-short" style="display: inline;"> Geological Carbon Storage GCS is arguably the only scalable net-negative CO2 emission technology available While promising subsurface complexities and heterogeneity of reservoir properties demand a systematic approach to quantify uncertainty when optimizing production and mitigating storage risks which include assurances of Containment and Conformance of injected supercritical CO2 As a first step… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01218v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01218v1-abstract-full" style="display: none;"> Geological Carbon Storage GCS is arguably the only scalable net-negative CO2 emission technology available While promising subsurface complexities and heterogeneity of reservoir properties demand a systematic approach to quantify uncertainty when optimizing production and mitigating storage risks which include assurances of Containment and Conformance of injected supercritical CO2 As a first step towards the design and implementation of a Digital Twin for monitoring underground storage operations a machine learning based data-assimilation framework is introduced and validated on carefully designed realistic numerical simulations As our implementation is based on Bayesian inference but does not yet support control and decision-making we coin our approach an uncertainty-aware Digital Shadow To characterize the posterior distribution for the state of CO2 plumes conditioned on multi-modal time-lapse data the envisioned Shadow combines techniques from Simulation-Based Inference SBI and Ensemble Bayesian Filtering to establish probabilistic baselines and assimilate multi-modal data for GCS problems that are challenged by large degrees of freedom nonlinear multi-physics non-Gaussianity and computationally expensive to evaluate fluid flow and seismic simulations To enable SBI for dynamic systems a recursive scheme is proposed where the Digital Shadows neural networks are trained on simulated ensembles for their state and observed data well and/or seismic Once training is completed the systems state is inferred when time-lapse field data becomes available In this computational study we observe that a lack of knowledge on the permeability field can be factored into the Digital Shadows uncertainty quantification To our knowledge this work represents the first proof of concept of an uncertainty-aware in-principle scalable Digital Shadow. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01218v1-abstract-full').style.display = 'none'; document.getElementById('2410.01218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15822">arXiv:2409.15822</a> <span> [<a href="https://arxiv.org/pdf/2409.15822">pdf</a>, <a href="https://arxiv.org/format/2409.15822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Ducted Fan UAV for Safe Aerial Grabbing and Transfer of Multiple Loads Using Electromagnets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhong Yin</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+H">Hailong Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15822v1-abstract-short" style="display: inline;"> In recent years, research on aerial grasping, manipulation, and transportation of objects has garnered significant attention. These tasks often require UAVs to operate safely close to environments or objects and to efficiently grasp payloads. However, current widely adopted flying platforms pose safety hazards: unprotected high-speed rotating propellers can cause harm to the surroundings. Addition… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15822v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15822v1-abstract-full" style="display: none;"> In recent years, research on aerial grasping, manipulation, and transportation of objects has garnered significant attention. These tasks often require UAVs to operate safely close to environments or objects and to efficiently grasp payloads. However, current widely adopted flying platforms pose safety hazards: unprotected high-speed rotating propellers can cause harm to the surroundings. Additionally, the space for carrying payloads on the fuselage is limited, and the restricted position of the payload also hinders efficient grasping. To address these issues, this paper presents a coaxial ducted fan UAV which is equipped with electromagnets mounted externally on the fuselage, enabling safe grasping and transfer of multiple loads in midair without complex additional actuators. It also has the capability to achieve direct human-UAV cargo transfer in the air. The forces acting on the loads during magnetic attachment and their influencing factors were analyzed. An ADRC controller is utilized to counteract disturbances during grasping and achieve attitude control. Finally, flight tests are conducted to verify the UAV's ability to directly grasp multiple loads from human hands in flight while maintaining attitude tracking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15822v1-abstract-full').style.display = 'none'; document.getElementById('2409.15822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8pages, 13figures,accepted by IROS2024 This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10570">arXiv:2409.10570</a> <span> [<a href="https://arxiv.org/pdf/2409.10570">pdf</a>, <a href="https://arxiv.org/format/2409.10570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Protecting Copyright of Medical Pre-trained Language Models: Training-Free Backdoor Watermarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+C">Cong Kong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rui Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weixi Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhaoxia Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10570v1-abstract-short" style="display: inline;"> Pre-training language models followed by fine-tuning on specific tasks is standard in NLP, but traditional models often underperform when applied to the medical domain, leading to the development of specialized medical pre-trained language models (Med-PLMs). These models are valuable assets but are vulnerable to misuse and theft, requiring copyright protection. However, no existing watermarking me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10570v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10570v1-abstract-full" style="display: none;"> Pre-training language models followed by fine-tuning on specific tasks is standard in NLP, but traditional models often underperform when applied to the medical domain, leading to the development of specialized medical pre-trained language models (Med-PLMs). These models are valuable assets but are vulnerable to misuse and theft, requiring copyright protection. However, no existing watermarking methods are tailored for Med-PLMs, and adapting general PLMs watermarking techniques to the medical domain faces challenges such as task incompatibility, loss of fidelity, and inefficiency. To address these issues, we propose the first training-free backdoor watermarking method for Med-PLMs. Our method uses rare special symbols as trigger words, which do not impact downstream task performance, embedding watermarks by replacing their original embeddings with those of specific medical terms in the Med-PLMs' word embeddings layer. After fine-tuning the watermarked Med-PLMs on various medical downstream tasks, the final models (FMs) respond to the trigger words in the same way they would to the corresponding medical terms. This property can be utilized to extract the watermark. Experiments demonstrate that our method achieves high fidelity while effectively extracting watermarks across various medical downstream tasks. Additionally, our method demonstrates robustness against various attacks and significantly enhances the efficiency of watermark embedding, reducing the embedding time from 10 hours to 10 seconds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10570v1-abstract-full').style.display = 'none'; document.getElementById('2409.10570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10561">arXiv:2409.10561</a> <span> [<a href="https://arxiv.org/pdf/2409.10561">pdf</a>, <a href="https://arxiv.org/format/2409.10561">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DrLLM: Prompt-Enhanced Distributed Denial-of-Service Resistance Method with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenyu Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shang Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guangyuan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10561v3-abstract-short" style="display: inline;"> The increasing number of Distributed Denial of Service (DDoS) attacks poses a major threat to the Internet, highlighting the importance of DDoS mitigation. Most existing approaches require complex training methods to learn data features, which increases the complexity and generality of the application. In this paper, we propose DrLLM, which aims to mine anomalous traffic information in zero-shot s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10561v3-abstract-full').style.display = 'inline'; document.getElementById('2409.10561v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10561v3-abstract-full" style="display: none;"> The increasing number of Distributed Denial of Service (DDoS) attacks poses a major threat to the Internet, highlighting the importance of DDoS mitigation. Most existing approaches require complex training methods to learn data features, which increases the complexity and generality of the application. In this paper, we propose DrLLM, which aims to mine anomalous traffic information in zero-shot scenarios through Large Language Models (LLMs). To bridge the gap between DrLLM and existing approaches, we embed the global and local information of the traffic data into the reasoning paradigm and design three modules, namely Knowledge Embedding, Token Embedding, and Progressive Role Reasoning, for data representation and reasoning. In addition we explore the generalization of prompt engineering in the cybersecurity domain to improve the classification capability of DrLLM. Our ablation experiments demonstrate the applicability of DrLLM in zero-shot scenarios and further demonstrate the potential of LLMs in the network domains. DrLLM implementation code has been open-sourced at https://github.com/liuup/DrLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10561v3-abstract-full').style.display = 'none'; document.getElementById('2409.10561v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01216">arXiv:2409.01216</a> <span> [<a href="https://arxiv.org/pdf/2409.01216">pdf</a>, <a href="https://arxiv.org/format/2409.01216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.24963/ijcai.2024/131">10.24963/ijcai.2024/131 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> ESP-PCT: Enhanced VR Semantic Performance through Efficient Compression of Temporal and Spatial Redundancies in Point Cloud Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mei%2C+L">Luoyu Mei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yun Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhimeng Yin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenchao Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+W">Wei Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01216v1-abstract-short" style="display: inline;"> Semantic recognition is pivotal in virtual reality (VR) applications, enabling immersive and interactive experiences. A promising approach is utilizing millimeter-wave (mmWave) signals to generate point clouds. However, the high computational and memory demands of current mmWave point cloud models hinder their efficiency and reliability. To address this limitation, our paper introduces ESP-PCT, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01216v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01216v1-abstract-full" style="display: none;"> Semantic recognition is pivotal in virtual reality (VR) applications, enabling immersive and interactive experiences. A promising approach is utilizing millimeter-wave (mmWave) signals to generate point clouds. However, the high computational and memory demands of current mmWave point cloud models hinder their efficiency and reliability. To address this limitation, our paper introduces ESP-PCT, a novel Enhanced Semantic Performance Point Cloud Transformer with a two-stage semantic recognition framework tailored for VR applications. ESP-PCT takes advantage of the accuracy of sensory point cloud data and optimizes the semantic recognition process, where the localization and focus stages are trained jointly in an end-to-end manner. We evaluate ESP-PCT on various VR semantic recognition conditions, demonstrating substantial enhancements in recognition efficiency. Notably, ESP-PCT achieves a remarkable accuracy of 93.2% while reducing the computational requirements (FLOPs) by 76.9% and memory usage by 78.2% compared to the existing Point Transformer model simultaneously. These underscore ESP-PCT's potential in VR semantic recognition by achieving high accuracy and reducing redundancy. The code and data of this project are available at \url{https://github.com/lymei-SEU/ESP-PCT}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01216v1-abstract-full').style.display = 'none'; document.getElementById('2409.01216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI 2024 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+Z&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository