Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 375 results for author: <span class="mathjax">Ren, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Ren%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ren, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ren%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ren, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ren%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.19319">arXiv:2501.19319</a> <span> [<a href="https://arxiv.org/pdf/2501.19319">pdf</a>, <a href="https://arxiv.org/format/2501.19319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Advancing Dense Endoscopic Reconstruction with Gaussian Splatting-driven Surface Normal-aware Tracking and Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiming Huang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Beilei Cui</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinlin Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.19319v1-abstract-short" style="display: inline;"> Simultaneous Localization and Mapping (SLAM) is essential for precise surgical interventions and robotic tasks in minimally invasive procedures. 
Abstract: Simultaneous Localization and Mapping (SLAM) is essential for precise surgical interventions and robotic tasks in minimally invasive procedures. While recent advancements in 3D Gaussian Splatting (3DGS) have improved SLAM with high-quality novel view synthesis and fast rendering, these systems struggle with accurate depth and surface reconstruction due to multi-view inconsistencies. Simply incorporating SLAM and 3DGS leads to mismatches between the reconstructed frames. In this work, we present Endo-2DTAM, a real-time endoscopic SLAM system with 2D Gaussian Splatting (2DGS) to address these challenges. Endo-2DTAM incorporates a surface normal-aware pipeline, which consists of tracking, mapping, and bundle adjustment modules for geometrically accurate reconstruction. Our robust tracking module combines point-to-point and point-to-plane distance metrics, while the mapping module utilizes normal consistency and depth distortion to enhance surface reconstruction quality. We also introduce a pose-consistent strategy for efficient and geometrically coherent keyframe sampling. Extensive experiments on public endoscopic datasets demonstrate that Endo-2DTAM achieves an RMSE of $1.87\pm 0.63$ mm for depth reconstruction of surgical scenes while maintaining computationally efficient tracking, high-quality visual appearance, and real-time rendering. Our code will be released at github.com/lastbasket/Endo-2DTAM.
Submitted 31 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15808">arXiv:2501.15808</a> <span> [<a href="https://arxiv.org/pdf/2501.15808">pdf</a>, <a href="https://arxiv.org/format/2501.15808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ClearSight: Human Vision-Inspired Solutions for Event-Based Motion Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiaopeng Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yulong Huang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongwei Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zunchang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yue Zhou</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Haotian Fu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bojun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15808v1-abstract-short" style="display: inline;"> Motion deblurring addresses the challenge of image blur caused by camera or scene movement. Event cameras provide motion information that is encoded in the asynchronous event streams. To efficiently leverage the temporal information of event streams, we employ Spiking Neural Networks (SNNs) for motion feature extraction and Artificial Neural Networks (ANNs) for color information processing. Due to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15808v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15808v1-abstract-full" style="display: none;"> Motion deblurring addresses the challenge of image blur caused by camera or scene movement. Event cameras provide motion information that is encoded in the asynchronous event streams. To efficiently leverage the temporal information of event streams, we employ Spiking Neural Networks (SNNs) for motion feature extraction and Artificial Neural Networks (ANNs) for color information processing. Due to the non-uniform distribution and inherent redundancy of event data, existing cross-modal feature fusion methods exhibit certain limitations. Inspired by the visual attention mechanism in the human visual system, this study introduces a bioinspired dual-drive hybrid network (BDHNet). Specifically, the Neuron Configurator Module (NCM) is designed to dynamically adjusts neuron configurations based on cross-modal features, thereby focusing the spikes in blurry regions and adapting to varying blurry scenarios dynamically. Additionally, the Region of Blurry Attention Module (RBAM) is introduced to generate a blurry mask in an unsupervised manner, effectively extracting motion clues from the event features and guiding more accurate cross-modal feature fusion. 
Extensive subjective and objective evaluations demonstrate that our method outperforms current state-of-the-art methods on both synthetic and real-world datasets.
Submitted 27 January, 2025; originally announced January 2025.
Comments: 11 pages, 8 figures

3. arXiv:2501.11577 [pdf, other] cs.CR cs.LG
Rethinking Membership Inference Attacks Against Transfer Learning
Authors: Cong Wu, Jing Chen, Qianru Fang, Kun He, Ziming Zhao, Hao Ren, Guowen Xu, Yang Liu, Yang Xiang
Abstract: Transfer learning, successful in knowledge translation across related tasks, faces a substantial privacy threat from membership inference attacks (MIAs). These attacks, despite posing significant risk to ML model's training data, remain limited-explored in transfer learning.
The interaction between teacher and student models in transfer learning has not been thoroughly explored in MIAs, potentially resulting in an under-examined aspect of privacy vulnerabilities within transfer learning. In this paper, we propose a new MIA vector against transfer learning, to determine whether a specific data point was used to train the teacher model while only accessing the student model in a white-box setting. Our method delves into the intricate relationship between teacher and student models, analyzing the discrepancies in hidden layer representations between the student model and its shadow counterpart. These identified differences are then adeptly utilized to refine the shadow model's training process and to inform membership inference decisions effectively. Our method, evaluated across four datasets in diverse transfer learning tasks, reveals that even when an attacker only has access to the student model, the teacher model's training data remains susceptible to MIAs. We believe our work unveils the unexplored risk of membership inference in transfer learning.
Submitted 20 January, 2025; originally announced January 2025.

4. arXiv:2501.11360 [pdf, other] cs.LG cs.AI
Federated Learning with Sample-level Client Drift Mitigation
Authors: Haoran Xu, Jiaze Li, Wanyi Wu, Hao Ren
Abstract: Federated Learning (FL) suffers from severe performance degradation due to the data heterogeneity among clients. Existing works reveal that the fundamental reason is that data heterogeneity can cause client drift where the local model update deviates from the global one, and thus they usually tackle this problem from the perspective of calibrating the obtained local update. Despite effectiveness, existing methods substantially lack a deep understanding of how heterogeneous data samples contribute to the formation of client drift. In this paper, we bridge this gap by identifying that the drift can be viewed as a cumulative manifestation of biases present in all local samples and the bias between samples is different. Besides, the bias dynamically changes as the FL training progresses. Motivated by this, we propose FedBSS that first mitigates the heterogeneity issue in a sample-level manner, orthogonal to existing methods. Specifically, the core idea of our method is to adopt a bias-aware sample selection scheme that dynamically selects the samples from small biases to large epoch by epoch to train progressively the local model in each round. In order to ensure the stability of training, we set the diversified knowledge acquisition stage as the warm-up stage to avoid the local optimality caused by knowledge deviation in the early stage of the model. Evaluation results show that FedBSS outperforms state-of-the-art baselines. In addition, we also achieved effective results on feature distribution skew and noise label dataset setting, which proves that FedBSS can not only reduce heterogeneity, but also has scalability and robustness.
Submitted 20 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11347">arXiv:2501.11347</a> <span> [<a href="https://arxiv.org/pdf/2501.11347">pdf</a>, <a href="https://arxiv.org/format/2501.11347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EndoChat: Grounded Multimodal Large Language Model for Endoscopic Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guankun Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyi Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+K">Kun Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+T">Tianxu Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiting He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinlin Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhen Lei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiazheng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Padoy%2C+N">Nicolas Padoy</a>, <a href="/search/cs?searchtype=author&query=Navab%2C+N">Nassir Navab</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11347v1-abstract-short" style="display: inline;"> Recently, Multimodal Large Language Models (MLLMs) have demonstrated their immense potential in computer-aided diagnosis and decision-making. In the context of robotic-assisted surgery, MLLMs can serve as effective tools for surgical training and guidance. However, there is still a lack of MLLMs specialized for surgical scene understanding in clinical applications. In this work, we introduce EndoC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11347v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11347v1-abstract-full" style="display: none;"> Recently, Multimodal Large Language Models (MLLMs) have demonstrated their immense potential in computer-aided diagnosis and decision-making. In the context of robotic-assisted surgery, MLLMs can serve as effective tools for surgical training and guidance. However, there is still a lack of MLLMs specialized for surgical scene understanding in clinical applications. In this work, we introduce EndoChat to address various dialogue paradigms and subtasks in surgical scene understanding that surgeons encounter. 
To train our EndoChat, we construct the Surg-396K dataset through a novel pipeline that systematically extracts surgical information and generates structured annotations based on collected large-scale endoscopic surgery datasets. Furthermore, we introduce a multi-scale visual token interaction mechanism and a visual contrast-based reasoning mechanism to enhance the model's representation learning and reasoning capabilities. Our model achieves state-of-the-art performance across five dialogue paradigms and eight surgical scene understanding tasks. Additionally, we conduct evaluations with professional surgeons, most of whom provide positive feedback on collaborating with EndoChat. Overall, these results demonstrate that our EndoChat has great potential to significantly advance training and automation in robotic-assisted surgery.
Submitted 20 January, 2025; originally announced January 2025.

6. arXiv:2501.04993 [pdf, other] cs.OS
ByteFS: System Support for (CXL-based) Memory-Semantic Solid-State Drives
Authors: Shaobo Li, Yirui Eric Zhou, Hao Ren, Jian Huang
Abstract: Unlike non-volatile memory that resides on the processor memory bus, memory-semantic solid-state drives (SSDs) support both byte and block access granularity via PCIe or CXL interconnects. They provide scalable memory capacity using NAND flash at a much lower cost.
In addition, they have different performance characteristics for their dual byte/block interface respectively, while offering essential memory semantics for upper-level software. Such a byte-accessible storage device provides new implications on the software system design. In this paper, we develop a new file system, named ByteFS, by rethinking the design primitives of file systems and SSD firmware to exploit the advantages of both byte and block-granular data accesses. ByteFS supports byte-granular data persistence to retain the persistence nature of SSDs. It extends the core data structure of file systems by enabling dual byte/block-granular data accesses. To facilitate the support for byte-granular writes, ByteFS manages the internal DRAM of SSD firmware in a log-structured manner and enables data coalescing to reduce the unnecessary I/O traffic to flash chips. ByteFS also enables coordinated data caching between the host page cache and SSD cache for best utilizing the precious memory resource. We implement ByteFS on both a real programmable SSD and an emulated memory-semantic SSD for sensitivity study. Compared to state-of-the-art file systems for non-volatile memory and conventional SSDs, ByteFS outperforms them by up to 2.7$\times$, while preserving the essential properties of a file system. ByteFS also reduces the write traffic to SSDs by up to 5.1$\times$ by alleviating unnecessary writes caused by both metadata and data updates in file systems.
Submitted 9 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted at the 30th Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00909">arXiv:2501.00909</a> <span> [<a href="https://arxiv.org/pdf/2501.00909">pdf</a>, <a href="https://arxiv.org/format/2501.00909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> RIS-Aided Integrated Sensing and Communication Systems under Dual-polarized Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+D">Dongnan Xia</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+C">Cunhua Pan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hong Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiyuan Yu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yasheng Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00909v1-abstract-short" style="display: inline;"> This paper considers reconfigurable intelligent surface (RIS)-aided integrated sensing and communication (ISAC) systems under dual-polarized (DP) channels. Unlike the existing ISAC systems, which ignored polarization of electromagnetic waves, this study adopts DP base station (BS) and DP RIS to serve users with a pair of DP antennas. The achievable sum rate is maximized through jointly optimiz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00909v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00909v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00909v1-abstract-full" style="display: none;"> This paper considers reconfigurable intelligent surface (RIS)-aided integrated sensing and communication (ISAC) systems under dual-polarized (DP) channels. Unlike the existing ISAC systems, which ignored polarization of electromagnetic waves, this study adopts DP base station (BS) and DP RIS to serve users with a pair of DP antennas. The achievable sum rate is maximized through jointly optimizing the beamforming matrix at the DP BS, and the reflecting coefficients at the DP RIS. To address this problem, we first utilize the weighted minimum mean-square error (WMMSE) method to transform the objective function into a more tractable form, and then an alternating optimization (AO) method is employed to decouple the original problem into two subproblems. Due to the constant modulus constraint, the DP RIS reflection matrix optimization problem is addressed by the majorization-minimization (MM) method. For the DP beamforming matrix, we propose a penalty-based algorithm that can obtain a low-complexity closed-form solution. 
Simulation results validate the advantage of deploying DP transmit array and DP RIS in the considered ISAC systems.
Submitted 1 January, 2025; originally announced January 2025.

8. arXiv:2412.20803 [pdf, other] cs.CV
Frequency-aware Event Cloud Network
Authors: Hongwei Ren, Fei Ma, Xiaopeng Lin, Yuetong Fang, Hongxiang Huang, Yulong Huang, Yue Zhou, Haotian Fu, Ziyi Yang, Fei Richard Yu, Bojun Cheng
Abstract: Event cameras are biologically inspired sensors that emit events asynchronously with remarkable temporal resolution, garnering significant attention from both industry and academia. Mainstream methods favor frame and voxel representations, which reach a satisfactory performance while introducing time-consuming transformation, bulky models, and sacrificing fine-grained temporal information.
Alternatively, Point Cloud representation demonstrates promise in addressing the mentioned weaknesses, but it ignores the polarity information, and its models have limited proficiency in abstracting long-term events' features. In this paper, we propose a frequency-aware network named FECNet that leverages Event Cloud representations. FECNet fully utilizes 2S-1T-1P Event Cloud by innovating the event-based Group and Sampling module. To accommodate the long sequence events from Event Cloud, FECNet embraces feature extraction in the frequency domain via the Fourier transform. This approach substantially extinguishes the explosion of Multiply Accumulate Operations (MACs) while effectively abstracting spatial-temporal features. We conducted extensive experiments on event-based object classification, action recognition, and human pose estimation tasks, and the results substantiate the effectiveness and efficiency of FECNet.
Submitted 30 December, 2024; originally announced December 2024.
Comments: Under Review

9. arXiv:2412.19819 [pdf, other] cs.AR cs.AI
ChipAlign: Instruction Alignment in Large Language Models for Chip Design via Geodesic Interpolation
Authors: Chenhui Deng, Yunsheng Bai, Haoxing Ren
Abstract: Recent advancements in large language models (LLMs) have expanded their application across various domains, including chip design, where domain-adapted chip models like ChipNeMo have emerged. However, these models often struggle with instruction alignment, a crucial capability for LLMs that involves following explicit human directives.
This limitation impedes the practical application of chip LLMs, including serving as assistant chatbots for hardware design engineers. In this work, we introduce ChipAlign, a novel approach that utilizes a training-free model merging strategy, combining the strengths of a general instruction-aligned LLM with a chip-specific LLM. By considering the underlying manifold in the weight space, ChipAlign employs geodesic interpolation to effectively fuse the weights of input LLMs, producing a merged model that inherits strong instruction alignment and chip expertise from the respective instruction and chip LLMs. Our results demonstrate that ChipAlign significantly enhances instruction-following capabilities of existing chip LLMs, achieving up to a 26.6% improvement on the IFEval benchmark, while maintaining comparable expertise in the chip domain. This improvement in instruction alignment also translates to notable gains in instruction-involved QA tasks, delivering performance enhancements of 3.9% on the OpenROAD QA benchmark and 8.25% on production-level chip QA benchmarks, surpassing state-of-the-art baselines.
Submitted 14 December, 2024; originally announced December 2024.

</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19279">arXiv:2412.19279</a> <span> [<a href="https://arxiv.org/pdf/2412.19279">pdf</a>, <a href="https://arxiv.org/format/2412.19279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Generalization for AI-Synthesized Voice Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hainan Ren</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Li Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chun-Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19279v2-abstract-short" style="display: inline;"> AI-synthesized voice technology has the potential to create realistic human voices for beneficial applications, but it can also be misused for malicious purposes. While existing AI-synthesized voice detection models excel in intra-domain evaluation, they face challenges in generalizing across different domains, potentially becoming obsolete as new voice generators emerge. Current solutions use div… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19279v2-abstract-full').style.display = 'inline'; document.getElementById('2412.19279v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19279v2-abstract-full" style="display: none;"> AI-synthesized voice technology has the potential to create realistic human voices for beneficial applications, but it can also be misused for malicious purposes. While existing AI-synthesized voice detection models excel in intra-domain evaluation, they face challenges in generalizing across different domains, potentially becoming obsolete as new voice generators emerge. Current solutions use diverse data and advanced machine learning techniques (e.g., domain-invariant representation, self-supervised learning), but are limited by predefined vocoders and sensitivity to factors like background noise and speaker identity. In this work, we introduce an innovative disentanglement framework aimed at extracting domain-agnostic artifact features related to vocoders. Utilizing these features, we enhance model learning in a flat loss landscape, enabling escape from suboptimal solutions and improving generalization. Extensive experiments on benchmarks show our approach outperforms state-of-the-art methods, achieving up to 5.12% improvement in the equal error rate metric in intra-domain and 7.59% in cross-domain evaluations. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19279v2-abstract-full').style.display = 'none'; document.getElementById('2412.19279v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17595">arXiv:2412.17595</a> <span> [<a href="https://arxiv.org/pdf/2412.17595">pdf</a>, <a href="https://arxiv.org/format/2412.17595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> V$^2$-SfMLearner: Learning Monocular Depth and Ego-motion for Multimodal Wireless Capsule Endoscopy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Beilei Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liangyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanheng Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+S">Shilong Yao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Sishen Yuan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+M+Q+-">Max Q. -H. Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Weiping Ding</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17595v1-abstract-short" style="display: inline;"> Deep learning can predict depth maps and capsule ego-motion from capsule endoscopy videos, aiding in 3D scene reconstruction and lesion localization. However, the collisions of the capsule endoscopies within the gastrointestinal tract cause vibration perturbations in the training data. 
Existing solutions focus solely on vision-based processing, neglecting other auxiliary signals like vibrations that could reduce noise and improve performance. Therefore, we propose V$^2$-SfMLearner, a multimodal approach integrating vibration signals into vision-based depth and capsule motion estimation for monocular capsule endoscopy. We construct a multimodal capsule endoscopy dataset containing vibration and visual signals, and our artificial intelligence solution develops an unsupervised method using vision-vibration signals, effectively eliminating vibration perturbations through multimodal learning. Specifically, we carefully design a vibration network branch and a Fourier fusion module, to detect and mitigate vibration noises. The fusion framework is compatible with popular vision-only algorithms. Extensive validation on the multimodal dataset demonstrates superior performance and robustness against vision-only algorithms. Without the need for large external equipment, our V$^2$-SfMLearner has the potential for integration into clinical capsule robots, providing real-time and dependable digestive examination tools. The findings show promise for practical implementation in clinical settings, enhancing the diagnostic capabilities of doctors.
Submitted 23 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in IEEE Transactions on Automation Science and Engineering (IEEE TASE)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16720">arXiv:2412.16720</a> <span> [<a href="https://arxiv.org/pdf/2412.16720">pdf</a>, <a href="https://arxiv.org/format/2412.16720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenAI o1 System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Jaech%2C+A">Aaron Jaech</a>, <a href="/search/cs?searchtype=author&query=Kalai%2C+A">Adam Kalai</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Richardson%2C+A">Adam Richardson</a>, <a href="/search/cs?searchtype=author&query=El-Kishky%2C+A">Ahmed El-Kishky</a>, <a href="/search/cs?searchtype=author&query=Low%2C+A">Aiden Low</a>, <a href="/search/cs?searchtype=author&query=Helyar%2C+A">Alec Helyar</a>, <a href="/search/cs?searchtype=author&query=Madry%2C+A">Aleksander Madry</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Iftimie%2C+A">Alex Iftimie</a>, <a href="/search/cs?searchtype=author&query=Karpenko%2C+A">Alex Karpenko</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Neitz%2C+A">Alexander Neitz</a>, <a href="/search/cs?searchtype=author&query=Prokofiev%2C+A">Alexander Prokofiev</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+A">Alexander Wei</a>, <a href="/search/cs?searchtype=author&query=Tam%2C+A">Allison Tam</a>, <a href="/search/cs?searchtype=author&query=Bennett%2C+A">Ally Bennett</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Ananya Kumar</a>, <a href="/search/cs?searchtype=author&query=Saraiva%2C+A">Andre Saraiva</a>, <a href="/search/cs?searchtype=author&query=Vallone%2C+A">Andrea Vallone</a>, <a href="/search/cs?searchtype=author&query=Duberstein%2C+A">Andrew Duberstein</a>, <a href="/search/cs?searchtype=author&query=Kondrich%2C+A">Andrew Kondrich</a> , et al. (238 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16720v1-abstract-short" style="display: inline;"> The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. 
This leads to state-of-the-ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16720v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16720v1-abstract-full" style="display: none;"> The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. This leads to state-of-the-art performance on certain benchmarks for risks such as generating illicit advice, choosing stereotyped responses, and succumbing to known jailbreaks. Training models to incorporate a chain of thought before answering has the potential to unlock substantial benefits, while also increasing potential risks that stem from heightened intelligence. Our results underscore the need for building robust alignment methods, extensively stress-testing their efficacy, and maintaining meticulous risk management protocols. This report outlines the safety work carried out for the OpenAI o1 and OpenAI o1-mini models, including safety evaluations, external red teaming, and Preparedness Framework evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16720v1-abstract-full').style.display = 'none'; document.getElementById('2412.16720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16339">arXiv:2412.16339</a> <span> [<a href="https://arxiv.org/pdf/2412.16339">pdf</a>, <a href="https://arxiv.org/format/2412.16339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deliberative Alignment: Reasoning Enables Safer Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+M+Y">Melody Y. 
Guan</a>, <a href="/search/cs?searchtype=author&query=Joglekar%2C+M">Manas Joglekar</a>, <a href="/search/cs?searchtype=author&query=Wallace%2C+E">Eric Wallace</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+S">Saachi Jain</a>, <a href="/search/cs?searchtype=author&query=Barak%2C+B">Boaz Barak</a>, <a href="/search/cs?searchtype=author&query=Helyar%2C+A">Alec Helyar</a>, <a href="/search/cs?searchtype=author&query=Dias%2C+R">Rachel Dias</a>, <a href="/search/cs?searchtype=author&query=Vallone%2C+A">Andrea Vallone</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongyu Ren</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jason Wei</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&query=Toyer%2C+S">Sam Toyer</a>, <a href="/search/cs?searchtype=author&query=Heidecke%2C+J">Johannes Heidecke</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Glaese%2C+A">Amelia Glaese</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16339v2-abstract-short" style="display: inline;"> As large-scale language models increasingly impact safety-critical domains, ensuring their reliable adherence to well-defined principles remains a fundamental challenge. We introduce Deliberative Alignment, a new paradigm that directly teaches the model safety specifications and trains it to explicitly recall and accurately reason over the specifications before answering. We used this approach to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16339v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16339v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16339v2-abstract-full" style="display: none;"> As large-scale language models increasingly impact safety-critical domains, ensuring their reliable adherence to well-defined principles remains a fundamental challenge. We introduce Deliberative Alignment, a new paradigm that directly teaches the model safety specifications and trains it to explicitly recall and accurately reason over the specifications before answering. We used this approach to align OpenAI's o-series models, and achieved highly precise adherence to OpenAI's safety policies, without requiring human-written chain-of-thoughts or answers. Deliberative Alignment pushes the Pareto frontier by simultaneously increasing robustness to jailbreaks while decreasing overrefusal rates, and also improves out-of-distribution generalization. We demonstrate that reasoning over explicitly specified policies enables more scalable, trustworthy, and interpretable alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16339v2-abstract-full').style.display = 'none'; document.getElementById('2412.16339v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16219">arXiv:2412.16219</a> <span> [<a href="https://arxiv.org/pdf/2412.16219">pdf</a>, <a href="https://arxiv.org/format/2412.16219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Calibration: A Unified Conversion Framework of Spiking Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqing Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuetong Fang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiahang Cao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongwei Ren</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Renjing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16219v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) are seen as an energy-efficient alternative to traditional Artificial Neural Networks (ANNs), but the performance gap remains a challenge. While this gap is narrowing through ANN-to-SNN conversion, substantial computational resources are still needed, and the energy efficiency of converted SNNs cannot be ensured. To address this, we present a unified training-free co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16219v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16219v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) are seen as an energy-efficient alternative to traditional Artificial Neural Networks (ANNs), but the performance gap remains a challenge. While this gap is narrowing through ANN-to-SNN conversion, substantial computational resources are still needed, and the energy efficiency of converted SNNs cannot be ensured. To address this, we present a unified training-free conversion framework that significantly enhances both the performance and efficiency of converted SNNs. Inspired by the biological nervous system, we propose a novel Adaptive-Firing Neuron Model (AdaFire), which dynamically adjusts firing patterns across different layers to substantially reduce the Unevenness Error - the primary source of error of converted SNNs within limited inference timesteps. We further introduce two efficiency-enhancing techniques: the Sensitivity Spike Compression (SSC) technique for reducing spike operations, and the Input-aware Adaptive Timesteps (IAT) technique for decreasing latency. These methods collectively enable our approach to achieve state-of-the-art performance while delivering significant energy savings of up to 70.1%, 60.3%, and 43.1% on CIFAR-10, CIFAR-100, and ImageNet datasets, respectively. 
Extensive experiments across 2D, 3D, event-driven classification tasks, object detection, and segmentation tasks, demonstrate the effectiveness of our method in various domains. The code is available at: https://github.com/bic-L/burst-ann2snn. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16219v1-abstract-full').style.display = 'none'; document.getElementById('2412.16219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14018">arXiv:2412.14018</a> <span> [<a href="https://arxiv.org/pdf/2412.14018">pdf</a>, <a href="https://arxiv.org/format/2412.14018">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SurgSora: Decoupled RGBD-Flow Diffusion Model for Controllable Surgical Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuya Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Luping Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14018v1-abstract-short" style="display: inline;"> Medical video generation has transformative potential for enhancing surgical understanding and pathology insights through precise and controllable visual representations. However, current models face limitations in controllability and authenticity. To bridge this gap, we propose SurgSora, a motion-controllable surgical video generation framework that uses a single input frame and user-controllable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14018v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14018v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14018v1-abstract-full" style="display: none;"> Medical video generation has transformative potential for enhancing surgical understanding and pathology insights through precise and controllable visual representations. However, current models face limitations in controllability and authenticity. To bridge this gap, we propose SurgSora, a motion-controllable surgical video generation framework that uses a single input frame and user-controllable motion cues. 
SurgSora consists of three key modules: the Dual Semantic Injector (DSI), which extracts object-relevant RGB and depth features from the input frame and integrates them with segmentation cues to capture detailed spatial features of complex anatomical structures; the Decoupled Flow Mapper (DFM), which fuses optical flow with semantic-RGB-D features at multiple scales to enhance temporal understanding and object spatial dynamics; and the Trajectory Controller (TC), which allows users to specify motion directions and estimates sparse optical flow, guiding the video generation process. The fused features are used as conditions for a frozen Stable Diffusion model to produce realistic, temporally coherent surgical videos. Extensive evaluations demonstrate that SurgSora outperforms state-of-the-art methods in controllability and authenticity, showing its potential to advance surgical video generation for medical education, training, and research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14018v1-abstract-full').style.display = 'none'; document.getElementById('2412.14018v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11866">arXiv:2412.11866</a> <span> [<a href="https://arxiv.org/pdf/2412.11866">pdf</a>, <a href="https://arxiv.org/format/2412.11866">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Event-based Motion Deblurring via Multi-Temporal Granularity Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiaopeng Lin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongwei Ren</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yulong Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zunchang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yue Zhou</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Haotian Fu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+B">Biao Pan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bojun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11866v1-abstract-short" style="display: inline;"> Conventional frame-based cameras inevitably produce blurry effects due to motion occurring during the exposure time. Event camera, a bio-inspired sensor offering continuous visual information could enhance the deblurring performance. Effectively utilizing the high-temporal-resolution event data is crucial for extracting precise motion information and enhancing deblurring performance. 
However, exis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11866v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11866v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11866v1-abstract-full" style="display: none;"> Conventional frame-based cameras inevitably produce blurry effects due to motion occurring during the exposure time. Event camera, a bio-inspired sensor offering continuous visual information could enhance the deblurring performance. Effectively utilizing the high-temporal-resolution event data is crucial for extracting precise motion information and enhancing deblurring performance. However, existing event-based image deblurring methods usually utilize voxel-based event representations, losing the fine-grained temporal details that are mathematically essential for fast motion deblurring. In this paper, we first introduce point cloud-based event representation into the image deblurring task and propose a Multi-Temporal Granularity Network (MTGNet). It combines the spatially dense but temporally coarse-grained voxel-based event representation and the temporally fine-grained but spatially sparse point cloud-based event. To seamlessly integrate such complementary representations, we design a Fine-grained Point Branch. An Aggregation and Mapping Module (AMM) is proposed to align the low-level point-based features with frame-based features and an Adaptive Feature Diffusion Module (AFDM) is designed to manage the resolution discrepancies between event data and image data by enriching the sparse point feature. Extensive subjective and objective evaluations demonstrate that our method outperforms current state-of-the-art approaches on both synthetic and real-world datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11866v1-abstract-full').style.display = 'none'; document.getElementById('2412.11866v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10734">arXiv:2412.10734</a> <span> [<a href="https://arxiv.org/pdf/2412.10734">pdf</a>, <a href="https://arxiv.org/format/2412.10734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OmniHD-Scenes: A Next-Generation Multimodal Dataset for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Lianqing Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Long Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qunshu Lin</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+W">Wenjin Ai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghao Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shouyi Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianan Liu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongze Ren</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+J">Jingyue Mo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiaokai Bai</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jie Bai</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhixiong Ma</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xichan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10734v4-abstract-short" style="display: inline;"> The rapid advancement of deep learning has intensified the need for comprehensive data for use by autonomous driving algorithms. High-quality datasets are crucial for the development of effective data-driven autonomous driving solutions. Next-generation autonomous driving datasets must be multimodal, incorporating data from advanced sensors that feature extensive data coverage, detailed annotation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10734v4-abstract-full').style.display = 'inline'; document.getElementById('2412.10734v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10734v4-abstract-full" style="display: none;"> The rapid advancement of deep learning has intensified the need for comprehensive data for use by autonomous driving algorithms. High-quality datasets are crucial for the development of effective data-driven autonomous driving solutions. Next-generation autonomous driving datasets must be multimodal, incorporating data from advanced sensors that feature extensive data coverage, detailed annotations, and diverse scene representation. To address this need, we present OmniHD-Scenes, a large-scale multimodal dataset that provides comprehensive omnidirectional high-definition data. The OmniHD-Scenes dataset combines data from 128-beam LiDAR, six cameras, and six 4D imaging radar systems to achieve full environmental perception. 
The dataset comprises 1501 clips, each approximately 30-s long, totaling more than 450K synchronized frames and more than 5.85 million synchronized sensor data points. We also propose a novel 4D annotation pipeline. To date, we have annotated 200 clips with more than 514K precise 3D bounding boxes. These clips also include semantic segmentation annotations for static scene elements. Additionally, we introduce a novel automated pipeline for generation of the dense occupancy ground truth, which effectively leverages information from non-key frames. Alongside the proposed dataset, we establish comprehensive evaluation metrics, baseline models, and benchmarks for 3D detection and semantic occupancy prediction. These benchmarks utilize surround-view cameras and 4D imaging radar to explore cost-effective sensor solutions for autonomous driving applications. Extensive experiments demonstrate the effectiveness of our low-cost sensor configuration and its robustness under adverse conditions. Data will be released at https://www.2077ai.com/OmniHD-Scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10734v4-abstract-full').style.display = 'none'; document.getElementById('2412.10734v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07116">arXiv:2412.07116</a> <span> [<a href="https://arxiv.org/pdf/2412.07116">pdf</a>, <a href="https://arxiv.org/format/2412.07116">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Review of Human Emotion Synthesis Based on Generative Technology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yukan Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yifan Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongwei Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhou Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+F">Fuji Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F+R">Fei Richard Yu</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+S">Shiguang Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07116v1-abstract-short" style="display: inline;"> Human emotion synthesis is a crucial aspect of affective computing. 
It involves using computational methods to mimic and convey human emotions through various modalities, with the goal of enabling more natural and effective human-computer interactions. Recent advancements in generative models, such as Autoencoders, Generative Adversarial Networks, Diffusion Models, Large Language Models, and Seque… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07116v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07116v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07116v1-abstract-full" style="display: none;"> Human emotion synthesis is a crucial aspect of affective computing. It involves using computational methods to mimic and convey human emotions through various modalities, with the goal of enabling more natural and effective human-computer interactions. Recent advancements in generative models, such as Autoencoders, Generative Adversarial Networks, Diffusion Models, Large Language Models, and Sequence-to-Sequence Models, have significantly contributed to the development of this field. However, there is a notable lack of comprehensive reviews in this field. To address this gap, this paper provides a thorough and systematic overview of recent advancements in human emotion synthesis based on generative models. Specifically, this review will first present the review methodology, the emotion models involved, the mathematical principles of generative models, and the datasets used. Then, the review covers the application of different generative models to emotion synthesis based on a variety of modalities, including facial images, speech, and text. It also examines mainstream evaluation metrics. Additionally, the review presents some major findings and suggests future research directions, providing a comprehensive understanding of the role of generative technology in the nuanced domain of emotion synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07116v1-abstract-full').style.display = 'none'; document.getElementById('2412.07116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05840">arXiv:2412.05840</a> <span> [<a href="https://arxiv.org/pdf/2412.05840">pdf</a>, <a href="https://arxiv.org/format/2412.05840">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LVP-CLIP:Revisiting CLIP for Continual Learning with Label Vector Pool </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yue Ma</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Huantao Ren</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyu Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jingang Jin</a>, <a href="/search/cs?searchtype=author&query=Velipasalar%2C+S">Senem Velipasalar</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Q">Qinru Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05840v1-abstract-short" style="display: inline;"> Continual learning aims to update a model so that it can sequentially learn new tasks without forgetting previously acquired knowledge. Recent continual learning approaches often leverage the vision-language model CLIP for its high-dimensional feature space and cross-modality feature matching. Traditional CLIP-based classification methods identify the most similar text label for a test image by co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05840v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05840v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05840v1-abstract-full" style="display: none;"> Continual learning aims to update a model so that it can sequentially learn new tasks without forgetting previously acquired knowledge. Recent continual learning approaches often leverage the vision-language model CLIP for its high-dimensional feature space and cross-modality feature matching. Traditional CLIP-based classification methods identify the most similar text label for a test image by comparing their embeddings. However, these methods are sensitive to the quality of text phrases and less effective for classes lacking meaningful text labels. In this work, we rethink CLIP-based continual learning and introduce the concept of Label Vector Pool (LVP). LVP replaces text labels with training images as similarity references, eliminating the need for ideal text descriptions. We present three variations of LVP and evaluate their performance on class and domain incremental learning tasks. Leveraging CLIP's high dimensional feature space, LVP learning algorithms are task-order invariant. The new knowledge does not modify the old knowledge, hence, there is minimum forgetting. Different tasks can be learned independently and in parallel with low computational and memory demands. 
Experimental results show that proposed LVP-based methods outperform the current state-of-the-art baseline by a significant margin of 40.7%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05840v1-abstract-full').style.display = 'none'; document.getElementById('2412.05840v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to CVPR2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10; I.4; I.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05311">arXiv:2412.05311</a> <span> [<a href="https://arxiv.org/pdf/2412.05311">pdf</a>, <a href="https://arxiv.org/format/2412.05311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3698364.3705347">10.1145/3698364.3705347 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DRC-Coder: Automated DRC Checker Code Generation Using LLM Autonomous Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+C">Chen-Chia Chang</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+C">Chia-Tung Ho</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yaguang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05311v1-abstract-short" style="display: inline;"> In the advanced technology nodes, the integrated design rule checker (DRC) is often utilized in place and route tools for fast optimization loops for power-performance-area. Implementing integrated DRC checkers to meet the standard of commercial DRC tools demands extensive human expertise to interpret foundry specifications, analyze layouts, and debug code iteratively. 
However, this labor-intensiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05311v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05311v1-abstract-full" style="display: none;"> In the advanced technology nodes, the integrated design rule checker (DRC) is often utilized in place and route tools for fast optimization loops for power-performance-area. Implementing integrated DRC checkers to meet the standard of commercial DRC tools demands extensive human expertise to interpret foundry specifications, analyze layouts, and debug code iteratively. However, this labor-intensive process, requiring to be repeated by every update of technology nodes, prolongs the turnaround time of designing circuits. In this paper, we present DRC-Coder, a multi-agent framework with vision capabilities for automated DRC code generation. By incorporating vision language models and large language models (LLM), DRC-Coder can effectively process textual, visual, and layout information to perform rule interpretation and coding by two specialized LLMs. We also design an auto-evaluation function for LLMs to enable DRC code debugging. Experimental results show that targeting on a sub-3nm technology node for a state-of-the-art standard cell layout tool, DRC-Coder achieves perfect F1 score 1.000 in generating DRC codes for meeting the standard of a commercial DRC tool, highly outperforming standard prompting techniques (F1=0.631). DRC-Coder can generate code for each design rule within four minutes on average, which significantly accelerates technology advancement and reduces engineering costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05311v1-abstract-full').style.display = 'none'; document.getElementById('2412.05311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of the 2025 International Symposium on Physical Design (ISPD '25), March 16--19, 2025, Austin, TX, USA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00176">arXiv:2412.00176</a> <span> [<a href="https://arxiv.org/pdf/2412.00176">pdf</a>, <a href="https://arxiv.org/format/2412.00176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Art-Free Generative Models: Art Creation Without Graphic Art Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hui Ren</a>, <a href="/search/cs?searchtype=author&query=Materzynska%2C+J">Joanna Materzynska</a>, <a href="/search/cs?searchtype=author&query=Gandikota%2C+R">Rohit Gandikota</a>, <a href="/search/cs?searchtype=author&query=Bau%2C+D">David Bau</a>, <a href="/search/cs?searchtype=author&query=Torralba%2C+A">Antonio Torralba</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00176v1-abstract-short" style="display: inline;"> We explore the question: "How much prior art knowledge is needed to create art?" To investigate this, we propose a text-to-image generation model trained without access to art-related content. We then introduce a simple yet effective method to learn an art adapter using only a few examples of selected artistic styles. Our experiments show that art generated using our method is perceived by users a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00176v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00176v1-abstract-full" style="display: none;"> We explore the question: "How much prior art knowledge is needed to create art?" To investigate this, we propose a text-to-image generation model trained without access to art-related content. We then introduce a simple yet effective method to learn an art adapter using only a few examples of selected artistic styles. Our experiments show that art generated using our method is perceived by users as comparable to art produced by models trained on large, art-rich datasets. Finally, through data attribution techniques, we illustrate how examples from both artistic and non-artistic datasets contributed to the creation of new artistic styles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00176v1-abstract-full').style.display = 'none'; document.getElementById('2412.00176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18884">arXiv:2411.18884</a> <span> [<a href="https://arxiv.org/pdf/2411.18884">pdf</a>, <a href="https://arxiv.org/format/2411.18884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ETSM: Automating Dissection Trajectory Suggestion and Confidence Map-Based Safety Margin Prediction for Robot-assisted Endoscopic Submucosal Dissection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+W">Wenjin Mo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guankun Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Huxin Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+C">Chaoyang Lyu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18884v1-abstract-short" style="display: inline;"> Robot-assisted Endoscopic Submucosal Dissection (ESD) improves the surgical procedure by providing a more comprehensive view through advanced robotic instruments and bimanual operation, thereby enhancing dissection efficiency and accuracy. Accurate prediction of dissection trajectories is crucial for better decision-making, reducing intraoperative errors, and improving surgical training. Neverthel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18884v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18884v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18884v1-abstract-full" style="display: none;"> Robot-assisted Endoscopic Submucosal Dissection (ESD) improves the surgical procedure by providing a more comprehensive view through advanced robotic instruments and bimanual operation, thereby enhancing dissection efficiency and accuracy. Accurate prediction of dissection trajectories is crucial for better decision-making, reducing intraoperative errors, and improving surgical training. Nevertheless, predicting these trajectories is challenging due to variable tumor margins and dynamic visual conditions. To address this issue, we create the ESD Trajectory and Confidence Map-based Safety Margin (ETSM) dataset with $1849$ short clips, focusing on submucosal dissection with a dual-arm robotic system. We also introduce a framework that combines optimal dissection trajectory prediction with a confidence map-based safety margin, providing a more secure and intelligent decision-making tool to minimize surgical risks for ESD procedures. 
Additionally, we propose the Regression-based Confidence Map Prediction Network (RCMNet), which utilizes a regression approach to predict confidence maps for dissection areas, thereby delineating various levels of safety margins. We evaluate our RCMNet using three distinct experimental setups: in-domain evaluation, robustness assessment, and out-of-domain evaluation. Experimental results show that our approach excels in the confidence map-based safety margin prediction task, achieving a mean absolute error (MAE) of only $3.18$. To the best of our knowledge, this is the first study to apply a regression approach for visual guidance concerning delineating varying safety levels of dissection areas. Our approach bridges gaps in current research by improving prediction accuracy and enhancing the safety of the dissection process, showing great clinical significance in practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18884v1-abstract-full').style.display = 'none'; document.getElementById('2411.18884v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18169">arXiv:2411.18169</a> <span> [<a href="https://arxiv.org/pdf/2411.18169">pdf</a>, <a href="https://arxiv.org/format/2411.18169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PDZSeg: Adapting the Foundation Model for Dissection Zone Segmentation with Visual Prompts in Robot-assisted Endoscopic Submucosal Dissection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+W">Wenjin Mo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guankun Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Huxin Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18169v1-abstract-short" style="display: inline;"> Purpose: Endoscopic surgical environments present challenges for dissection zone segmentation due to unclear boundaries between tissue types, leading to segmentation errors where models misidentify or overlook edges. This study aims to provide precise dissection zone suggestions during endoscopic submucosal dissection (ESD) procedures, enhancing ESD safety. 
Methods: We propose the Prompted-based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18169v1-abstract-full" style="display: none;"> Purpose: Endoscopic surgical environments present challenges for dissection zone segmentation due to unclear boundaries between tissue types, leading to segmentation errors where models misidentify or overlook edges. This study aims to provide precise dissection zone suggestions during endoscopic submucosal dissection (ESD) procedures, enhancing ESD safety. Methods: We propose the Prompted-based Dissection Zone Segmentation (PDZSeg) model, designed to leverage diverse visual prompts such as scribbles and bounding boxes. By overlaying these prompts onto images and fine-tuning a foundational model on a specialized dataset, our approach improves segmentation performance and user experience through flexible input methods. Results: The PDZSeg model was validated using three experimental setups: in-domain evaluation, variability in visual prompt availability, and robustness assessment. Using the ESD-DZSeg dataset, results show that our method outperforms state-of-the-art segmentation approaches. This is the first study to integrate visual prompt design into dissection zone segmentation. Conclusion: The PDZSeg model effectively utilizes visual prompts to enhance segmentation performance and user experience, supported by the novel ESD-DZSeg dataset as a benchmark for dissection zone segmentation in ESD. Our work establishes a foundation for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18169v1-abstract-full').style.display = 'none'; document.getElementById('2411.18169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15515">arXiv:2411.15515</a> <span> [<a href="https://arxiv.org/pdf/2411.15515">pdf</a>, <a href="https://arxiv.org/format/2411.15515">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> Metamathematics of Resolution Lower Bounds: A TFNP Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiawei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuhao Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hanlin Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15515v1-abstract-short" style="display: inline;"> This paper studies the *refuter* problems, a family of decision-tree $\mathsf{TFNP}$ problems capturing the metamathematical difficulty of proving proof complexity lower bounds. 
Suppose $\varphi$ is a hard tautology that does not admit any length-$s$ proof in some proof system $P$. In the corresponding refuter problem, we are given (query access to) a purported length-$s$ proof $\pi$ in $P$ that cla… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15515v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15515v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15515v1-abstract-full" style="display: none;"> This paper studies the *refuter* problems, a family of decision-tree $\mathsf{TFNP}$ problems capturing the metamathematical difficulty of proving proof complexity lower bounds. Suppose $\varphi$ is a hard tautology that does not admit any length-$s$ proof in some proof system $P$. In the corresponding refuter problem, we are given (query access to) a purported length-$s$ proof $\pi$ in $P$ that claims to have proved $\varphi$, and our goal is to find an invalid derivation inside $\pi$. As suggested by witnessing theorems in bounded arithmetic, the *computational complexity* of these refuter problems is closely tied to the *metamathematics* of the underlying proof complexity lower bounds. We focus on refuter problems corresponding to lower bounds for *resolution*, which is arguably the single most studied system in proof complexity. We introduce a new class $\mathrm{rwPHP}(\mathsf{PLS})$ in decision-tree $\mathsf{TFNP}$, which can be seen as a randomized version of $\mathsf{PLS}$, and argue that this class effectively captures the metamathematics of proving resolution lower bounds. We view these results as a contribution to the *bounded reverse mathematics* of complexity lower bounds: when interpreted in relativized bounded arithmetic, our results show that the theory $\mathsf{T}^1_2(\alpha) + \mathrm{dwPHP}(\mathsf{PV}(\alpha))$ characterizes the "reasoning power" required to prove (the "easiest") resolution lower bounds. An intriguing corollary of our results is that the combinatorial principle, "the pigeonhole principle requires exponential-size resolution proofs", captures the class of $\mathsf{TFNP}$ problems whose totality is provable in $\mathsf{T}^1_2 + \mathrm{dwPHP}(\mathsf{PV})$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15515v1-abstract-full').style.display = 'none'; document.getElementById('2411.15515v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Abstract shortened due to constraints</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11934">arXiv:2411.11934</a> <span> [<a href="https://arxiv.org/pdf/2411.11934">pdf</a>, <a href="https://arxiv.org/format/2411.11934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SpatialDreamer: Self-supervised Stereo Video Synthesis from Monocular Input </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+Z">Zhen Lv</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yangqi Long</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Congzhentao Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cao Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+C">Chengfei Lv</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hao Ren</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+D">Dian Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11934v1-abstract-short" style="display: inline;"> Stereo video synthesis from a monocular input is a demanding task in the fields of spatial computing and virtual reality. The main challenges of this task lie on the insufficiency of high-quality paired stereo videos for training and the difficulty of maintaining the spatio-temporal consistency between frames. Existing methods primarily address these issues by directly applying novel view synthesi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11934v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11934v1-abstract-full" style="display: none;"> Stereo video synthesis from a monocular input is a demanding task in the fields of spatial computing and virtual reality. The main challenges of this task lie on the insufficiency of high-quality paired stereo videos for training and the difficulty of maintaining the spatio-temporal consistency between frames. Existing methods primarily address these issues by directly applying novel view synthesis (NVS) techniques to video, while facing limitations such as the inability to effectively represent dynamic scenes and the requirement for large amounts of training data. In this paper, we introduce a novel self-supervised stereo video synthesis paradigm via a video diffusion model, termed SpatialDreamer, which meets the challenges head-on. Firstly, to address the stereo video data insufficiency, we propose a Depth based Video Generation module DVG, which employs a forward-backward rendering mechanism to generate paired videos with geometric and temporal priors. Leveraging data generated by DVG, we propose RefinerNet along with a self-supervised synthetic framework designed to facilitate efficient and dedicated training. 
More importantly, we devise a consistency control module, which consists of a metric of stereo deviation strength and a Temporal Interaction Learning module TIL, to ensure geometric and temporal consistency, respectively. We evaluated the proposed method against various benchmark methods, with the results showcasing its superior performance.
Submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.11733 [pdf, other] cs.RO
Integrating Active Sensing and Rearrangement Planning for Efficient Object Retrieval from Unknown, Confined, Cluttered Environments
Authors: Junyong Kim, Hanwen Ren, Ahmed H. Qureshi
Abstract: Retrieving target objects from unknown, confined spaces remains a challenging task that requires integrated, task-driven active sensing and rearrangement planning. Previous approaches have independently addressed active sensing and rearrangement planning, limiting their practicality in real-world scenarios. This paper presents a new, integrated heuristic-based active sensing and Monte-Carlo Tree Search (MCTS)-based retrieval planning approach. These components provide feedback to one another to actively sense critical, unobserved areas suitable for the retrieval planner to plan a sequence for relocating path-blocking obstacles and a collision-free trajectory for retrieving the target object. We demonstrate the effectiveness of our approach using a robot arm equipped with an in-hand camera in both simulated and real-world confined, cluttered scenarios.
Our framework is compared against various state-of-the-art methods. The results indicate that our proposed approach outperforms baseline methods by a significant margin in terms of success rate, object rearrangement planning time, and the number of planning trials before successfully retrieving the target. Videos can be found at https://youtu.be/tea7I-3RtV0.
Submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.08210 [pdf, other] physics.optics cs.ET physics.comp-ph
BOSON$^{-1}$: Understanding and Enabling Physically-Robust Photonic Inverse Design with Adaptive Variation-Aware Subspace Optimization
Authors: Pingchuan Ma, Zhengqi Gao, Amir Begovic, Meng Zhang, Haoyu Yang, Haoxing Ren, Zhaoran Rena Huang, Duane Boning, Jiaqi Gu
Abstract: Nanophotonic device design aims to optimize photonic structures to meet specific requirements across various applications.
Inverse design has unlocked non-intuitive, high-dimensional design spaces, enabling the discovery of high-performance devices beyond heuristic or analytic methods. The adjoint method, which calculates gradients for all variables using just two simulations, enables efficient navigation of this complex space. However, many inverse-designed structures, while numerically plausible, are difficult to fabricate and sensitive to variations, limiting their practical use. The discrete nature of the design space, with numerous locally optimal structures, also poses significant optimization challenges, often causing gradient-based methods to converge on suboptimal designs. In this work, we formulate inverse design as a fabrication-restricted, discrete, probabilistic optimization problem and introduce BOSON$^{-1}$, an end-to-end, variation-aware subspace optimization framework to address the challenges of manufacturability, robustness, and optimizability. To overcome optimization difficulty, we propose dense target-enhanced gradient flows to mitigate misleading local optima and introduce a conditional subspace optimization strategy to create high-dimensional tunnels to escape local optima. Furthermore, we significantly reduce the runtime associated with optimizing across exponential variation samples through an adaptive sampling-based robust optimization, ensuring both efficiency and variation robustness. On three representative photonic device benchmarks, our proposed inverse design methodology BOSON$^{-1}$ delivers fabricable structures and achieves the best convergence and performance under realistic variations, outperforming prior arts with 74.3% post-fabrication performance. We open-source our codes at https://github.com/ScopeX-ASU/BOSON.
Submitted 12 November, 2024; originally announced November 2024.
Comments: 7 pages. Accepted to IEEE DATE 2025

arXiv:2411.07311 [pdf, other] cs.CV physics.optics
GPU-Accelerated Inverse Lithography Towards High Quality Curvy Mask Generation
Authors: Haoyu Yang, Haoxing Ren
Abstract: Inverse Lithography Technology (ILT) has emerged as a promising solution for photo mask design and optimization. Relying on multi-beam mask writers, ILT enables the creation of free-form curvilinear mask shapes that enhance printed wafer image quality and process window. However, a major challenge in implementing curvilinear ILT for large-scale production is mask rule checking, an area currently under development by foundries and EDA vendors. Although recent research has incorporated mask complexity into the optimization process, much of it focuses on reducing e-beam shots, which does not align with the goals of curvilinear ILT. In this paper, we introduce a GPU-accelerated ILT algorithm that improves not only contour quality and process window but also the precision of curvilinear mask shapes. Our experiments on open benchmarks demonstrate a significant advantage of our algorithm over leading academic ILT engines.
Submitted 11 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures, Accepted by International Symposium on Physical Design (ISPD), 2025, Austin TX</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.7.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02410">arXiv:2411.02410</a> <span> [<a href="https://arxiv.org/pdf/2411.02410">pdf</a>, <a href="https://arxiv.org/format/2411.02410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Web-based Augmented Reality with Auto-Scaling and Real-Time Head Tracking towards Markerless Neurointerventional Preoperative Planning and Training of Head-mounted Robotic Needle Insertion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ho%2C+H+L">Hon Lung Ho</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yupeng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02410v1-abstract-short" style="display: inline;"> Neurosurgery requires exceptional precision and comprehensive preoperative planning to ensure optimal patient outcomes. Despite technological advancements, there remains a need for intuitive, accessible tools to enhance surgical preparation and medical education in this field. Traditional methods often lack the immersive experience necessary for surgeons to visualize complex procedures and critica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02410v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02410v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02410v1-abstract-full" style="display: none;"> Neurosurgery requires exceptional precision and comprehensive preoperative planning to ensure optimal patient outcomes. Despite technological advancements, there remains a need for intuitive, accessible tools to enhance surgical preparation and medical education in this field. Traditional methods often lack the immersive experience necessary for surgeons to visualize complex procedures and critical neurovascular structures, while existing advanced solutions may be cost-prohibitive or require specialized hardware. This research presents a novel markerless web-based augmented reality (AR) application designed to address these challenges in neurointerventional preoperative planning and education. Utilizing MediaPipe for precise facial localization and segmentation, and React Three Fiber for immersive 3D visualization, the application offers an intuitive platform for complex preoperative procedures. 
A virtual 2-RPS parallel positioner or Skull-Bot model is projected onto the user's face in real-time, simulating surgical tool control with high precision. Key features include the ability to import and auto-scale head anatomy to the user's dimensions and real-time auto-tracking of head movements once aligned. The web-based nature enables simultaneous access by multiple users, facilitating collaboration during surgeries and allowing medical students to observe live procedures. A pilot study involving three participants evaluated the application's auto-scaling and auto-tracking capabilities through various head rotation exercises. This research contributes to the field by offering a cost-effective, accessible, and collaborative tool for improving neurosurgical planning and education, potentially leading to better surgical outcomes and more comprehensive training for medical professionals. The source code of our application is publicly available at https://github.com/Hillllllllton/skullbot_web_ar.
Submitted 19 October, 2024; originally announced November 2024.
Comments: Accepted to IEEE ROBIO 2024

arXiv:2410.23413 [pdf, other] cs.CV
EchoFM: Foundation Model for Generalizable Echocardiogram Analysis
Authors: Sekeun Kim, Pengfei Jin, Sifan Song, Cheng Chen, Yiwei Li, Hui Ren, Xiang Li, Tianming Liu, Quanzheng Li
Abstract: Foundation models have recently gained significant attention because of their generalizability and adaptability across multiple tasks and data distributions. Although medical foundation models have emerged, solutions for cardiac imaging, especially echocardiography videos, are still unexplored. In this paper, we introduce EchoFM, a foundation model specifically designed to represent and analyze echocardiography videos. In EchoFM, we propose a self-supervised learning framework that captures both spatial and temporal variability patterns through a spatio-temporal consistent masking strategy and periodic-driven contrastive learning. This framework can effectively capture the spatio-temporal dynamics of echocardiography and learn representative video features without any labels. We pre-train our model on an extensive dataset comprising over 290,000 echocardiography videos covering 26 scan views across different imaging modes, with up to 20 million frames of images. The pre-trained EchoFM can then be easily adapted and fine-tuned for a variety of downstream tasks, serving as a robust backbone model. Our evaluation was systematically designed for four downstream tasks after the echocardiography examination routine. Experiment results show that EchoFM surpasses state-of-the-art methods, including specialized echocardiography methods, self-supervised pre-training models, and general-purpose pre-trained foundation models, across all downstream tasks.
Submitted 28 January, 2025; v1 submitted 30 October, 2024; originally announced October 2024.
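For context on the masking strategy mentioned in this abstract, the following is a minimal sketch of a spatio-temporally consistent (tube-style) mask for patch-based video pretraining. The grid size, mask ratio, and function name are illustrative assumptions, not EchoFM's implementation.

```python
import numpy as np

def temporally_consistent_mask(num_frames, grid_h, grid_w, mask_ratio=0.75, seed=0):
    """Sample one spatial patch mask and repeat it across all frames.

    Generic illustration of 'spatio-temporally consistent' masking: the same
    spatial patches are hidden in every frame, so a model cannot recover them
    by simply copying a neighboring frame.
    """
    rng = np.random.default_rng(seed)
    num_patches = grid_h * grid_w
    num_masked = int(round(mask_ratio * num_patches))
    masked_idx = rng.choice(num_patches, size=num_masked, replace=False)
    spatial_mask = np.zeros(num_patches, dtype=bool)
    spatial_mask[masked_idx] = True                       # True = hidden patch
    return np.broadcast_to(spatial_mask, (num_frames, num_patches))

mask = temporally_consistent_mask(num_frames=16, grid_h=14, grid_w=14)
print(mask.shape, mask[0].sum())   # (16, 196), ~75% of patches masked per frame
```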
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23299">arXiv:2410.23299</a> <span> [<a href="https://arxiv.org/pdf/2410.23299">pdf</a>, <a href="https://arxiv.org/format/2410.23299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FVEval: Understanding Language Model Capabilities in Formal Verification of Digital Hardware </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minwoo Kang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingjie Liu</a>, <a href="/search/cs?searchtype=author&query=Hamad%2C+G+B">Ghaith Bany Hamad</a>, <a href="/search/cs?searchtype=author&query=Suhaib%2C+S">Syed Suhaib</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23299v1-abstract-short" style="display: inline;"> The remarkable reasoning and code generation capabilities of large language models (LLMs) have spurred significant interest in applying LLMs to enable task automation in digital chip design. In particular, recent work has investigated early ideas of applying these models to formal verification (FV), an approach to verifying hardware implementations that can provide strong guarantees of confidence… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23299v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23299v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23299v1-abstract-full" style="display: none;"> The remarkable reasoning and code generation capabilities of large language models (LLMs) have spurred significant interest in applying LLMs to enable task automation in digital chip design. In particular, recent work has investigated early ideas of applying these models to formal verification (FV), an approach to verifying hardware implementations that can provide strong guarantees of confidence but demands significant amounts of human effort. While the value of LLM-driven automation is evident, our understanding of model performance, however, has been hindered by the lack of holistic evaluation. In response, we present FVEval, the first comprehensive benchmark and evaluation framework for characterizing LLM performance in tasks pertaining to FV. The benchmark consists of three sub-tasks that measure LLM capabilities at different levels: from the generation of SystemVerilog assertions (SVAs) given natural language descriptions to reasoning about the design RTL and suggesting assertions directly without additional human input. As test instances, we present both collections of expert-written verification collateral and methodologies to scalably generate synthetic examples aligned with industrial FV workflows. 
A wide range of existing LLMs, both proprietary and open-source, are evaluated against FVEval, based on which we investigate where today's LLMs stand and how we might further enable their application toward improving productivity in digital FV. Our benchmark and evaluation code is available at https://github.com/NVlabs/FVEval.
Submitted 15 October, 2024; originally announced October 2024.

arXiv:2410.21276 [pdf, other] cs.CL cs.AI cs.CV cs.CY cs.LG cs.SD eess.AS
GPT-4o System Card
Authors: OpenAI: Aaron Hurst, Adam Lerer, Adam P. Goucher, Adam Perelman, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Welihinda, Alan Hayes, Alec Radford, Aleksander Mądry, Alex Baker-Whitcomb, Alex Beutel, Alex Borzunov, Alex Carney, Alex Chow, Alex Kirillov, Alex Nichol, Alex Paino, Alex Renzin, Alex Tachard Passos, Alexander Kirillov, Alexi Christakis, et al. (395 additional authors not shown)
Abstract: GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations.
In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.
Submitted 25 October, 2024; originally announced October 2024.

arXiv:2410.18698 [pdf, other] eess.IV cs.CV
Transferring Knowledge from High-Quality to Low-Quality MRI for Adult Glioma Diagnosis
Authors: Yanguang Zhao, Long Bai, Zhaoxi Zhang, Yanan Wu, Mobarakol Islam, Hongliang Ren
Abstract: Glioma, a common and deadly brain tumor, requires early diagnosis for improved prognosis. However, low-quality Magnetic Resonance Imaging (MRI) technology in Sub-Saharan Africa (SSA) hinders accurate diagnosis. This paper presents our work in the BraTS Challenge on SSA Adult Glioma.
We adopt the model from the BraTS-GLI 2021 winning solution and utilize it with three training strategies: (1) initially training on the BraTS-GLI 2021 dataset with fine-tuning on the BraTS-Africa dataset, (2) training solely on the BraTS-Africa dataset, and (3) training solely on the BraTS-Africa dataset with 2x super-resolution enhancement. Results show that initial training on the BraTS-GLI 2021 dataset followed by fine-tuning on the BraTS-Africa dataset has yielded the best results. This suggests the importance of high-quality datasets in providing prior knowledge during training. Our top-performing model achieves Dice scores of 0.882, 0.840, and 0.926, and Hausdorff Distance (95%) scores of 15.324, 37.518, and 13.971 for enhancing tumor, tumor core, and whole tumor, respectively, in the validation phase. In the final phase of the competition, our approach successfully secured second place overall, reflecting the strength and effectiveness of our model and training strategies. Our approach provides insights into improving glioma diagnosis in SSA, showing the potential of deep learning in resource-limited settings and the importance of transfer learning from high-quality datasets.
Submitted 25 October, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: Technical Report, MICCAI 2024 BraTS-SSA Challenge Runner Up

arXiv:2410.15882 [pdf, other] cs.LG cs.CV cs.RO
Distributed Learning for UAV Swarms
Authors: Chen Hu, Hanchi Ren, Jingjing Deng, Xianghua Xie
Abstract: Unmanned Aerial Vehicle (UAV) swarms are increasingly deployed in dynamic, data-rich environments for applications such as environmental monitoring and surveillance. These scenarios demand efficient data processing while maintaining privacy and security, making Federated Learning (FL) a promising solution. FL allows UAVs to collaboratively train global models without sharing raw data, but challenges arise due to the non-Independent and Identically Distributed (non-IID) nature of the data collected by UAVs. In this study, we integrate state-of-the-art FL methods into a UAV swarm application and investigate the performance of multiple aggregation methods (namely FedAvg, FedProx, FedOpt, and MOON) with a particular focus on tackling non-IID data on a variety of datasets, specifically MNIST for baseline performance, CIFAR10 for natural object classification, EuroSAT for environment monitoring, and CelebA for surveillance. These algorithms were selected to cover improved techniques on both client-side updates and global aggregation. Results show that while all algorithms perform comparably on IID data, their performance deteriorates significantly under non-IID conditions. FedProx demonstrated the most stable overall performance, emphasising the importance of regularising local updates in non-IID environments to mitigate drastic deviations in local models.
Submitted 21 October, 2024; originally announced October 2024.
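To make the aggregation comparison in this abstract concrete, here is a minimal sketch of the two ingredients it highlights: size-weighted server averaging (as in FedAvg) and a FedProx-style local step whose proximal term pulls each client back toward the global model. The learning rate, the coefficient mu, and the toy clients are assumptions for illustration.

```python
import numpy as np

def fedavg_aggregate(client_weights, client_sizes):
    """Server step shared by FedAvg-style methods: size-weighted average of client models."""
    total = sum(client_sizes)
    return sum(w * (n / total) for w, n in zip(client_weights, client_sizes))

def fedprox_local_step(w, w_global, grad_fn, lr=0.01, mu=0.1):
    """One local SGD step with the FedProx proximal term mu/2 * ||w - w_global||^2,
    which regularises non-IID clients toward the current global model."""
    return w - lr * (grad_fn(w) + mu * (w - w_global))

# Toy round: three clients with different data sizes and (fixed) local gradients.
w_global = np.zeros(4)
grads = [lambda w, g=g: g for g in (np.ones(4), -np.ones(4), 0.5 * np.ones(4))]
local_models = [fedprox_local_step(w_global.copy(), w_global, g) for g in grads]
w_global = fedavg_aggregate(local_models, client_sizes=[100, 50, 25])
print(w_global)
```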
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15879">arXiv:2410.15879</a> <span> [<a href="https://arxiv.org/pdf/2410.15879">pdf</a>, <a href="https://arxiv.org/format/2410.15879">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Triplane Grasping: Efficient 6-DoF Grasping with Single RGB Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hanchi Ren</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jingjing Deng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xianghua Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15879v1-abstract-short" style="display: inline;"> Reliable object grasping is one of the fundamental tasks in robotics. However, determining grasping pose based on single-image input has long been a challenge due to limited visual information and the complexity of real-world objects. In this paper, we propose Triplane Grasping, a fast grasping decision-making method that relies solely on a single RGB-only image as input. Triplane Grasping creates… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15879v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15879v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15879v1-abstract-full" style="display: none;"> Reliable object grasping is one of the fundamental tasks in robotics. However, determining grasping pose based on single-image input has long been a challenge due to limited visual information and the complexity of real-world objects. In this paper, we propose Triplane Grasping, a fast grasping decision-making method that relies solely on a single RGB-only image as input. Triplane Grasping creates a hybrid Triplane-Gaussian 3D representation through a point decoder and a triplane decoder, which produce an efficient and high-quality reconstruction of the object to be grasped to meet real-time grasping requirements. We propose to use an end-to-end network to generate 6-DoF parallel-jaw grasp distributions directly from 3D points in the point cloud as potential grasp contacts and anchor the grasp pose in the observed data. Experiments demonstrate that our method achieves rapid modeling and grasping pose decision-making for daily objects, and exhibits a high grasping success rate in zero-shot scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15879v1-abstract-full').style.display = 'none'; document.getElementById('2410.15879v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15863">arXiv:2410.15863</a> <span> [<a href="https://arxiv.org/pdf/2410.15863">pdf</a>, <a href="https://arxiv.org/format/2410.15863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Task-oriented Robotic Manipulation with Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guran%2C+N+B">Nurhan Bulus Guran</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hanchi Ren</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jingjing Deng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xianghua Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15863v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) play a crucial role in robotic manipulation by enabling robots to understand and interpret the visual properties of objects and their surroundings, allowing them to perform manipulation based on this multimodal understanding. However, understanding object attributes and spatial relationships is a non-trivial task but is critical in robotic manipulation tasks. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15863v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15863v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) play a crucial role in robotic manipulation by enabling robots to understand and interpret the visual properties of objects and their surroundings, allowing them to perform manipulation based on this multimodal understanding. However, understanding object attributes and spatial relationships is a non-trivial task but is critical in robotic manipulation tasks. In this work, we present a new dataset focused on spatial relationships and attribute assignment and a novel method to utilize VLMs to perform object manipulation with task-oriented, high-level input. In this dataset, the spatial relationships between objects are manually described as captions. Additionally, each object is labeled with multiple attributes, such as fragility, mass, material, and transparency, derived from a fine-tuned vision language model. The embedded object information from captions are automatically extracted and transformed into a data structure (in this case, tree, for demonstration purposes) that captures the spatial relationships among the objects within each image. The tree structures, along with the object attributes, are then fed into a language model to transform into a new tree structure that determines how these objects should be organized in order to accomplish a specific (high-level) task. We demonstrate that our method not only improves the comprehension of spatial relationships among objects in the visual environment but also enables robots to interact with these objects more effectively. As a result, this approach significantly enhances spatial reasoning in robotic manipulation tasks. 
To our knowledge, this is the first method of its kind in the literature, offering a novel solution that allows robots to more efficiently organize and utilize objects in their surroundings.
Submitted 21 October, 2024; originally announced October 2024.

arXiv:2410.12168 [pdf, other] cs.AR cs.LG
COMET: Towards Partical W4A4KV4 LLMs Serving
Authors: Lian Liu, Haimeng Ren, Long Cheng, Zhaohui Xu, Yudong Pan, Mengdi Wang, Xiaowei Li, Yinhe Han, Ying Wang
Abstract: Quantization is a widely-used compression technology to reduce the overhead of serving large language models (LLMs) on terminal devices and in cloud data centers. However, prevalent quantization methods, such as 8-bit weight-activation or 4-bit weight-only quantization, achieve limited performance improvements due to poor support for low-precision (e.g., 4-bit) activation. This work, for the first time, realizes practical W4A4KV4 serving for LLMs, fully utilizing the INT4 tensor cores on modern GPUs and reducing the memory bottleneck caused by the KV cache.
Specifically, we propose a novel fine-grained mixed-precision quantization algorithm (FMPQ) that compresses most activations into 4-bit with negligible accuracy loss. To support mixed-precision matrix multiplication for W4A4 and W4A8, we develop a highly optimized W4Ax kernel. Our approach introduces a novel mixed-precision data layout to facilitate access and fast dequantization for activation and weight tensors, utilizing the GPU's software pipeline to hide the overhead of data loading and conversion. Additionally, we propose fine-grained streaming multiprocessor (SM) scheduling to achieve load balance across different SMs. We integrate the optimized W4Ax kernel into our inference framework, COMET, and provide efficient management to support popular LLMs such as LLaMA-3-70B. Extensive evaluations demonstrate that, when running LLaMA family models on a single A100-80G-SMX4, COMET achieves a kernel-level speedup of 2.88x over cuBLAS and a 2.02x throughput improvement compared to TensorRT-LLM from an end-to-end framework perspective.
Submitted 15 October, 2024; originally announced October 2024.
Comments: 14 pages, 12 figures
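As background for the W4A4 terminology above, here is a generic per-group symmetric INT4 quantize/dequantize round trip in NumPy. It illustrates the arithmetic only; it is not the paper's FMPQ algorithm, data layout, or GPU kernel, and the group size is an assumed value.

```python
import numpy as np

def quantize_int4_symmetric(x, group_size=128):
    """Generic per-group symmetric INT4 quantization (integer values in [-8, 7])."""
    x = x.reshape(-1, group_size)
    scale = np.abs(x).max(axis=1, keepdims=True) / 7.0 + 1e-12   # one scale per group
    q = np.clip(np.round(x / scale), -8, 7).astype(np.int8)
    return q, scale

def dequantize_int4(q, scale):
    """Recover an approximate float tensor from INT4 codes and per-group scales."""
    return q.astype(np.float32) * scale

x = np.random.randn(4, 512).astype(np.float32)
q, s = quantize_int4_symmetric(x)
x_hat = dequantize_int4(q, s).reshape(x.shape)
print("mean abs quantization error:", np.abs(x - x_hat).mean())
```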

arXiv:2410.08734 [pdf, other] cs.LG cs.CV
Gradients Stand-in for Defending Deep Leakage in Federated Learning
Authors: H. Yi, H. Ren, C. Hu, Y. Li, J. Deng, X. Xie
Abstract: Federated Learning (FL) has become a cornerstone of privacy protection, shifting the paradigm towards localizing sensitive data while only sending model gradients to a central server. This strategy is designed to reinforce privacy protections and minimize the vulnerabilities inherent in centralized data storage systems. Despite its innovative approach, recent empirical studies have highlighted potential weaknesses in FL, notably regarding the exchange of gradients. In response, this study introduces a novel, efficacious method aimed at safeguarding against gradient leakage, namely "AdaDefense". Following the idea that model convergence can be achieved by using different types of optimization methods, we suggest using a local stand-in rather than the actual local gradient for global gradient aggregation on the central server. This proposed approach not only effectively prevents gradient leakage, but also ensures that the overall performance of the model remains largely unaffected. Delving into the theoretical dimensions, we explore how gradients may inadvertently leak private information and present a theoretical framework supporting the efficacy of our proposed method. Extensive empirical tests, supported by popular benchmark experiments, validate that our approach maintains model integrity and is robust against gradient leakage, marking an important step in our pursuit of safe and efficient FL.
Submitted 11 October, 2024; originally announced October 2024.
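The central idea in this abstract is that the server aggregates a stand-in derived from each local gradient instead of the gradient itself. The sketch below only illustrates that substitution; the normalized, sign-like stand-in used here is a placeholder of our own and not the AdaDefense construction.

```python
import numpy as np

def gradient_stand_in(local_grad, eps=1e-8):
    """Illustrative stand-in for a raw local gradient.

    The abstract only states that a 'local stand-in' replaces the true gradient
    during aggregation; this normalized (sign-like) update is an assumed
    placeholder, chosen because it discards per-coordinate magnitude information.
    """
    return local_grad / (np.abs(local_grad) + eps)

def server_aggregate(stand_ins):
    """Server averages the stand-ins instead of the raw client gradients."""
    return np.mean(stand_ins, axis=0)

client_grads = [np.random.randn(8) for _ in range(4)]
update = server_aggregate([gradient_stand_in(g) for g in client_grads])
print(update.round(3))
```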
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08196">arXiv:2410.08196</a> <span> [<a href="https://arxiv.org/pdf/2410.08196">pdf</a>, <a href="https://arxiv.org/format/2410.08196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MathCoder2: Better Math Reasoning from Continued Pretraining on Model-translated Mathematical Code </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zimu Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aojun Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Houxing Ren</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weikang Shi</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Junting Pan</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+M">Mingjie Zhan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongsheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08196v1-abstract-short" style="display: inline;"> Code has been shown to be effective in enhancing the mathematical reasoning abilities of large language models due to its precision and accuracy. Previous works involving continued mathematical pretraining often include code that utilizes math-related packages, which are primarily designed for fields such as engineering, machine learning, signal processing, or module testing, rather than being dir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08196v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08196v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08196v1-abstract-full" style="display: none;"> Code has been shown to be effective in enhancing the mathematical reasoning abilities of large language models due to its precision and accuracy. Previous works involving continued mathematical pretraining often include code that utilizes math-related packages, which are primarily designed for fields such as engineering, machine learning, signal processing, or module testing, rather than being directly focused on mathematical reasoning. In this paper, we introduce a novel method for generating mathematical code accompanied with corresponding reasoning steps for continued pretraining. Our approach begins with the construction of a high-quality mathematical continued pretraining dataset by incorporating math-related web data, code using mathematical packages, math textbooks, and synthetic data. Next, we construct reasoning steps by extracting LaTeX expressions, the conditions needed for the expressions, and the results of the expressions from the previously collected dataset. Based on this extracted information, we generate corresponding code to accurately capture the mathematical reasoning process. 
Appending the generated code to each reasoning step results in data consisting of paired natural language reasoning steps and their corresponding code. Combining this data with the original dataset results in a 19.2B-token high-performing mathematical pretraining corpus, which we name MathCode-Pile. Training several popular base models with this corpus significantly improves their mathematical abilities, leading to the creation of the MathCoder2 family of models. All of our data processing and training code is open-sourced, ensuring full transparency and easy reproducibility of the entire data collection and training pipeline. The code is released at https://github.com/mathllm/MathCoder2 . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08196v1-abstract-full').style.display = 'none'; document.getElementById('2410.08196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/mathllm/MathCoder2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07540">arXiv:2410.07540</a> <span> [<a href="https://arxiv.org/pdf/2410.07540">pdf</a>, <a href="https://arxiv.org/format/2410.07540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoPESD: A Multi-Level Surgical Motion Dataset for Training Large Vision-Language Models to Co-Pilot Endoscopic Submucosal Dissection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guankun Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+H">Han Xiao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Huxin Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongsheng Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07540v1-abstract-short" style="display: inline;"> submucosal dissection (ESD) enables rapid resection of large lesions, minimizing recurrence rates and improving long-term overall survival. Despite these advantages, ESD is technically challenging and carries high risks of complications, necessitating skilled surgeons and precise instruments. 
Recent advancements in Large Visual-Language Models (LVLMs) offer promising decision support and predictiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07540v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07540v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07540v1-abstract-full" style="display: none;"> Endoscopic submucosal dissection (ESD) enables rapid resection of large lesions, minimizing recurrence rates and improving long-term overall survival. Despite these advantages, ESD is technically challenging and carries high risks of complications, necessitating skilled surgeons and precise instruments. Recent advancements in Large Visual-Language Models (LVLMs) offer promising decision support and predictive planning capabilities for robotic systems, which can augment the accuracy of ESD and reduce procedural risks. However, existing datasets for multi-level fine-grained ESD surgical motion understanding are scarce and lack detailed annotations. In this paper, we design a hierarchical decomposition of ESD motion granularity and introduce a multi-level surgical motion dataset (CoPESD) for training LVLMs as the robotic Co-Pilot of Endoscopic Submucosal Dissection. CoPESD includes 17,679 images with 32,699 bounding boxes and 88,395 multi-level motions, from over 35 hours of ESD videos for both robot-assisted and conventional surgeries. CoPESD enables granular analysis of ESD motions, focusing on the complex task of submucosal dissection. Extensive experiments on the LVLMs demonstrate the effectiveness of CoPESD in training LVLMs to predict the following surgical robotic motions. As the first multimodal ESD motion dataset, CoPESD supports advanced research in ESD instruction-following and surgical automation. The dataset is available at https://github.com/gkw0010/CoPESD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07540v1-abstract-full').style.display = 'none'; document.getElementById('2410.07540v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03530">arXiv:2410.03530</a> <span> [<a href="https://arxiv.org/pdf/2410.03530">pdf</a>, <a href="https://arxiv.org/format/2410.03530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> PRF: Parallel Resonate and Fire Neuron for Long Sequence Learning in Spiking Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yulong Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zunchang Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Changchun Feng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiaopeng Lin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongwei Ren</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Haotian Fu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yue Zhou</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+H">Hong Xing</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bojun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03530v2-abstract-short" style="display: inline;"> Recently, there has been growing demand for effective and efficient long sequence modeling, with State Space Models (SSMs) proving to be effective for long sequence tasks. To further reduce energy consumption, SSMs can be adapted to Spiking Neural Networks (SNNs) using spiking functions. However, current spiking-formalized SSM approaches still rely on floating-point matrix-vector multiplication during inf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03530v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03530v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03530v2-abstract-full" style="display: none;"> Recently, there has been growing demand for effective and efficient long sequence modeling, with State Space Models (SSMs) proving to be effective for long sequence tasks. To further reduce energy consumption, SSMs can be adapted to Spiking Neural Networks (SNNs) using spiking functions. However, current spiking-formalized SSM approaches still rely on floating-point matrix-vector multiplication during inference, undermining SNNs' energy advantage. In this work, we address the efficiency and performance challenges of long sequence learning in SNNs simultaneously. First, we propose a decoupled reset method for parallel spiking neuron training, reducing the typical Leaky Integrate-and-Fire (LIF) model's training time from $O(L^2)$ to $O(L\log L)$, effectively speeding up the training by $6.57 \times$ to $16.50 \times$ on sequence lengths $1,024$ to $32,768$. To the best of our knowledge, this is the first time that parallel computation with a reset mechanism has been implemented while achieving equivalence to its sequential counterpart. Second, to capture long-range dependencies, we propose a Parallel Resonate and Fire (PRF) neuron, which leverages an oscillating membrane potential driven by a resonate mechanism from a differentiable reset function in the complex domain.
The PRF enables efficient long sequence learning while maintaining parallel training. Finally, we demonstrate that the proposed spike-driven architecture using PRF achieves performance comparable to Structured SSMs (S4), with two orders of magnitude reduction in energy consumption, outperforming Transformer on Long Range Arena tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03530v2-abstract-full').style.display = 'none'; document.getElementById('2410.03530v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2208.04933 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02279">arXiv:2410.02279</a> <span> [<a href="https://arxiv.org/pdf/2410.02279">pdf</a>, <a href="https://arxiv.org/format/2410.02279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> On Lai's Upper Confidence Bound in Multi-Armed Bandits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+H">Huachen Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cun-Hui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02279v2-abstract-short" style="display: inline;"> In this memorial paper, we honor Tze Leung Lai's seminal contributions to the topic of multi-armed bandits, with a specific focus on his pioneering work on the upper confidence bound. We establish sharp non-asymptotic regret bounds for an upper confidence bound index with a constant level of exploration for Gaussian rewards. Furthermore, we establish a non-asymptotic regret bound for the upper con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02279v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02279v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02279v2-abstract-full" style="display: none;"> In this memorial paper, we honor Tze Leung Lai's seminal contributions to the topic of multi-armed bandits, with a specific focus on his pioneering work on the upper confidence bound. We establish sharp non-asymptotic regret bounds for an upper confidence bound index with a constant level of exploration for Gaussian rewards. 
Furthermore, we establish a non-asymptotic regret bound for the upper confidence bound index of Lai (1987) which employs an exploration function that decreases with the sample size of the corresponding arm. The regret bounds have leading constants that match the Lai-Robbins lower bound. Our results highlight an aspect of Lai's seminal works that deserves more attention in the machine learning literature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02279v2-abstract-full').style.display = 'none'; document.getElementById('2410.02279v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62L05; 62L10 (Primary) 68T05 (Secondary) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16063">arXiv:2409.16063</a> <span> [<a href="https://arxiv.org/pdf/2409.16063">pdf</a>, <a href="https://arxiv.org/format/2409.16063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Robustness of Endoscopic Depth Estimation with Synthetically Corrupted Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Haochen Yin</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Beilei Cui</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16063v1-abstract-short" style="display: inline;"> Accurate depth perception is crucial for patient outcomes in endoscopic surgery, yet it is compromised by image distortions common in surgical settings. To tackle this issue, our study presents a benchmark for assessing the robustness of endoscopic depth estimation models. We have compiled a comprehensive dataset that reflects real-world conditions, incorporating a range of synthetically induced c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16063v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16063v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16063v1-abstract-full" style="display: none;"> Accurate depth perception is crucial for patient outcomes in endoscopic surgery, yet it is compromised by image distortions common in surgical settings. 
To tackle this issue, our study presents a benchmark for assessing the robustness of endoscopic depth estimation models. We have compiled a comprehensive dataset that reflects real-world conditions, incorporating a range of synthetically induced corruptions at varying severity levels. To further this effort, we introduce the Depth Estimation Robustness Score (DERS), a novel metric that combines measures of error, accuracy, and robustness to meet the multifaceted requirements of surgical applications. This metric acts as a foundational element for evaluating performance, establishing a new paradigm for the comparative analysis of depth estimation technologies. Additionally, we set forth a benchmark focused on robustness for the evaluation of depth estimation in endoscopic surgery, with the aim of driving progress in model refinement. A thorough analysis of two monocular depth estimation models using our framework reveals crucial information about their reliability under adverse conditions. Our results emphasize the essential need for algorithms that can tolerate data corruption, thereby advancing discussions on improving model robustness. The impact of this research transcends theoretical frameworks, providing concrete gains in surgical precision and patient safety. This study establishes a benchmark for the robustness of depth estimation and serves as a foundation for developing more resilient surgical support technologies. Code is available at https://github.com/lofrienger/EndoDepthBenchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16063v1-abstract-full').style.display = 'none'; document.getElementById('2409.16063v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
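<p class="is-size-7">The benchmark described above evaluates depth models on synthetically corrupted inputs at several severity levels; the exact corruption suite and the DERS metric are defined in the linked EndoDepthBenchmark repository and are not reproduced here. The snippet below is only a generic corruption-robustness evaluation loop, with a Gaussian-noise corruption and the standard absolute-relative depth error standing in for the paper's components.</p>
<pre><code>
# Generic corruption-robustness evaluation loop (illustration only; the real
# corruption suite and the DERS metric live in the EndoDepthBenchmark repo).
import numpy as np

def add_gaussian_noise(img, severity):
    # One example synthetic corruption, strength increasing with severity.
    sigma = 0.05 * severity
    return np.clip(img + np.random.normal(0.0, sigma, img.shape), 0.0, 1.0)

def abs_rel_error(pred, gt, eps=1e-6):
    # Standard absolute-relative depth error.
    return float(np.mean(np.abs(pred - gt) / (gt + eps)))

def evaluate(model, images, gts, corruption, severities=(1, 2, 3, 4, 5)):
    # Mean error per severity level for one corruption type.
    scores = {}
    for s in severities:
        errs = [abs_rel_error(model(corruption(im, s)), gt)
                for im, gt in zip(images, gts)]
        scores[s] = float(np.mean(errs))
    return scores

# Toy usage with a dummy "depth model" that averages the colour channels.
dummy_model = lambda im: im.mean(axis=-1) + 0.5
imgs = [np.random.rand(8, 8, 3) for _ in range(4)]
gts = [np.random.rand(8, 8) + 0.5 for _ in range(4)]
print(evaluate(dummy_model, imgs, gts, add_gaussian_noise))
</code></pre>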
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at the Simulation and Synthesis in Medical Imaging (SASHIMI) workshop at MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13138">arXiv:2409.13138</a> <span> [<a href="https://arxiv.org/pdf/2409.13138">pdf</a>, <a href="https://arxiv.org/format/2409.13138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3670474.3685940">10.1145/3670474.3685940 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning to Compare Hardware Designs for High-Level Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunsheng Bai</a>, <a href="/search/cs?searchtype=author&query=Sohrabizadeh%2C+A">Atefeh Sohrabizadeh</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zijian Ding</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Rongjian Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weikai Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Ding Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yizhou Sun</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+J">Jason Cong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13138v1-abstract-short" style="display: inline;"> High-level synthesis (HLS) is an automated design process that transforms high-level code into hardware designs, enabling the rapid development of hardware accelerators. HLS relies on pragmas, which are directives inserted into the source code to guide the synthesis process, and pragmas have various settings and values that significantly impact the resulting hardware design. State-of-the-art ML-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13138v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13138v1-abstract-full" style="display: none;"> High-level synthesis (HLS) is an automated design process that transforms high-level code into hardware designs, enabling the rapid development of hardware accelerators. HLS relies on pragmas, which are directives inserted into the source code to guide the synthesis process, and pragmas have various settings and values that significantly impact the resulting hardware design. 
State-of-the-art ML-based HLS methods, such as HARP, first train a deep learning model, typically based on graph neural networks (GNNs) applied to graph-based representations of the source code and pragmas. They then perform design space exploration (DSE) to explore the pragma design space, rank candidate designs using the model, and return the top designs. However, traditional DSE methods face challenges due to the highly nonlinear relationship between pragma settings and performance metrics, along with complex interactions between pragmas that affect performance in non-obvious ways. To address these challenges, we propose compareXplore, a novel approach that learns to compare hardware designs for effective HLS optimization. CompareXplore introduces a hybrid loss function that combines pairwise preference learning with pointwise performance prediction, enabling the model to capture both relative preferences and absolute performance. Moreover, we introduce a novel node difference attention module that focuses on the most informative differences between designs, enabling the model to identify critical pragmas impacting performance. CompareXplore adopts a two-stage DSE, where a pointwise prediction model is used for the initial design pruning, followed by a pairwise comparison stage for precise performance verification. In extensive experiments, compareXplore achieves significant improvements in ranking metrics and generates high-quality HLS results for the selected designs, outperforming the existing SOTA method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13138v1-abstract-full').style.display = 'none'; document.getElementById('2409.13138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
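<p class="is-size-7">The compareXplore abstract above describes a hybrid loss that mixes pointwise performance prediction with pairwise preference learning. The sketch below only illustrates that combination on plain NumPy arrays of predicted scores; the mixing weight, the logistic preference term, and the toy data are assumptions for illustration, and the paper's GNN encoder and two-stage DSE are not reproduced.</p>
<pre><code>
# Sketch of a hybrid objective mixing pointwise regression with a pairwise
# preference term (illustration only, not the compareXplore implementation).
import numpy as np

def pointwise_loss(pred, target):
    # Mean squared error on absolute performance predictions.
    return float(np.mean((pred - target) ** 2))

def pairwise_loss(pred_a, pred_b, a_better):
    # Logistic preference loss: push pred_a above pred_b when design A wins.
    margin = np.where(a_better, pred_a - pred_b, pred_b - pred_a)
    return float(np.mean(np.log1p(np.exp(-margin))))

def hybrid_loss(pred_a, pred_b, tgt_a, tgt_b, a_better, alpha=0.5):
    point = 0.5 * (pointwise_loss(pred_a, tgt_a) + pointwise_loss(pred_b, tgt_b))
    pair = pairwise_loss(pred_a, pred_b, a_better)
    return alpha * point + (1.0 - alpha) * pair

# Toy batch of three design pairs with known quality targets.
pred_a, pred_b = np.array([0.9, 0.2, 0.6]), np.array([0.4, 0.7, 0.5])
tgt_a, tgt_b = np.array([1.0, 0.1, 0.8]), np.array([0.3, 0.9, 0.4])
print(hybrid_loss(pred_a, pred_b, tgt_a, tgt_b, tgt_a > tgt_b))
</code></pre>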
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in MLCAD 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 2024 ACM/IEEE International Symposium on Machine Learning for CAD (MLCAD '24), ACM, 2024, Article 2, 1-7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12993">arXiv:2409.12993</a> <span> [<a href="https://arxiv.org/pdf/2409.12993">pdf</a>, <a href="https://arxiv.org/format/2409.12993">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CraftRTL: High-quality Synthetic Data Generation for Verilog Code Models with Correct-by-Construction Non-Textual Representations and Targeted Code Repair </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingjie Liu</a>, <a href="/search/cs?searchtype=author&query=Tsai%2C+Y">Yun-Da Tsai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenfei Zhou</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12993v2-abstract-short" style="display: inline;"> Despite the significant progress made in code generation with large language models, challenges persist, especially with hardware description languages such as Verilog. This paper first presents an analysis of fine-tuned LLMs on Verilog coding, with synthetic data from prior methods. We identify two main issues: difficulties in handling non-textual representations (Karnaugh maps, state-transition… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12993v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12993v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12993v2-abstract-full" style="display: none;"> Despite the significant progress made in code generation with large language models, challenges persist, especially with hardware description languages such as Verilog. This paper first presents an analysis of fine-tuned LLMs on Verilog coding, with synthetic data from prior methods. We identify two main issues: difficulties in handling non-textual representations (Karnaugh maps, state-transition diagrams and waveforms) and significant variability during training with models randomly making "minor" mistakes. To address these limitations, we enhance data curation by creating correct-by-construction data targeting non-textual representations. Additionally, we introduce an automated framework that generates error reports from various model checkpoints and injects these errors into open-source code to create targeted code repair data. Our fine-tuned Starcoder2-15B outperforms prior state-of-the-art results by 3.8%, 10.9%, 6.6% for pass@1 on VerilogEval-Machine, VerilogEval-Human, and RTLLM. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12993v2-abstract-full').style.display = 'none'; document.getElementById('2409.12993v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12467">arXiv:2409.12467</a> <span> [<a href="https://arxiv.org/pdf/2409.12467">pdf</a>, <a href="https://arxiv.org/format/2409.12467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SurgPLAN++: Universal Surgical Phase Localization Network for Online and Offline Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xingjian Luo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinlin Wu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhen Lei</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a>, <a href="/search/cs?searchtype=author&query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongbin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12467v2-abstract-short" style="display: inline;"> Surgical phase recognition is critical for assisting surgeons in understanding surgical videos. Existing studies focused more on online surgical phase recognition, by leveraging preceding frames to predict the current frame. Despite great progress, they formulated the task as a series of frame-wise classification, which resulted in a lack of global context of the entire procedure and incoherent pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12467v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12467v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12467v2-abstract-full" style="display: none;"> Surgical phase recognition is critical for assisting surgeons in understanding surgical videos. Existing studies focused more on online surgical phase recognition, by leveraging preceding frames to predict the current frame. Despite great progress, they formulated the task as a series of frame-wise classification, which resulted in a lack of global context of the entire procedure and incoherent predictions. 
Moreover, besides online analysis, accurate offline surgical phase recognition is also in significant clinical need for retrospective analysis, and existing online algorithms do not fully analyze the entire video, thereby limiting accuracy in offline analysis. To overcome these challenges and enhance both online and offline inference capabilities, we propose a universal Surgical Phase Localization Network, named SurgPLAN++, with the principle of temporal detection. To ensure a global understanding of the surgical procedure, we devise a phase localization strategy for SurgPLAN++ to predict phase segments across the entire video through phase proposals. For online analysis, to generate high-quality phase proposals, SurgPLAN++ incorporates a data augmentation strategy to extend the streaming video into a pseudo-complete video through mirroring, center-duplication, and down-sampling. For offline analysis, SurgPLAN++ capitalizes on its global phase prediction framework to continuously refine preceding predictions during each online inference step, thereby significantly improving the accuracy of phase recognition. We perform extensive experiments to validate the effectiveness, and our SurgPLAN++ achieves remarkable performance in both online and offline modes, which outperforms state-of-the-art methods. The source code is available at https://github.com/franciszchen/SurgPLAN-Plus. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12467v2-abstract-full').style.display = 'none'; document.getElementById('2409.12467v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
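<p class="is-size-7">For online inference, the SurgPLAN++ abstract above mentions extending the streaming video into a pseudo-complete video through mirroring, center-duplication, and down-sampling. The snippet below is only a rough sketch of that idea on a list of frame indices: it implements mirroring and uniform down-sampling, while the exact center-duplication step and the real frame pipeline are not specified in the abstract and are omitted.</p>
<pre><code>
# Rough sketch of building a pseudo-complete clip from a streaming prefix of
# frames (illustration only; not the released SurgPLAN++ augmentation code).
def mirror_extend(frames):
    # Append the time-reversed prefix so the clip plays forward, then back.
    return frames + frames[::-1]

def downsample(frames, target_len):
    # Uniformly subsample the extended clip to a fixed pseudo-video length.
    if len(frames) <= target_len:
        return frames
    step = len(frames) / target_len
    return [frames[int(i * step)] for i in range(target_len)]

def pseudo_complete(stream_prefix, target_len=16):
    return downsample(mirror_extend(stream_prefix), target_len)

# Toy usage with frame indices standing in for real video frames.
print(pseudo_complete(list(range(10))))
</code></pre>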
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work is accepted by IEEE ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03421">arXiv:2409.03421</a> <span> [<a href="https://arxiv.org/pdf/2409.03421">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> F3T: A soft tactile unit with 3D force and temperature mathematical decoupling ability for robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiong Yang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hao Ren</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dong Guo</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhengrong Ling</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tieshan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gen Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yifeng Tang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haoxiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiale Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Hongyuan Chang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Jia Dong</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yajing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03421v1-abstract-short" style="display: inline;"> The human skin exhibits remarkable capability to perceive contact forces and environmental temperatures, providing intricate information essential for nuanced manipulation. Despite recent advancements in soft tactile sensors, a significant challenge remains in accurately decoupling signals - specifically, separating force from directional orientation and temperature - resulting in failure to meet the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03421v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03421v1-abstract-full" style="display: none;"> The human skin exhibits remarkable capability to perceive contact forces and environmental temperatures, providing intricate information essential for nuanced manipulation. Despite recent advancements in soft tactile sensors, a significant challenge remains in accurately decoupling signals - specifically, separating force from directional orientation and temperature - resulting in failure to meet the advanced application requirements of robots. This research proposes a multi-layered soft sensor unit (F3T) designed to achieve isolated measurements and mathematical decoupling of normal pressure, omnidirectional tangential forces, and temperature. We developed a circular coaxial magnetic film featuring a floating-mountain multi-layer capacitor, facilitating the physical decoupling of normal and tangential forces in all directions. Additionally, we incorporated an ion gel-based temperature sensing film atop the tactile sensor.
This sensor is resilient to external pressure and deformation, enabling it to measure temperature and, crucially, eliminate capacitor errors induced by environmental temperature changes. This innovative design allows for the decoupled measurement of multiple signals, paving the way for advancements in higher-level robot motion control, autonomous decision-making, and task planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03421v1-abstract-full').style.display = 'none'; document.getElementById('2409.03421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16455">arXiv:2408.16455</a> <span> [<a href="https://arxiv.org/pdf/2408.16455">pdf</a>, <a href="https://arxiv.org/format/2408.16455">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Addressing the Mutual Interference in Uplink ISAC Receivers: A Projection Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiyuan Yu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hong Ren</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+C">Cunhua Pan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+G">Gui Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruizhe Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengyu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16455v1-abstract-short" style="display: inline;"> Dual function radar and communication (DFRC) is a promising research direction within integrated sensing and communication (ISAC), improving hardware and spectrum efficiency by merging sensing and communication (S&C) functionalities into a shared platform. However, the DFRC receiver (DFRC-R) is tasked with both uplink communication signal detection and simultaneously target-related parameter estim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16455v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16455v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16455v1-abstract-full" style="display: none;"> Dual function radar and communication (DFRC) is a promising research direction within integrated sensing and communication (ISAC), improving hardware and spectrum efficiency by merging sensing and communication (S&C) functionalities into a shared platform. However, the DFRC receiver (DFRC-R) is tasked with both uplink communication signal detection and simultaneously target-related parameter estimation from the echoes, leading to issues with mutual interference. 
In this paper, a projection-based scheme is proposed to equivalently transform the joint signal detection and target estimation problem into a joint signal detection process across multiple snapshots. Compared with conventional successive interference cancellation (SIC) schemes, our proposed approach achieves a higher signal-to-noise ratio (SNR), and a higher ergodic rate when the radar signal is non-negligible. Nonetheless, it introduces an ill-conditioned signal detection problem, which is addressed using a non-linear detector. By jointly processing an increased number of snapshots, the proposed scheme can achieve high S&C performance simultaneously. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16455v1-abstract-full').style.display = 'none'; document.getElementById('2408.16455v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, accepted by IEEE WCL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16277">arXiv:2408.16277</a> <span> [<a href="https://arxiv.org/pdf/2408.16277">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fine-grained Classification of Port Wine Stains Using Optical Coherence Tomography Angiography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xiaofeng Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bowen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiwan Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+H">Haixia Qiu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wu Yuan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16277v1-abstract-short" style="display: inline;"> Accurate classification of port wine stains (PWS, vascular malformations present at birth), is critical for subsequent treatment planning. 
However, the current method of classifying PWS based on the external skin appearance rarely reflects the underlying angiopathological heterogeneity of PWS lesions, resulting in inconsistent outcomes with the common vascular-targeted photodynamic therapy (V-PDT)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16277v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16277v1-abstract-full" style="display: none;"> Accurate classification of port wine stains (PWS, vascular malformations present at birth), is critical for subsequent treatment planning. However, the current method of classifying PWS based on the external skin appearance rarely reflects the underlying angiopathological heterogeneity of PWS lesions, resulting in inconsistent outcomes with the common vascular-targeted photodynamic therapy (V-PDT) treatments. Conversely, optical coherence tomography angiography (OCTA) is an ideal tool for visualizing the vascular malformations of PWS. Previous studies have shown no significant correlation between OCTA quantitative metrics and the PWS subtypes determined by the current classification approach. This study proposes a new classification approach for PWS using both OCT and OCTA. By examining the hypodermic histopathology and vascular structure of PWS, we have devised a fine-grained classification method that subdivides PWS into five distinct types. To assess the angiopathological differences of various PWS subtypes, we have analyzed six metrics related to vascular morphology and depth information of PWS lesions. The five PWS types present significant differences across all metrics compared to the conventional subtypes. Our findings suggest that an angiopathology-based classification accurately reflects the heterogeneity in PWS lesions. This research marks the first attempt to classify PWS based on angiopathology, potentially guiding more effective subtyping and treatment strategies for PWS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16277v1-abstract-full').style.display = 'none'; document.getElementById('2408.16277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12775">arXiv:2408.12775</a> <span> [<a href="https://arxiv.org/pdf/2408.12775">pdf</a>, <a href="https://arxiv.org/format/2408.12775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Intelligent OPC Engineer Assistant for Semiconductor Manufacturing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guojin Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bei Yu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Haoxing Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12775v2-abstract-short" style="display: inline;"> Advancements in chip design and manufacturing have enabled the processing of complex tasks such as deep learning and natural language processing, paving the way for the development of artificial general intelligence (AGI). AI, on the other hand, can be leveraged to innovate and streamline semiconductor technology from planning and implementation to manufacturing. In this paper, we present \textit{… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12775v2-abstract-full').style.display = 'inline'; document.getElementById('2408.12775v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12775v2-abstract-full" style="display: none;"> Advancements in chip design and manufacturing have enabled the processing of complex tasks such as deep learning and natural language processing, paving the way for the development of artificial general intelligence (AGI). AI, on the other hand, can be leveraged to innovate and streamline semiconductor technology from planning and implementation to manufacturing. In this paper, we present \textit{Intelligent OPC Engineer Assistant}, an AI/LLM-powered methodology designed to solve the core manufacturing-aware optimization problem known as optical proximity correction (OPC). The methodology involves a reinforcement learning-based OPC recipe search and a customized multi-modal agent system for recipe summarization. Experiments demonstrate that our methodology can efficiently build OPC recipes on various chip designs with specially handled design topologies, a task that typically requires the full-time effort of OPC engineers with years of experience. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12775v2-abstract-full').style.display = 'none'; document.getElementById('2408.12775v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ren%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Ren%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> 
<div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>