
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;31 of 31 results for author: <span class="mathjax">Di, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Di%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Di, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Di%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Di, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02414">arXiv:2502.02414</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02414">pdf</a>, <a href="https://arxiv.org/format/2502.02414">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Transolver++: An Accurate Neural Solver for PDEs on Million-Scale Geometries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Huakun Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haixu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+L">Lanxiang Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yichen Di</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianmin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+M">Mingsheng Long</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02414v2-abstract-short" style="display: inline;"> Although deep models have been widely explored in solving partial differential equations (PDEs), previous works are primarily limited to data only with up to tens of thousands of mesh points, far from the million-point scale required by industrial simulations that involve complex geometries. In the spirit of advancing neural PDE solvers to real industrial applications, we present Transolver++, a h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02414v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02414v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02414v2-abstract-full" style="display: none;"> Although deep models have been widely explored in solving partial differential equations (PDEs), previous works are primarily limited to data only with up to tens of thousands of mesh points, far from the million-point scale required by industrial simulations that involve complex geometries. In the spirit of advancing neural PDE solvers to real industrial applications, we present Transolver++, a highly parallel and efficient neural solver that can accurately solve PDEs on million-scale geometries. 
Building upon previous advancements in solving PDEs by learning physical states via Transolver, Transolver++ is further equipped with an extremely optimized parallelism framework and a local adaptive mechanism to efficiently capture eidetic physical states from massive mesh points, successfully tackling the thorny challenges in computation and physics learning when scaling up input mesh size. Transolver++ increases the single-GPU input capacity to million-scale points for the first time and is capable of continuously scaling input size in linear complexity by increasing GPUs. Experimentally, Transolver++ yields 13% relative promotion across six standard PDE benchmarks and achieves over 20% performance gain in million-scale high-fidelity industrial simulations, whose sizes are 100$\times$ larger than previous benchmarks, covering car and 3D aircraft designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02414v2-abstract-full').style.display = 'none'; document.getElementById('2502.02414v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15727">arXiv:2409.15727</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15727">pdf</a>, <a href="https://arxiv.org/format/2409.15727">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LaPose: Laplacian Mixture Shape Modeling for RGB-Based Category-Level Object Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruida Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Ziqin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Gu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyangguang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yan Di</a>, <a href="/search/cs?searchtype=author&amp;query=Zuo%2C+X">Xingxing Zuo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jiwen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15727v1-abstract-short" style="display: inline;"> While RGBD-based methods for category-level object pose estimation hold promise, their reliance on depth data limits their applicability in diverse scenarios. In response, recent efforts have turned to RGB-based methods; however, they face significant challenges stemming from the absence of depth information. 
2. arXiv:2409.15727 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
   LaPose: Laplacian Mixture Shape Modeling for RGB-Based Category-Level Object Pose Estimation
   Authors: Ruida Zhang, Ziqin Huang, Gu Wang, Chenyangguang Zhang, Yan Di, Xingxing Zuo, Jiwen Tang, Xiangyang Ji
   Abstract: While RGBD-based methods for category-level object pose estimation hold promise, their reliance on depth data limits their applicability in diverse scenarios. In response, recent efforts have turned to RGB-based methods; however, they face significant challenges stemming from the absence of depth information. On one hand, the lack of depth exacerbates the difficulty of handling intra-class shape variation, resulting in increased uncertainty in shape predictions. On the other hand, RGB-only inputs introduce inherent scale ambiguity, rendering the estimation of object size and translation an ill-posed problem. To tackle these challenges, we propose LaPose, a novel framework that models the object shape as a Laplacian mixture model for pose estimation. By representing each point as a probabilistic distribution, we explicitly quantify the shape uncertainty. LaPose leverages both a generalized 3D information stream and a specialized feature stream to independently predict the Laplacian distribution for each point, capturing different aspects of object geometry. These two distributions are then integrated as a Laplacian mixture model to establish the 2D-3D correspondences, which are used to solve the pose via a PnP module. To mitigate scale ambiguity, we introduce a scale-agnostic representation for object size and translation, enhancing training efficiency and overall robustness. Extensive experiments on the NOCS datasets validate the effectiveness of LaPose, yielding state-of-the-art performance in RGB-based category-level object pose estimation. Code is released at https://github.com/lolrudy/LaPose
   Submitted 24 September, 2024; originally announced September 2024.
   Comments: Accepted by ECCV 2024

3. arXiv:2409.05112 [pdf, other] cs.CL (Computation and Language)
   WaterSeeker: Pioneering Efficient Detection of Watermarked Segments in Large Documents
   Authors: Leyi Pan, Aiwei Liu, Yijian Lu, Zitian Gao, Yichen Di, Shiyu Huang, Lijie Wen, Irwin King, Philip S. Yu
   Abstract: Watermarking algorithms for large language models (LLMs) have attained high accuracy in detecting LLM-generated text. However, existing methods primarily focus on distinguishing fully watermarked text from non-watermarked text, overlooking real-world scenarios where LLMs generate only small sections within large documents. In this scenario, balancing time complexity and detection performance poses significant challenges. This paper presents WaterSeeker, a novel approach to efficiently detect and locate watermarked segments amid extensive natural text. It first applies an efficient anomaly-extraction method to coarsely locate suspicious watermarked regions, then conducts a local traversal with full-text detection over those regions for more precise verification. Theoretical analysis and experimental results demonstrate that WaterSeeker achieves a superior balance between detection accuracy and computational efficiency. Moreover, its localization capability lays the foundation for building interpretable AI detection systems. Our code is available at https://github.com/THU-BPM/WaterSeeker.
   Submitted 26 January, 2025; v1 submitted 8 September, 2024; originally announced September 2024.
   Comments: NAACL 2025 Findings
   MSC Class: 68T50; ACM Class: I.2.7

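The two-stage idea above can be illustrated with a short sketch (not the THU-BPM implementation): `scores` stands in for per-token watermark statistics from any scheme, and `detect` is assumed to be that scheme's exact full-text test returning a p-value.

```python
# Sketch under stated assumptions: a cheap sliding-window scan flags
# suspicious spans; the exact detector then refines span boundaries locally.
import numpy as np

def find_suspicious_windows(scores, win=50, z_thresh=2.0):
    """Stage 1: flag windows whose mean score is anomalously high."""
    s = np.asarray(scores, dtype=float)
    mu, sigma = s.mean(), s.std() + 1e-8
    means = np.convolve(s, np.ones(win) / win, mode="valid")
    z = (means - mu) / (sigma / np.sqrt(win))  # z-score of each window mean
    flagged = np.flatnonzero(z > z_thresh)
    spans, start, end = [], None, None
    for i in flagged:  # merge overlapping flagged windows into candidate spans
        if start is None:
            start, end = int(i), int(i) + win
        elif i <= end:
            end = int(i) + win
        else:
            spans.append((start, end))
            start, end = int(i), int(i) + win
    if start is not None:
        spans.append((start, end))
    return spans

def refine(span, scores, detect, pad=20, min_len=10):
    """Stage 2: local traversal around a candidate span with the exact test."""
    start, end = span
    best_p, best_span = 1.0, span
    for a in range(max(0, start - pad), start + pad):
        for b in range(max(a + min_len, end - pad), min(len(scores), end + pad) + 1):
            p = detect(scores[a:b])
            if p < best_p:
                best_p, best_span = p, (a, b)
    return best_p, best_span
```
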
4. arXiv:2405.10557 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
   Resolving Symmetry Ambiguity in Correspondence-based Methods for Instance-level Object Pose Estimation
   Authors: Yongliang Lin, Yongzhi Su, Sandeep Inuganti, Yan Di, Naeem Ajilforoushan, Hanqing Yang, Yu Zhang, Jason Rambach
   Abstract: Estimating the 6D pose of an object from a single RGB image is a critical task that becomes additionally challenging when dealing with symmetric objects. Recent approaches typically establish one-to-one correspondences between image pixels and 3D object surface vertices. However, one-to-one correspondences introduce ambiguity for symmetric objects. To address this, we propose SymCode, a symmetry-aware surface encoding that encodes the object surface vertices based on one-to-many correspondences, eliminating the one-to-one correspondence ambiguity. We also introduce SymNet, a fast end-to-end network that directly regresses the 6D pose parameters without solving a PnP problem. We demonstrate faster runtime and comparable accuracy on the T-LESS and IC-BIN benchmarks of mostly symmetric objects. Our source code will be released upon acceptance.
   Submitted 17 May, 2024; originally announced May 2024.
   Comments: 8 pages, 10 figures

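One hedged way to picture a symmetry-aware, one-to-many encoding (an illustration of the stated idea, not the actual SymCode definition): all vertices related by a known object symmetry are mapped to a single canonical code, so the network's regression target is unambiguous for symmetric objects.

```python
# Illustrative sketch, assuming the object's discrete symmetry rotations
# (including the identity) are known as a list of (3, 3) matrices.
import numpy as np

def symmetry_aware_codes(vertices, sym_rotations):
    """Assign each vertex the canonical member of its symmetry orbit."""
    # orbits[i, s] is vertex i transformed by the s-th symmetry rotation
    orbits = np.stack([vertices @ R.T for R in sym_rotations], axis=1)
    orbits = np.round(orbits, 6)  # guard the tie-break against float noise
    # canonical representative: lexicographically smallest orbit member
    codes = np.array([min(map(tuple, orbit)) for orbit in orbits])
    return codes  # (N, 3); symmetry-related vertices share identical codes
```
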
5. arXiv:2405.00915 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
   EchoScene: Indoor Scene Generation via Information Echo over Scene Graph Diffusion
   Authors: Guangyao Zhai, Evin Pınar Örnek, Dave Zhenyu Chen, Ruotong Liao, Yan Di, Nassir Navab, Federico Tombari, Benjamin Busam
   Abstract: We present EchoScene, an interactive and controllable generative model that generates 3D indoor scenes from scene graphs. EchoScene leverages a dual-branch diffusion model that dynamically adapts to scene graphs. Existing methods struggle to handle scene graphs due to varying numbers of nodes, multiple edge combinations, and manipulator-induced node-edge operations. EchoScene overcomes this by associating each node with a denoising process and enabling collaborative information exchange, enhancing controllable and consistent generation that is aware of global constraints. This is achieved through an information echo scheme in both the shape and layout branches. At every denoising step, all processes share their denoising data with an information exchange unit that combines these updates using graph convolution. The scheme ensures that the denoising processes are influenced by a holistic understanding of the scene graph, facilitating the generation of globally coherent scenes. The resulting scenes can be manipulated during inference by editing the input scene graph and sampling the noise in the diffusion model. Extensive experiments validate our approach, which maintains scene controllability and surpasses previous methods in generation fidelity. Moreover, the generated scenes are of high quality and thus directly compatible with off-the-shelf texture generation. Code and trained models are open-sourced.
   Submitted 1 May, 2024; originally announced May 2024.
   Comments: 25 pages, 10 figures

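A toy sketch of the information-echo step as described above, with assumed shapes and a stand-in `denoiser` callable; the real model couples separate shape and layout branches, which this single-branch sketch omits.

```python
# Sketch under stated assumptions: each scene-graph node runs its own
# denoising process, and per-node updates are shared through a
# graph-convolution-style exchange at every step.
import torch

def echo_step(x, adj, denoiser, t):
    """x: (N, D) per-node latents; adj: (N, N) 0/1 adjacency with self-loops;
    denoiser(x, t) -> (N, D) per-node denoising updates at timestep t."""
    local = denoiser(x, t)                     # each node denoises on its own
    deg = adj.sum(dim=1, keepdim=True).clamp(min=1.0)
    echoed = (adj @ local) / deg               # average updates over neighbors
    return x + 0.5 * (local + echoed)          # blend local and echoed updates
```
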
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 IEEE World Forum on Public Safety Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11324">arXiv:2403.11324</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11324">pdf</a>, <a href="https://arxiv.org/format/2403.11324">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeoGaussian: Geometry-aware Gaussian Splatting for Scene Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yanyan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+C">Chenyu Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yan Di</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangyao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+G+H">Gim Hee Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Tombari%2C+F">Federico Tombari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11324v2-abstract-short" style="display: inline;"> During the Gaussian Splatting optimization process, the scene&#39;s geometry can gradually deteriorate if its structure is not deliberately preserved, especially in non-textured regions such as walls, ceilings, and furniture surfaces. This degradation significantly affects the rendering quality of novel views that deviate significantly from the viewpoints in the training data. To mitigate this issue,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11324v2-abstract-full').style.display = 'inline'; document.getElementById('2403.11324v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11324v2-abstract-full" style="display: none;"> During the Gaussian Splatting optimization process, the scene&#39;s geometry can gradually deteriorate if its structure is not deliberately preserved, especially in non-textured regions such as walls, ceilings, and furniture surfaces. This degradation significantly affects the rendering quality of novel views that deviate significantly from the viewpoints in the training data. To mitigate this issue, we propose a novel approach called GeoGaussian. Based on the smoothly connected areas observed from point clouds, this method introduces a novel pipeline to initialize thin Gaussians aligned with the surfaces, where the characteristic can be transferred to new generations through a carefully designed densification strategy. Finally, the pipeline ensures that the scene&#39;s geometry and texture are maintained through constrained optimization processes with explicit geometry constraints. Benefiting from the proposed architecture, the generative ability of 3D Gaussians is enhanced, especially in structured regions. Our proposed pipeline achieves state-of-the-art performance in novel view synthesis and geometric reconstruction, as evaluated qualitatively and quantitatively on public datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11324v2-abstract-full').style.display = 'none'; document.getElementById('2403.11324v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10099">arXiv:2403.10099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10099">pdf</a>, <a href="https://arxiv.org/format/2403.10099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and Deformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruida Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyangguang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yan Di</a>, <a href="/search/cs?searchtype=author&amp;query=Manhardt%2C+F">Fabian Manhardt</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xingyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10099v3-abstract-short" style="display: inline;"> In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and Deformation framework that takes object scans as input and jointly retrieves and deforms the most geometrically similar CAD models from a pre-processed database to tightly match the target. Unlike existing dense matching based methods that typically struggle with noisy partial scans, we propose to leverage category-consisten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10099v3-abstract-full').style.display = 'inline'; document.getElementById('2403.10099v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10099v3-abstract-full" style="display: none;"> In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and Deformation framework that takes object scans as input and jointly retrieves and deforms the most geometrically similar CAD models from a pre-processed database to tightly match the target. Unlike existing dense matching based methods that typically struggle with noisy partial scans, we propose to leverage category-consistent sparse keypoints to naturally handle both full and partial object scans. 
8. arXiv:2403.10099 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
   KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and Deformation
   Authors: Ruida Zhang, Chenyangguang Zhang, Yan Di, Fabian Manhardt, Xingyu Liu, Federico Tombari, Xiangyang Ji
   Abstract: In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and Deformation framework that takes object scans as input and jointly retrieves and deforms the most geometrically similar CAD models from a pre-processed database to tightly match the target. Unlike existing dense-matching-based methods that typically struggle with noisy partial scans, we propose to leverage category-consistent sparse keypoints to naturally handle both full and partial object scans. Specifically, we first employ a lightweight retrieval module to establish a keypoint-based embedding space, measuring the similarity among objects by dynamically aggregating deformation-aware local-global features around the extracted keypoints. Objects that are close in the embedding space are considered similar in geometry. We then introduce a neural cage-based deformation module that estimates the influence vector of each keypoint upon the cage vertices inside its local support region, controlling the deformation of the retrieved shape. Extensive experiments on the synthetic dataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED surpasses existing state-of-the-art approaches by a large margin. Code and trained models are released at https://github.com/lolrudy/KP-RED.
   Submitted 3 December, 2024; v1 submitted 15 March, 2024; originally announced March 2024.
   Comments: Accepted by CVPR 2024. We identified an error in our baseline experiments, re-ran them, and updated the results without impacting the paper's conclusions. We apologize for the oversight and appreciate your understanding.

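A minimal sketch of keypoint-based retrieval in the spirit of the KP-RED abstract, with assumed shapes ((K, D) per-keypoint features) and a plain L2 nearest-neighbor lookup; the deformation-aware feature extractor is the learned part and is omitted here.

```python
# Sketch under stated assumptions: embed each object by aggregating its
# per-keypoint features into one token, then retrieve the nearest database
# shape; the retrieved shape would subsequently be deformed to fit.
import numpy as np

def embed(keypoint_feats):
    """(K, D) per-keypoint features -> one flat, unit-norm retrieval token."""
    # mean-centering makes the token less sensitive to global feature shifts
    f = keypoint_feats - keypoint_feats.mean(axis=0, keepdims=True)
    return (f / (np.linalg.norm(f) + 1e-8)).ravel()

def retrieve(query_feats, database_feats):
    """Return the index of the geometrically most similar database shape."""
    q = embed(query_feats)
    d = np.stack([embed(f) for f in database_feats])
    return int(np.argmin(np.linalg.norm(d - q, axis=1)))
```
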
9. arXiv:2401.01970 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   FMGS: Foundation Model Embedded 3D Gaussian Splatting for Holistic 3D Scene Understanding
   Authors: Xingxing Zuo, Pouya Samangouei, Yunwen Zhou, Yan Di, Mingyang Li
   Abstract: Precisely perceiving the geometric and semantic properties of real-world 3D objects is crucial for the continued evolution of augmented reality and robotic applications. To this end, we present Foundation Model Embedded Gaussian Splatting (FMGS), which incorporates vision-language embeddings of foundation models into 3D Gaussian Splatting (GS). The key contribution of this work is an efficient method to reconstruct and represent 3D vision-language models. This is achieved by distilling feature maps generated from image-based foundation models into those rendered from our 3D model. To ensure high-quality rendering and fast training, we introduce a novel scene representation that integrates strengths from both GS and multi-resolution hash encodings (MHE). Our effective training procedure also introduces a pixel-alignment loss that pulls rendered features of the same semantic entity close together, following pixel-level semantic boundaries. Our results demonstrate remarkable multi-view semantic consistency, facilitating diverse downstream tasks and outperforming state-of-the-art methods by 10.2 percent on open-vocabulary language-based object detection while being 851x faster at inference. This research explores the intersection of vision, language, and 3D scene representation, paving the way for enhanced scene understanding in uncontrolled real-world environments. We plan to release the code on the project page.
   Submitted 3 May, 2024; v1 submitted 3 January, 2024; originally announced January 2024.
   Comments: Project page: https://xingxingzuo.github.io/fmgs

10. arXiv:2311.14189 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
    D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object Reconstruction
    Authors: Bowen Fu, Gu Wang, Chenyangguang Zhang, Yan Di, Ziqin Huang, Zhiying Leng, Fabian Manhardt, Xiangyang Ji, Federico Tombari
    Abstract: Reconstructing hand-held objects from a single RGB image is a challenging task in computer vision. In contrast to prior works that utilize deterministic modeling paradigms, we employ a point cloud denoising diffusion model to account for the probabilistic nature of this problem. At its core, we introduce centroid-fixed dual-stream conditional diffusion for monocular hand-held object reconstruction (D-SCo), tackling two predominant challenges. First, to prevent the object centroid from drifting, we utilize a novel hand-constrained centroid-fixing paradigm, enhancing the stability of the diffusion and reverse processes and the precision of feature projection. Second, we introduce a dual-stream denoiser to semantically and geometrically model hand-object interactions with a novel unified hand-object semantic embedding, enhancing reconstruction performance in the hand-occluded region of the object. Experiments on the synthetic ObMan dataset and three real-world datasets, HO3D, MOW, and DexYCB, demonstrate that our approach surpasses all other state-of-the-art methods.
    Submitted 5 September, 2024; v1 submitted 23 November, 2023; originally announced November 2023.
    Comments: ECCV 2024

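A sketch of the centroid-fixing idea under stated assumptions (a scalar `alpha_bar` from a DDPM-style noise schedule, a centroid predicted from the hand stream); this is an illustration of the stated paradigm, not D-SCo's code.

```python
# Illustrative sketch: diffuse only the zero-mean residual around a fixed,
# hand-predicted centroid so the object's centroid cannot drift.
import torch

def centroid_fixed_noising(points, centroid, alpha_bar):
    """points: (N, 3) object point cloud; centroid: (3,) predicted centroid;
    alpha_bar: cumulative noise-schedule coefficient at the current step."""
    residual = points - centroid                       # diffuse only the shape
    noise = torch.randn_like(residual)
    noise = noise - noise.mean(dim=0, keepdim=True)    # keep the noise zero-mean
    noisy = alpha_bar**0.5 * residual + (1.0 - alpha_bar)**0.5 * noise
    return noisy + centroid                            # centroid stays fixed
```
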
11. arXiv:2311.12588 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
    HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning for RGB-D 6DoF Object Pose Estimation
    Authors: Yongliang Lin, Yongzhi Su, Praveen Nathan, Sandeep Inuganti, Yan Di, Martin Sundermeyer, Fabian Manhardt, Didier Stricker, Jason Rambach, Yu Zhang
    Abstract: In this work, we present a novel dense-correspondence method for 6DoF object pose estimation from a single RGB-D image. While many existing data-driven methods achieve impressive performance, they tend to be time-consuming due to their reliance on rendering-based refinement approaches. To circumvent this limitation, we present HiPose, which establishes 3D-3D correspondences in a coarse-to-fine manner with a hierarchical binary surface encoding. Unlike previous dense-correspondence methods, we estimate the correspondence surface by employing point-to-surface matching and iteratively constricting the surface until it becomes a correspondence point, gradually removing outliers along the way. Extensive experiments on the public benchmarks LM-O, YCB-V, and T-Less demonstrate that our method surpasses all refinement-free methods and is even on par with expensive refinement-based approaches. Crucially, our approach is computationally efficient and enables real-time-critical applications with high accuracy requirements.
    Submitted 7 April, 2024; v1 submitted 21 November, 2023; originally announced November 2023.
    Comments: CVPR 2024

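The coarse-to-fine constriction can be pictured as walking down a binary code tree until the per-bit confidence drops, then matching against whatever surface region remains. A sketch with an assumed `code_table` mapping bit prefixes to surface points (the empty prefix covering the whole surface); this is not the HiPose release.

```python
# Illustrative sketch: constrict the candidate surface region bit by bit,
# stopping early when the predicted bits become unreliable, and treat
# unmatched pixels as outliers.
import numpy as np

def match_point(pred_bits, pred_conf, code_table, conf_thresh=0.7):
    """pred_bits: (L,) predicted code bits for one pixel; pred_conf: (L,)
    per-bit confidences; code_table: dict bit-prefix -> array of 3D points."""
    prefix = ""
    for bit, conf in zip(pred_bits, pred_conf):
        if conf < conf_thresh:
            break                       # stop constricting on unreliable bits
        prefix += str(int(bit))
    candidates = code_table.get(prefix)
    if candidates is None:
        return None                     # outlier: no surface region matches
    return np.mean(candidates, axis=0)  # center of the remaining region
```
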
To address this issue, we present SecondPose, a novel approach integrating object-specific geometric features with semantic category priors fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11125v3-abstract-full').style.display = 'inline'; document.getElementById('2311.11125v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.11125v3-abstract-full" style="display: none;"> Category-level object pose estimation, aiming to predict the 6D pose and 3D size of objects from known categories, typically struggles with large intra-class shape variation. Existing works utilizing mean shapes often fall short of capturing this variation. To address this issue, we present SecondPose, a novel approach integrating object-specific geometric features with semantic category priors from DINOv2. Leveraging the advantage of DINOv2 in providing SE(3)-consistent semantic features, we hierarchically extract two types of SE(3)-invariant geometric features to further encapsulate local-to-global object-specific information. These geometric features are then point-aligned with DINOv2 features to establish a consistent object representation under SE(3) transformations, facilitating the mapping from camera space to the pre-defined canonical space, thus further enhancing pose estimation. Extensive experiments on NOCS-REAL275 demonstrate that SecondPose achieves a 12.4% leap forward over the state-of-the-art. Moreover, on a more complex dataset HouseCat6D which provides photometrically challenging objects, SecondPose still surpasses other competitors by a large margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11125v3-abstract-full').style.display = 'none'; document.getElementById('2311.11125v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 accepted. 
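The SE(3)-invariant geometric features above can be grounded with a toy example: descriptors built purely from inter-point distances are unchanged by any rigid transform. The k-nearest-neighbor distance descriptor below is an illustrative stand-in, not the paper's actual features; the check verifies the invariance numerically.

    import numpy as np

    def knn_distance_feature(points, k=8):
        """Toy SE(3)-invariant descriptor: sorted distances to the k nearest neighbors."""
        d = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
        return np.sort(d, axis=1)[:, 1:k + 1]    # drop the zero self-distance

    rng = np.random.default_rng(0)
    pts = rng.normal(size=(64, 3))
    R, _ = np.linalg.qr(rng.normal(size=(3, 3)))  # random orthogonal matrix
    if np.linalg.det(R) < 0:
        R[:, 0] *= -1                             # make it a proper rotation
    t = rng.normal(size=3)
    # The descriptor is identical before and after the rigid transform.
    assert np.allclose(knn_distance_feature(pts),
                       knn_distance_feature(pts @ R.T + t), atol=1e-8)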
arXiv:2311.11106 [cs.CV]
ShapeMatcher: Self-Supervised Joint Shape Canonicalization, Segmentation, Retrieval and Deformation
Authors: Yan Di, Chenyangguang Zhang, Chaowei Wang, Ruida Zhang, Guangyao Zhai, Yanyan Li, Bowen Fu, Xiangyang Ji, Shan Gao
Abstract: In this paper, we present ShapeMatcher, a unified self-supervised learning framework for joint shape canonicalization, segmentation, retrieval and deformation. Given a partially-observed object in an arbitrary pose, we first canonicalize the object by extracting point-wise affine-invariant features, disentangling the inherent structure of the object from its pose and size. These learned features are then leveraged to predict semantically consistent part segmentation and corresponding part centers. Next, our lightweight retrieval module aggregates the features within each part as its retrieval token and compares all the tokens with source shapes from a pre-established database to identify the most geometrically similar shape. Finally, we deform the retrieved shape in the deformation module to tightly fit the input object by harnessing part-center-guided neural cage deformation. The key insight of ShapeMatcher is the simultaneous training of the four highly associated processes: canonicalization, segmentation, retrieval and deformation, leveraging cross-task consistency losses for mutual supervision. Extensive experiments on the synthetic datasets PartNet and ComplementMe and the real-world dataset Scan2CAD demonstrate that ShapeMatcher surpasses competitors by a large margin.
Submitted 11 March, 2024; v1 submitted 18 November, 2023; originally announced November 2023.
Comments: CVPR 2024

arXiv:2310.13819 [cs.RO]
LanPose: Language-Instructed 6D Object Pose Estimation for Robotic Assembly
Authors: Bowen Fu, Sek Kun Leong, Yan Di, Jiwen Tang, Xiangyang Ji
Abstract: Comprehending natural language instructions is a critical skill for robots to cooperate effectively with humans. In this paper, we aim to learn 6D poses for robotic assembly from natural language instructions. For this purpose, the Language-Instructed 6D Pose Regression Network (LanPose) is proposed to jointly predict the 6D poses of the observed object and the corresponding assembly position. Our proposed approach is based on the fusion of geometric and linguistic features, which allows us to finely integrate multi-modality input and map it to the 6D pose in SE(3) space by the cross-attention mechanism and the language-integrated 6D pose mapping module, respectively. To validate the effectiveness of our approach, an integrated robotic system is established to precisely and robustly perceive, grasp, manipulate and assemble blocks by language commands. Scores of 98.09 and 93.55 in ADD(-S)-0.1d are achieved for the prediction of the 6D object pose and the 6D assembly pose, respectively. Both quantitative and qualitative results demonstrate the effectiveness of our proposed language-instructed 6D pose estimation methodology and its potential to enable robots to better understand and execute natural language instructions.
Submitted 20 October, 2023; originally announced October 2023.
Comments: 8 pages
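For readers unfamiliar with the ADD(-S)-0.1d score quoted above: a pose counts as correct when the average distance between model points under the ground-truth pose and the predicted pose falls below 10% of the object diameter, with closest-point distances (ADD-S) used for symmetric objects. A minimal sketch of the metric, with illustrative names:

    import numpy as np

    def add_error(pts, R_gt, t_gt, R_pr, t_pr, symmetric=False):
        """Mean distance between model points under the GT and predicted poses."""
        gt = pts @ R_gt.T + t_gt
        pr = pts @ R_pr.T + t_pr
        if symmetric:  # ADD-S: nearest-neighbor distance, tolerant to symmetry
            d = np.linalg.norm(gt[:, None, :] - pr[None, :, :], axis=-1).min(axis=1)
        else:          # ADD: point-to-point distance
            d = np.linalg.norm(gt - pr, axis=-1)
        return d.mean()

    def add_01d_score(errors, diameter):
        """ADD(-S)-0.1d: percentage of test poses with error below 0.1 * diameter."""
        return 100.0 * np.mean(np.asarray(errors) < 0.1 * diameter)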
arXiv:2310.11696 [cs.CV]
MOHO: Learning Single-view Hand-held Object Reconstruction with Multi-view Occlusion-Aware Supervision
Authors: Chenyangguang Zhang, Guanlong Jiao, Yan Di, Gu Wang, Ziqin Huang, Ruida Zhang, Fabian Manhardt, Bowen Fu, Federico Tombari, Xiangyang Ji
Abstract: Previous works concerning single-view hand-held object reconstruction typically rely on supervision from 3D ground-truth models, which are hard to collect in the real world. In contrast, readily accessible hand-object videos offer a promising training data source, but they only give heavily occluded object observations. In this paper, we present a novel synthetic-to-real framework to exploit Multi-view Occlusion-aware supervision from hand-object videos for Hand-held Object reconstruction (MOHO) from a single image, tackling two predominant challenges in this setting: hand-induced occlusion and the object's self-occlusion. First, in the synthetic pre-training stage, we render a large-scale synthetic dataset SOMVideo with hand-object images and multi-view occlusion-free supervision, adopted to address hand-induced occlusion in both 2D and 3D spaces. Second, in the real-world finetuning stage, MOHO leverages amodal-mask-weighted geometric supervision to mitigate the unfaithful guidance caused by hand-occluded supervising views in the real world. Moreover, domain-consistent occlusion-aware features are amalgamated in MOHO to resist the object's self-occlusion, enabling inference of the complete object shape. Extensive experiments on the HO3D and DexYCB datasets demonstrate that 2D-supervised MOHO surpasses 3D-supervised methods by a large margin.
Submitted 13 March, 2024; v1 submitted 17 October, 2023; originally announced October 2023.
Comments: CVPR 2024
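The amodal-mask-weighted supervision mentioned above can be pictured as a per-pixel reconstruction loss whose weights follow the amodal (full-extent) object mask, with hand-occluded pixels contributing less. The sketch below is a simplified guess at such a weighting, not the paper's exact scheme; all names and the down-weighting factor are illustrative.

    import torch

    def amodal_weighted_loss(pred, target, amodal_mask, occlusion_mask):
        """L1 supervision weighted by the amodal object mask, down-weighting
        pixels that the hand occludes in the supervising view.

        pred, target:   (B, H, W) rendered vs. supervising-view values
        amodal_mask:    (B, H, W) full object extent, 1 inside the object
        occlusion_mask: (B, H, W) 1 where the hand covers the object
        """
        # 0.9 is an illustrative down-weighting factor, not from the paper
        w = amodal_mask * (1.0 - 0.9 * occlusion_mask)
        return (w * (pred - target).abs()).sum() / w.sum().clamp(min=1.0)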
arXiv:2309.16205 [cs.CV, eess.IV]
DiffGAN-F2S: Symmetric and Efficient Denoising Diffusion GANs for Structural Connectivity Prediction from Brain fMRI
Authors: Qiankun Zuo, Ruiheng Li, Yi Di, Hao Tian, Changhong Jing, Xuhang Chen, Shuqiang Wang
Abstract: Mapping from functional connectivity (FC) to structural connectivity (SC) can facilitate multimodal brain network fusion and discover potential biomarkers for clinical implications. However, it is challenging to directly bridge the reliable non-linear mapping relations between SC and functional magnetic resonance imaging (fMRI). In this paper, a novel diffusion generative adversarial network-based fMRI-to-SC (DiffGAN-F2S) model is proposed to predict SC from brain fMRI in an end-to-end manner. To be specific, the proposed DiffGAN-F2S leverages denoising diffusion probabilistic models (DDPMs) and adversarial learning to efficiently generate high-fidelity SC in a few steps from fMRI. By designing the dual-channel multi-head spatial attention (DMSA) and graph convolutional modules, the symmetric graph generator first captures global relations among directly and indirectly connected brain regions, then models the local brain-region interactions. It can uncover the complex mapping relations between fMRI and structural connectivity. Furthermore, the spatially connected consistency loss is devised to constrain the generator to preserve global-local topological information for accurate intrinsic SC prediction. Tested on the public Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset, the proposed model can effectively generate empirical SC-preserved connectivity from four-dimensional imaging data and shows superior performance in SC prediction compared with other related models. Furthermore, the proposed model can identify the vast majority of important brain regions and connections derived from the empirical method, providing an alternative way to fuse multimodal brain networks and analyze clinical disease.
Submitted 28 September, 2023; originally announced September 2023.
Comments: 12 pages

arXiv:2309.12188 [cs.RO, cs.CV]
SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on Scene Graphs
Authors: Guangyao Zhai, Xiaoni Cai, Dianye Huang, Yan Di, Fabian Manhardt, Federico Tombari, Nassir Navab, Benjamin Busam
Abstract: Object rearrangement is pivotal in robotic-environment interactions, representing a significant capability in embodied AI. In this paper, we present SG-Bot, a novel rearrangement framework that utilizes a coarse-to-fine scheme with a scene graph as the scene representation. Unlike previous methods that rely on either known goal priors or zero-shot large models, SG-Bot exemplifies lightweight, real-time, and user-controllable characteristics, seamlessly blending the consideration of commonsense knowledge with automatic generation capabilities. SG-Bot employs a three-fold procedure (observation, imagination, and execution) to adeptly address the task. Initially, objects are discerned and extracted from a cluttered scene during observation. These objects are first coarsely organized and depicted within a scene graph, guided by either commonsense or user-defined criteria. This scene graph then informs a generative model, which forms a fine-grained goal scene considering the shape information from the initial scene and object semantics. Finally, for execution, the initial and envisioned goal scenes are matched to formulate robotic action policies. Experimental results demonstrate that SG-Bot outperforms competitors by a large margin.
Submitted 24 March, 2024; v1 submitted 21 September, 2023; originally announced September 2023.
Comments: ICRA 2024 accepted. Project website: https://sites.google.com/view/sg-bot

arXiv:2308.10564 [cs.SE, cs.CL]
Software Entity Recognition with Noise-Robust Learning
Authors: Tai Nguyen, Yifeng Di, Joohan Lee, Muhao Chen, Tianyi Zhang
Abstract: Recognizing software entities such as library names from free-form text is essential to enable many software engineering (SE) technologies, such as traceability link recovery, automated documentation, and API recommendation. While many approaches have been proposed to address this problem, they suffer from small entity vocabularies or noisy training data, hindering their ability to recognize software entities mentioned in sophisticated narratives. To address this challenge, we leverage the Wikipedia taxonomy to develop a comprehensive entity lexicon with 79K unique software entities in 12 fine-grained types, as well as a large labeled dataset of over 1.7M sentences. Then, we propose self-regularization, a noise-robust learning approach, for training our software entity recognition (SER) model by accounting for many dropouts. Results show that models trained with self-regularization outperform both their vanilla counterparts and state-of-the-art approaches on our Wikipedia benchmark and two Stack Overflow benchmarks. We release our models, data, and code for future research.
Submitted 21 August, 2023; originally announced August 2023.
Comments: ASE 2023
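"Accounting for many dropouts" in the self-regularization above can be read as consistency training across stochastic forward passes: the same sentence is encoded twice under different dropout masks and the two predictive distributions are pulled together. The sketch below shows that general idea (in the style of R-Drop-like consistency objectives); it is a simplification, not necessarily the paper's exact formulation.

    import torch
    import torch.nn.functional as F

    def self_regularized_loss(model, inputs, labels, alpha=1.0):
        """Cross-entropy plus a symmetric KL term that encourages consistent
        predictions across two dropout masks (model must be in train mode)."""
        logits1 = model(inputs)   # first stochastic forward pass
        logits2 = model(inputs)   # second pass, different dropout mask
        ce = 0.5 * (F.cross_entropy(logits1, labels)
                    + F.cross_entropy(logits2, labels))
        p1 = F.log_softmax(logits1, dim=-1)
        p2 = F.log_softmax(logits2, dim=-1)
        kl = 0.5 * (F.kl_div(p1, p2, log_target=True, reduction="batchmean")
                    + F.kl_div(p2, p1, log_target=True, reduction="batchmean"))
        return ce + alpha * kl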
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.10564v1-abstract-full').style.display = 'none'; document.getElementById('2308.10564v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ASE 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.08231">arXiv:2308.08231</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.08231">pdf</a>, <a href="https://arxiv.org/format/2308.08231">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DDF-HO: Hand-Held Object Reconstruction via Conditional Directed Distance Field </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyangguang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yan Di</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruida Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangyao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Manhardt%2C+F">Fabian Manhardt</a>, <a href="/search/cs?searchtype=author&amp;query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.08231v3-abstract-short" style="display: inline;"> Reconstructing hand-held objects from a single RGB image is an important and challenging problem. Existing works utilizing Signed Distance Fields (SDF) reveal limitations in comprehensively capturing the complex hand-object interactions, since SDF is only reliable within the proximity of the target, and hence, infeasible to simultaneously encode local hand and object cues. To address this issue, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08231v3-abstract-full').style.display = 'inline'; document.getElementById('2308.08231v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.08231v3-abstract-full" style="display: none;"> Reconstructing hand-held objects from a single RGB image is an important and challenging problem. Existing works utilizing Signed Distance Fields (SDF) reveal limitations in comprehensively capturing the complex hand-object interactions, since SDF is only reliable within the proximity of the target, and hence, infeasible to simultaneously encode local hand and object cues. To address this issue, we propose DDF-HO, a novel approach leveraging Directed Distance Field (DDF) as the shape representation. 
Unlike SDF, DDF maps a ray in 3D space, consisting of an origin and a direction, to corresponding DDF values, including a binary visibility signal determining whether the ray intersects the objects and a distance value measuring the distance from origin to target in the given direction. We randomly sample multiple rays and collect local to global geometric features for them by introducing a novel 2D ray-based feature aggregation scheme and a 3D intersection-aware hand pose embedding, combining 2D-3D features to model hand-object interactions. Extensive experiments on synthetic and real-world datasets demonstrate that DDF-HO consistently outperforms all baseline methods by a large margin, especially under Chamfer Distance, with about 80% leap forward. Codes are available at https://github.com/ZhangCYG/DDFHO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08231v3-abstract-full').style.display = 'none'; document.getElementById('2308.08231v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Camera Ready for NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.07837">arXiv:2308.07837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.07837">pdf</a>, <a href="https://arxiv.org/format/2308.07837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CCD-3DR: Consistent Conditioning in Diffusion for Single-Image 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Di%2C+Y">Yan Di</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyangguang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Pengyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangyao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruida Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Manhardt%2C+F">Fabian Manhardt</a>, <a href="/search/cs?searchtype=author&amp;query=Busam%2C+B">Benjamin Busam</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiangyang Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Tombari%2C+F">Federico Tombari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.07837v1-abstract-short" style="display: inline;"> In this paper, we present a novel shape reconstruction method leveraging diffusion model to generate 3D sparse point cloud for the object captured in a single RGB image. Recent methods typically leverage global embedding or local projection-based features as the condition to guide the diffusion model. 
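The ray-to-(visibility, distance) mapping that defines a DDF is easy to make concrete for an analytic shape. Below is a closed-form DDF for a sphere, useful as a mental model of what the learned field predicts; this toy example is not from the paper.

    import numpy as np

    def sphere_ddf(origin, direction, center=np.zeros(3), radius=1.0):
        """Directed distance field of a sphere: (visibility, distance) for a ray."""
        d = direction / np.linalg.norm(direction)
        oc = origin - center
        b = np.dot(oc, d)
        disc = b * b - (np.dot(oc, oc) - radius ** 2)
        if disc < 0:
            return 0.0, np.inf          # ray misses the object entirely
        t = -b - np.sqrt(disc)          # nearest intersection along the ray
        if t < 0:
            t = -b + np.sqrt(disc)      # origin inside the sphere: exit point
            if t < 0:
                return 0.0, np.inf      # sphere lies behind the ray origin
        return 1.0, t

    # Example: a ray from (2, 0, 0) pointing at the unit sphere hits at distance 1.
    vis, dist = sphere_ddf(np.array([2.0, 0.0, 0.0]), np.array([-1.0, 0.0, 0.0]))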
arXiv:2308.07837 [cs.CV]
CCD-3DR: Consistent Conditioning in Diffusion for Single-Image 3D Reconstruction
Authors: Yan Di, Chenyangguang Zhang, Pengyuan Wang, Guangyao Zhai, Ruida Zhang, Fabian Manhardt, Benjamin Busam, Xiangyang Ji, Federico Tombari
Abstract: In this paper, we present a novel shape reconstruction method leveraging a diffusion model to generate a 3D sparse point cloud for the object captured in a single RGB image. Recent methods typically leverage global embeddings or local projection-based features as the condition to guide the diffusion model. However, such strategies fail to consistently align the denoised point cloud with the given image, leading to unstable conditioning and inferior performance. In this paper, we present CCD-3DR, which exploits a novel centered diffusion probabilistic model for consistent local feature conditioning. We constrain the noise and the sampled point cloud from the diffusion model to a subspace in which the point-cloud center remains unchanged during the forward and reverse processes. The stable point-cloud center further serves as an anchor to align each point with its corresponding local projection-based features. Extensive experiments on the synthetic benchmark ShapeNet-R2N2 demonstrate that CCD-3DR outperforms all competitors by a large margin, with over 40% improvement. We also provide results on the real-world dataset Pix3D to thoroughly demonstrate the potential of CCD-3DR in real-world applications. Codes will be released soon.
Submitted 15 August, 2023; originally announced August 2023.
Comments: 11 pages
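The centering constraint is simple to state in code: if both the clean cloud and the injected noise are projected onto the zero-mean subspace, every noisy sample keeps its center at the origin, which is what lets each point stay aligned with its projected image features. A minimal sketch of the forward step under this reading, in standard DDPM notation with illustrative names:

    import torch

    def centered_forward_diffusion(x0, t, alphas_cumprod):
        """DDPM forward step restricted to the zero-mean subspace.

        x0: (B, N, 3) clean point clouds; t: (B,) integer timesteps;
        alphas_cumprod: (T,) cumulative alpha-bar schedule (tensor).
        """
        x0 = x0 - x0.mean(dim=1, keepdim=True)       # center the clean cloud
        eps = torch.randn_like(x0)
        eps = eps - eps.mean(dim=1, keepdim=True)    # project the noise to zero mean
        a = alphas_cumprod[t].sqrt().view(-1, 1, 1)
        s = (1.0 - alphas_cumprod[t]).sqrt().view(-1, 1, 1)
        xt = a * x0 + s * eps
        return xt, eps   # xt.mean(dim=1) stays at the origin for every t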
arXiv:2308.06383 [cs.CV]
U-RED: Unsupervised 3D Shape Retrieval and Deformation for Partial Point Clouds
Authors: Yan Di, Chenyangguang Zhang, Ruida Zhang, Fabian Manhardt, Yongzhi Su, Jason Rambach, Didier Stricker, Xiangyang Ji, Federico Tombari
Abstract: In this paper, we propose U-RED, an Unsupervised shape REtrieval and Deformation pipeline that takes an arbitrary object observation as input, typically captured by RGB images or scans, and jointly retrieves and deforms geometrically similar CAD models from a pre-established database to tightly match the target. Considering that existing methods typically fail to handle noisy partial observations, U-RED is designed to address this issue from two aspects. First, since one partial shape may correspond to multiple potential full shapes, the retrieval method must allow such an ambiguous one-to-many relationship. U-RED therefore learns to project all possible full shapes of a partial target onto the surface of a unit sphere; during inference, each sample on the sphere yields a feasible retrieval. Second, since real-world partial observations usually contain noticeable noise, a reliable learned metric that measures the similarity between shapes is necessary for stable retrieval. In U-RED, we design a novel point-wise residual-guided metric that allows noise-robust comparison. Extensive experiments on the synthetic datasets PartNet and ComplementMe and the real-world dataset Scan2CAD demonstrate that U-RED surpasses existing state-of-the-art approaches by 47.3%, 16.7% and 31.6%, respectively, under Chamfer Distance.
Submitted 11 August, 2023; originally announced August 2023.
Comments: ICCV 2023

arXiv:2305.16283 [cs.CV]
CommonScenes: Generating Commonsense 3D Indoor Scenes with Scene Graph Diffusion
Authors: Guangyao Zhai, Evin Pınar Örnek, Shun-Cheng Wu, Yan Di, Federico Tombari, Nassir Navab, Benjamin Busam
Abstract: Controllable scene synthesis aims to create interactive environments for various industrial use cases. Scene graphs provide a highly suitable interface to facilitate these applications by abstracting the scene context in a compact manner. Existing methods, reliant on retrieval from extensive databases or pre-trained shape embeddings, often overlook scene-object and object-object relationships, leading to inconsistent results due to their limited generation capacity. To address this issue, we present CommonScenes, a fully generative model that converts scene graphs into corresponding controllable 3D scenes which are semantically realistic and conform to commonsense. Our pipeline consists of two branches: one predicting the overall scene layout via a variational auto-encoder, and the other generating compatible shapes via latent diffusion, capturing global scene-object and local inter-object relationships in the scene graph while preserving shape diversity. The generated scenes can be manipulated by editing the input scene graph and sampling the noise in the diffusion model. Due to the lack of a scene graph dataset offering high-quality object-level meshes with relations, we also construct SG-FRONT, enriching the off-the-shelf indoor dataset 3D-FRONT with additional scene graph labels. Extensive experiments are conducted on SG-FRONT, where CommonScenes shows clear advantages over other methods regarding generation consistency, quality, and diversity. Codes and the dataset will be released upon acceptance.
Submitted 30 December, 2023; v1 submitted 25 May, 2023; originally announced May 2023.
Comments: NeurIPS 2023 camera-ready
arXiv:2303.00575 [cs.CV, cs.RO]
IPCC-TP: Utilizing Incremental Pearson Correlation Coefficient for Joint Multi-Agent Trajectory Prediction
Authors: Dekai Zhu, Guangyao Zhai, Yan Di, Fabian Manhardt, Hendrik Berkemeyer, Tuan Tran, Nassir Navab, Federico Tombari, Benjamin Busam
Abstract: Reliable multi-agent trajectory prediction is crucial for the safe planning and control of autonomous systems. Compared with single-agent cases, the major challenge in simultaneously processing multiple agents lies in modeling complex social interactions caused by various driving intentions and road conditions. Previous methods typically leverage graph-based message propagation or attention mechanisms to encapsulate such interactions in the form of marginal probabilistic distributions, which is inherently sub-optimal. In this paper, we propose IPCC-TP, a novel relevance-aware module based on the Incremental Pearson Correlation Coefficient, to improve multi-agent interaction modeling. IPCC-TP learns pairwise joint Gaussian distributions through tightly coupled estimation of the means and covariances according to interactive incremental movements. Our module can be conveniently embedded into existing multi-agent prediction methods to extend their original motion-distribution decoders. Extensive experiments on the nuScenes and Argoverse 2 datasets demonstrate that IPCC-TP improves the performance of baselines by a large margin.
Submitted 30 April, 2023; v1 submitted 1 March, 2023; originally announced March 2023.
Comments: CVPR 2023 accepted
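The quantity at the heart of IPCC-TP can be illustrated directly: compute the Pearson correlation between two agents' incremental (per-step) movements, then use it to couple their marginal Gaussians into a pairwise joint distribution. A toy sketch under the simplifying assumption of one scalar motion component per step (names are illustrative, and the paper's estimation is learned rather than computed from trajectories post hoc):

    import numpy as np

    def incremental_pearson(traj_i, traj_j):
        """Pearson correlation of two agents' per-step displacement magnitudes."""
        di = np.linalg.norm(np.diff(traj_i, axis=0), axis=1)  # (T-1,) increments
        dj = np.linalg.norm(np.diff(traj_j, axis=0), axis=1)
        di, dj = di - di.mean(), dj - dj.mean()
        return float(di @ dj / (np.linalg.norm(di) * np.linalg.norm(dj) + 1e-9))

    def joint_covariance(sigma_i, sigma_j, rho):
        """Couple two marginals N(mu, sigma^2) into a 2x2 joint Gaussian covariance:
        the off-diagonal entry is rho * sigma_i * sigma_j."""
        return np.array([[sigma_i ** 2, rho * sigma_i * sigma_j],
                         [rho * sigma_i * sigma_j, sigma_j ** 2]])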
arXiv:2212.06524 [cs.CV]
SST: Real-time End-to-end Monocular 3D Reconstruction via Sparse Spatial-Temporal Guidance
Authors: Chenyangguang Zhang, Zhiqiang Lou, Yan Di, Federico Tombari, Xiangyang Ji
Abstract: Real-time monocular 3D reconstruction is a challenging problem that remains unsolved. Although recent end-to-end methods have demonstrated promising results, tiny structures and geometric boundaries are hardly captured, due to insufficient supervision neglecting spatial details and oversimplified feature fusion ignoring temporal cues. To address these problems, we propose SST, an end-to-end 3D reconstruction network that utilizes sparse estimated points from a visual SLAM system as additional spatial guidance and fuses temporal features via a novel cross-modal attention mechanism, achieving more detailed reconstruction results. We propose a Local Spatial-Temporal Fusion module to exploit more informative spatial-temporal cues from multi-view color information and sparse priors, as well as a Global Spatial-Temporal Fusion module to refine the local TSDF volumes with the world-frame model from coarse to fine. Extensive experiments on ScanNet and 7-Scenes demonstrate that SST outperforms all state-of-the-art competitors while keeping a high inference speed of 59 FPS, enabling real-world applications with real-time requirements.
Submitted 24 July, 2023; v1 submitted 13 December, 2022; originally announced December 2022.
Comments: ICME 2023 (oral)

arXiv:2211.01142 [cs.CV]
OPA-3D: Occlusion-Aware Pixel-Wise Aggregation for Monocular 3D Object Detection
Authors: Yongzhi Su, Yan Di, Fabian Manhardt, Guangyao Zhai, Jason Rambach, Benjamin Busam, Didier Stricker, Federico Tombari
Abstract: Despite monocular 3D object detection having recently made a significant leap forward thanks to the use of pre-trained depth estimators for pseudo-LiDAR recovery, such two-stage methods typically suffer from overfitting and are incapable of explicitly encapsulating the geometric relation between depth and object bounding box. To overcome this limitation, we instead propose OPA-3D, a single-stage, end-to-end, Occlusion-Aware Pixel-Wise Aggregation network that jointly estimates dense scene depth with depth-bounding-box residuals and object bounding boxes, allowing a two-stream detection of 3D objects and leading to significantly more robust detections. The first stream, denoted the Geometry Stream, combines visible depth and depth-bounding-box residuals to recover the object bounding box via explicit occlusion-aware optimization. In addition, a bounding-box-based geometry projection scheme is employed to enhance distance perception. The second stream, named the Context Stream, directly regresses 3D object location and size. This novel two-stream representation further enables us to enforce cross-stream consistency terms that align the outputs of both streams, improving the overall performance. Extensive experiments on the public benchmark demonstrate that OPA-3D outperforms state-of-the-art methods on the main Car category while keeping a real-time inference speed. We plan to release all codes and trained models soon.
Submitted 2 November, 2022; originally announced November 2022.
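The geometric relation OPA-3D exploits can be written in a few lines: the Geometry Stream recovers the box-center depth as visible surface depth plus a predicted residual, and a cross-stream term keeps it consistent with the Context Stream's direct regression. A schematic sketch under this reading, not the authors' code:

    import torch.nn.functional as F

    def geometry_stream_depth(visible_depth, residual):
        """Box-center depth = visible surface depth + predicted residual; the
        visible surface stays observable even when the box center is occluded."""
        return visible_depth + residual

    def cross_stream_consistency(z_geometry, z_context):
        """L1 term pulling the two streams' depth estimates together."""
        return F.l1_loss(z_geometry, z_context)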

arXiv:2209.13036 (https://arxiv.org/abs/2209.13036) [pdf, other] cs.RO cs.AI cs.CV
MonoGraspNet: 6-DoF Grasping with a Single RGB Image
Authors: Guangyao Zhai, Dianye Huang, Shun-Cheng Wu, Hyunjun Jung, Yan Di, Fabian Manhardt, Federico Tombari, Nassir Navab, Benjamin Busam
Abstract: 6-DoF robotic grasping is a long-standing but unsolved problem. Recent methods utilize strong 3D networks to extract geometric grasping representations from depth sensors, demonstrating superior accuracy on common objects but performing unsatisfactorily on photometrically challenging objects, e.g., objects made of transparent or reflective materials. The bottleneck is that the surfaces of these objects cannot reflect back accurate depth due to the absorption or refraction of light. In this paper, in contrast to exploiting the inaccurate depth data, we propose the first RGB-only 6-DoF grasping pipeline, called MonoGraspNet, which utilizes stable 2D features to simultaneously handle arbitrary object grasping and overcome the problems induced by photometrically challenging objects. MonoGraspNet leverages a keypoint heatmap and a normal map to recover the 6-DoF grasping poses represented by our novel representation parameterized with 2D keypoints with corresponding depth, grasping direction, grasping width, and angle.
Extensive experiments in real scenes demonstrate that our method can achieve competitive results in grasping common objects and surpass the depth-based competitor by a large margin in grasping photometrically challenging objects. To further stimulate robotic manipulation research, we additionally annotate and open-source a multi-view and multi-scene real-world grasping dataset, containing 120 objects of mixed photometric complexity with 20M accurate grasping labels.
Submitted 1 March, 2023; v1 submitted 26 September, 2022; originally announced September 2022.
Comments: ICRA 2023 accepted. Project website: https://sites.google.com/view/monograsp
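
The grasp representation above hinges on lifting a 2D keypoint with its predicted depth into 3D; a minimal sketch of that back-projection step (intrinsics and values are made up for illustration, and the paper's exact parameterization may differ) might be:

import numpy as np

def backproject(u, v, depth, K):
    """Lift a 2D keypoint (u, v) with predicted depth into the camera frame."""
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    return np.array([(u - cx) * depth / fx, (v - cy) * depth / fy, depth])

K = np.array([[525.0, 0.0, 319.5],
              [0.0, 525.0, 239.5],
              [0.0, 0.0, 1.0]])            # example pinhole intrinsics
grasp_center = backproject(400, 200, 0.6, K)  # 3D grasp point, metres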

arXiv:2208.06661 (https://arxiv.org/abs/2208.06661) [pdf, other] cs.CV
SSP-Pose: Symmetry-Aware Shape Prior Deformation for Direct Category-Level Object Pose Estimation
Authors: Ruida Zhang, Yan Di, Fabian Manhardt, Federico Tombari, Xiangyang Ji
Abstract: Category-level pose estimation is a challenging problem due to intra-class shape variations. Recent methods deform pre-computed shape priors to map the observed point cloud into the normalized object coordinate space and then retrieve the pose via post-processing, i.e., Umeyama's algorithm. The shortcomings of this two-stage strategy lie in two aspects: 1) The surrogate supervision on the intermediate results cannot directly guide the learning of pose, resulting in large pose errors after post-processing. 2) The inference speed is limited by the post-processing step. In this paper, to handle these shortcomings, we propose SSP-Pose, an end-to-end trainable network for category-level pose estimation that integrates shape priors into a direct pose regression network. SSP-Pose stacks four individual branches on a shared feature extractor: two branches deform and match the prior model with the observed instance, while the other two directly regress the full 9-degrees-of-freedom pose and perform symmetry reconstruction and point-wise inlier mask prediction, respectively. Consistency loss terms are then naturally exploited to align the outputs of the different branches and promote the performance. During inference, only the direct pose regression branch is needed. In this manner, SSP-Pose not only learns category-level pose-sensitive characteristics to boost performance but also keeps a real-time inference speed. Moreover, we utilize the symmetry information of each category to guide the shape prior deformation, and propose a novel symmetry-aware loss to mitigate the matching ambiguity. Extensive experiments on public datasets demonstrate that SSP-Pose produces superior performance compared with competitors at a real-time inference speed of about 25 Hz.
Submitted 13 August, 2022; originally announced August 2022.
Comments: Accepted by IROS 2022
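
A symmetry-aware loss of the kind mentioned is often realized by taking the minimum over symmetry-equivalent ground truths; the sketch below assumes discrete rotational symmetry about the z-axis and is only a plausible reading, not the paper's exact formulation:

import math
import torch

def rot_z(theta: float) -> torch.Tensor:
    c, s = math.cos(theta), math.sin(theta)
    return torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])

def symmetry_aware_loss(pred_pts: torch.Tensor, gt_pts: torch.Tensor, n_sym: int = 12) -> torch.Tensor:
    # pred_pts, gt_pts: (N, 3) corresponding model points.
    # Evaluate the error under every symmetry-equivalent rotation and keep the best,
    # so symmetric poses are not penalized for an arbitrary canonical choice.
    losses = [(pred_pts - gt_pts @ rot_z(2 * math.pi * k / n_sym).T).abs().mean()
              for k in range(n_sym)]
    return torch.stack(losses).min()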

arXiv:2208.00237 (https://arxiv.org/abs/2208.00237) [pdf, other] cs.CV
RBP-Pose: Residual Bounding Box Projection for Category-Level Pose Estimation
Authors: Ruida Zhang, Yan Di, Zhiqiang Lou, Fabian Manhardt, Federico Tombari, Xiangyang Ji
Abstract: Category-level object pose estimation aims to predict the 6D pose as well as the 3D metric size of arbitrary objects from a known set of categories. Recent methods harness shape prior adaptation to map the observed point cloud into the canonical space and apply the Umeyama algorithm to recover the pose and size. However, their shape prior integration strategy boosts pose estimation only indirectly, which leads to insufficient pose-sensitive feature extraction and slow inference speed. To tackle this problem, we propose RBP-Pose, a novel geometry-guided Residual Object Bounding Box Projection network that jointly predicts the object pose and residual vectors describing the displacements from the shape-prior-indicated object surface projections on the bounding box towards the real surface projections. This definition of residual vectors is inherently zero-mean and relatively small, and explicitly encapsulates spatial cues of the 3D object for robust and accurate pose regression. We enforce geometry-aware consistency terms to align the predicted pose and residual vectors to further boost performance.
Submitted 28 September, 2022; v1 submitted 30 July, 2022; originally announced August 2022.
Comments: Accepted by ECCV 2022
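
To make the zero-mean residual idea concrete: if the shape prior indicates where each surface point should project on the bounding box, the network only needs to regress the small offset to the observed projection. A toy numerical illustration on synthetic data (not the paper's pipeline):

import numpy as np

rng = np.random.default_rng(0)
prior_proj = rng.normal(size=(1024, 3))                          # prior-indicated projections
observed = prior_proj + rng.normal(scale=0.02, size=(1024, 3))   # observed surface projections
residuals = observed - prior_proj                                # the regression target
print(residuals.mean(axis=0), np.abs(residuals).max())           # near zero-mean, small magnitude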

arXiv:2203.07918 (https://arxiv.org/abs/2203.07918) [pdf, other] cs.CV
GPV-Pose: Category-level Object Pose Estimation via Geometry-guided Point-wise Voting
Authors: Yan Di, Ruida Zhang, Zhiqiang Lou, Fabian Manhardt, Xiangyang Ji, Nassir Navab, Federico Tombari
Abstract: While 6D object pose estimation has recently made a huge leap forward, most methods can still only handle a single or a handful of different objects, which limits their applications. To circumvent this problem, category-level object pose estimation has recently been revamped, aiming to predict the 6D pose as well as the 3D metric size for previously unseen instances from a given set of object classes. This is, however, a much more challenging task due to severe intra-class shape variations. To address this issue, we propose GPV-Pose, a novel framework for robust category-level pose estimation, harnessing geometric insights to enhance the learning of category-level pose-sensitive features. First, we introduce a decoupled confidence-driven rotation representation, which allows geometry-aware recovery of the associated rotation matrix. Second, we propose a novel geometry-guided point-wise voting paradigm for robust retrieval of the 3D object bounding box. Finally, leveraging these different output streams, we can enforce several geometric consistency terms, further increasing performance, especially for non-symmetric categories. GPV-Pose produces superior results to state-of-the-art competitors on common public benchmarks, whilst almost achieving real-time inference speed at 20 FPS.
Submitted 17 March, 2022; v1 submitted 15 March, 2022; originally announced March 2022.
Comments: CVPR 2022
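
Decoupled rotation representations of this flavor typically predict two (noisy) axes and re-orthogonalize them; the confidence weighting in the paper is more elaborate, but a plain Gram-Schmidt recovery, shown here as an assumption-laden sketch, looks like:

import torch
import torch.nn.functional as F

def rotation_from_axes(a1: torch.Tensor, a2: torch.Tensor) -> torch.Tensor:
    # a1, a2: (B, 3) predicted axes, not guaranteed orthonormal.
    b1 = F.normalize(a1, dim=-1)
    b2 = F.normalize(a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)            # right-handed third axis
    return torch.stack([b1, b2, b3], dim=-1)    # (B, 3, 3) with orthonormal columns

R = rotation_from_axes(torch.randn(4, 3), torch.randn(4, 3))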

arXiv:2108.08367 (https://arxiv.org/abs/2108.08367) [pdf, other] cs.CV
SO-Pose: Exploiting Self-Occlusion for Direct 6D Pose Estimation
Authors: Yan Di, Fabian Manhardt, Gu Wang, Xiangyang Ji, Nassir Navab, Federico Tombari
Abstract: Directly regressing all 6 degrees-of-freedom (6DoF) of the object pose (i.e., the 3D rotation and translation) in a cluttered environment from a single RGB image is a challenging problem. While end-to-end methods have recently demonstrated promising results at high efficiency, they are still inferior to elaborate P$n$P/RANSAC-based approaches in terms of pose accuracy. In this work, we address this shortcoming by means of novel reasoning about self-occlusion, in order to establish a two-layer representation for 3D objects which considerably enhances the accuracy of end-to-end 6D pose estimation. Our framework, named SO-Pose, takes a single RGB image as input and generates 2D-3D correspondences as well as self-occlusion information using a shared encoder and two separate decoders. Both outputs are then fused to directly regress the 6DoF pose parameters. Incorporating cross-layer consistencies that align correspondences, self-occlusion and 6D pose, we can further improve accuracy and robustness, surpassing or rivaling all other state-of-the-art approaches on various challenging datasets.
Submitted 18 August, 2021; originally announced August 2021.
Comments: ICCV 2021
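
The fusion of the two decoder outputs into a direct pose regression could, in a minimal hypothetical form, be a small convolutional head over the concatenated correspondence and self-occlusion maps (channel counts and the 6D-rotation-plus-translation output layout are assumptions, not the paper's architecture):

import torch
import torch.nn as nn

class PoseHead(nn.Module):
    """Hypothetical fusion head: concatenate both layers, regress pose directly."""
    def __init__(self, ch_corr=64, ch_occ=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(ch_corr + ch_occ, 128, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(128, 9),            # 6D rotation parameters + 3D translation
        )

    def forward(self, corr_feat, occ_feat):
        out = self.net(torch.cat([corr_feat, occ_feat], dim=1))
        return out[:, :6], out[:, 6:]     # rotation params, translation

head = PoseHead()
rot6d, trans = head(torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32))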

arXiv:2006.13510 (https://arxiv.org/abs/2006.13510) [pdf] cs.CV cs.LG eess.IV
Dynamic Functional Connectivity and Graph Convolution Network for Alzheimer's Disease Classification
Authors: Xingwei An, Yutao Zhou, Yang Di, Dong Ming
Abstract: Alzheimer's disease (AD) is the most prevalent form of dementia. Traditional methods cannot achieve efficient and accurate diagnosis of AD. In this paper, we introduce a novel method based on dynamic functional connectivity (dFC) that can effectively capture changes in the brain. We compare and combine four different types of features, including the amplitude of low-frequency fluctuation (ALFF), regional homogeneity (ReHo), dFC, and the adjacency matrix of different brain structures between subjects. We use a graph convolution network (GCN), which considers the similarity of brain structure between patients, to solve the classification problem on non-Euclidean domains. The proposed method achieved an accuracy of 91.3% and an area under the receiver operating characteristic curve of 98.4%. These results demonstrate that our proposed method can be used for detecting AD.
Submitted 24 June, 2020; originally announced June 2020.
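
For the GCN over the patient-similarity graph, a single Kipf-and-Welling-style layer makes the mechanism concrete; the adjacency construction and feature dimensions here are illustrative assumptions only:

import torch
import torch.nn as nn

class GCNLayer(nn.Module):
    """One symmetric-normalized graph convolution: relu(D^-1/2 (A+I) D^-1/2 X W)."""
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.lin = nn.Linear(in_dim, out_dim)

    def forward(self, X, A):
        # X: (N_subjects, in_dim) per-subject features (e.g., ALFF/ReHo/dFC descriptors)
        # A: (N_subjects, N_subjects) similarity adjacency between patients
        A_hat = A + torch.eye(A.size(0))              # add self-loops
        d_inv_sqrt = torch.diag(A_hat.sum(1).pow(-0.5))
        return torch.relu(d_inv_sqrt @ A_hat @ d_inv_sqrt @ self.lin(X))

layer = GCNLayer(in_dim=300, out_dim=64)
out = layer(torch.randn(40, 300), torch.rand(40, 40))  # 40 hypothetical subjects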
