CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 74 results for author: <span class="mathjax">Rao, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Rao%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Rao, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Rao%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Rao, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14432">arXiv:2411.14432</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14432">pdf</a>, <a href="https://arxiv.org/format/2411.14432">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Hai-Long Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jingkang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Winston Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14432v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate enhanced capabilities and reliability by reasoning more, evolving from Chain-of-Thought prompting to product-level solutions like OpenAI o1. Despite various efforts to improve LLM reasoning, high-quality long-chain reasoning data and optimized training pipelines still remain inadequately explored in vision-language tasks. In this paper, we present Insight-V&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14432v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14432v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14432v1-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate enhanced capabilities and reliability by reasoning more, evolving from Chain-of-Thought prompting to product-level solutions like OpenAI o1. Despite various efforts to improve LLM reasoning, high-quality long-chain reasoning data and optimized training pipelines still remain inadequately explored in vision-language tasks. In this paper, we present Insight-V, an early effort to 1) scalably produce long and robust reasoning data for complex multi-modal tasks, and 2) an effective training pipeline to enhance the reasoning capabilities of multi-modal large language models (MLLMs). Specifically, to create long and structured reasoning data without human labor, we design a two-step pipeline with a progressive strategy to generate sufficiently long and diverse reasoning paths and a multi-granularity assessment method to ensure data quality. We observe that directly supervising MLLMs with such long and complex reasoning data will not yield ideal reasoning ability. To tackle this problem, we design a multi-agent system consisting of a reasoning agent dedicated to performing long-chain reasoning and a summary agent trained to judge and summarize reasoning results. We further incorporate an iterative DPO algorithm to enhance the reasoning agent&#39;s generation stability and quality. Based on the popular LLaVA-NeXT model and our stronger base MLLM, we demonstrate significant performance gains across challenging multi-modal benchmarks requiring visual reasoning. Benefiting from our multi-agent system, Insight-V can also easily maintain or improve performance on perception-focused multi-modal tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14432v1-abstract-full').style.display = 'none'; document.getElementById('2411.14432v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16842">arXiv:2410.16842</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16842">pdf</a>, <a href="https://arxiv.org/format/2410.16842">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Assessment of Transformer-Based Encoder-Decoder Model for Human-Like Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nair%2C+S">Sindhu Nair</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y+S">Y. S. Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Shankarmani%2C+R">Radha Shankarmani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16842v1-abstract-short" style="display: inline;"> In recent times, extracting valuable information from large text is making significant progress. Especially in the current era of social media, people expect quick bites of information. Automatic text summarization seeks to tackle this by slimming large texts down into more manageable summaries. This important research area can aid in decision-making by digging out salient content from large text.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16842v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16842v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16842v1-abstract-full" style="display: none;"> In recent times, extracting valuable information from large text is making significant progress. Especially in the current era of social media, people expect quick bites of information. Automatic text summarization seeks to tackle this by slimming large texts down into more manageable summaries. This important research area can aid in decision-making by digging out salient content from large text. With the progress in deep learning models, significant work in language models has emerged. The encoder-decoder framework in deep learning has become the central approach for automatic text summarization. This work leverages transformer-based BART model for human-like summarization which is an open-ended problem with many challenges. On training and fine-tuning the encoder-decoder model, it is tested with diverse sample articles and the quality of summaries of diverse samples is assessed based on human evaluation parameters. Further, the finetuned model performance is compared with the baseline pretrained model based on evaluation metrics like ROUGE score and BERTScore. Additionally, domain adaptation of the model is required for improved performance of abstractive summarization of dialogues between interlocutors. On investigating, the above popular evaluation metrics are found to be insensitive to factual errors. Further investigation of the summaries generated by finetuned model is done using the contemporary evaluation metrics of factual consistency like WeCheck and SummaC. Empirical results on BBC News articles highlight that the gold standard summaries written by humans are more factually consistent by 17% than the abstractive summaries generated by finetuned model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16842v1-abstract-full').style.display = 'none'; document.getElementById('2410.16842v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pre-print</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13464">arXiv:2410.13464</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13464">pdf</a>, <a href="https://arxiv.org/format/2410.13464">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> IterSelectTune: An Iterative Training Framework for Efficient Instruction-Tuning Data Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jielin Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Siyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+B">Bin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yanghui Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13464v1-abstract-short" style="display: inline;"> As large language models (LLMs) continue to advance, instruction tuning has become critical for improving their ability to generate accurate and contextually appropriate responses. Although numerous instruction-tuning datasets have been developed to enhance LLM performance, selecting high-quality instruction data from large source datasets typically demands significant human effort. In this work,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13464v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13464v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13464v1-abstract-full" style="display: none;"> As large language models (LLMs) continue to advance, instruction tuning has become critical for improving their ability to generate accurate and contextually appropriate responses. Although numerous instruction-tuning datasets have been developed to enhance LLM performance, selecting high-quality instruction data from large source datasets typically demands significant human effort. In this work, we introduce $\textbf{IterSelectTune}$, an efficient, cost-effective iterative training policy for selecting high-quality instruction data with no human involvement and limited reliance on GPT-4. By fine-tuning on approximately 20\% of the source data, our method consistently outperforms models fine-tuned on the full dataset across multiple benchmarks and public test datasets. These results highlight the effectiveness of our approach in enhancing LLM performance while reducing the computational resources required for instruction tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13464v1-abstract-full').style.display = 'none'; document.getElementById('2410.13464v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09477">arXiv:2410.09477</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09477">pdf</a>, <a href="https://arxiv.org/format/2410.09477">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Bipartite Graph Embedding Induced by Clustering Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shanfan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yongyi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yuan Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+Z">Zhan Bu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09477v1-abstract-short" style="display: inline;"> Bipartite graph embedding (BGE) maps nodes to compressed embedding vectors that can reflect the hidden topological features of the network, and learning high-quality BGE is crucial for facilitating downstream applications such as recommender systems. However, most existing methods either struggle to efficiently learn embeddings suitable for users and items with fewer interactions, or exhibit poor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09477v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09477v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09477v1-abstract-full" style="display: none;"> Bipartite graph embedding (BGE) maps nodes to compressed embedding vectors that can reflect the hidden topological features of the network, and learning high-quality BGE is crucial for facilitating downstream applications such as recommender systems. However, most existing methods either struggle to efficiently learn embeddings suitable for users and items with fewer interactions, or exhibit poor scalability to handle large-scale networks. In this paper, we propose a Clustering Constraints induced BIpartite graph Embedding (CCBIE) as an integrated solution to both problems. CCBIE facilitates automatic and dynamic soft clustering of items in a top-down manner, and capturing macro-preference information of users through clusters. Specifically, by leveraging the cluster embedding matrix of items, CCBIE calculates the cluster assignment matrix for items and also captures the extent of user preferences across different clusters, thereby elucidating the similarity between users and items on a macro-scale level. CCBIE effectively preserves the global properties of bipartite graphs, maintaining the cluster structure of isomorphic nodes and accounting for long-range dependencies among heterogeneous nodes. Our approach significantly enhances user-item collaborative relation modeling by integrating adaptive clustering for relationship learning, thereby markedly improving prediction performance, particularly benefiting cold users and items. Extensive experiments indicate that CCBIE consistently and significantly improves accuracy over baseline models, particularly on sparse graphs, while also enhancing training speed and reducing memory requirements on large-scale bipartite graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09477v1-abstract-full').style.display = 'none'; document.getElementById('2410.09477v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07673">arXiv:2410.07673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07673">pdf</a>, <a href="https://arxiv.org/format/2410.07673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Clickbait Detection by De-confounding Biases Using Causal Representation Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jianxing Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+H">Han Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhenlong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yanghui Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07673v1-abstract-short" style="display: inline;"> This paper focuses on detecting clickbait posts on the Web. These posts often use eye-catching disinformation in mixed modalities to mislead users to click for profit. That affects the user experience and thus would be blocked by content provider. To escape detection, malicious creators use tricks to add some irrelevant non-bait content into bait posts, dressing them up as legal to fool the detect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07673v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07673v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07673v1-abstract-full" style="display: none;"> This paper focuses on detecting clickbait posts on the Web. These posts often use eye-catching disinformation in mixed modalities to mislead users to click for profit. That affects the user experience and thus would be blocked by content provider. To escape detection, malicious creators use tricks to add some irrelevant non-bait content into bait posts, dressing them up as legal to fool the detector. This content often has biased relations with non-bait labels, yet traditional detectors tend to make predictions based on simple co-occurrence rather than grasping inherent factors that lead to malicious behavior. This spurious bias would easily cause misjudgments. To address this problem, we propose a new debiased method based on causal inference. We first employ a set of features in multiple modalities to characterize the posts. Considering these features are often mixed up with unknown biases, we then disentangle three kinds of latent factors from them, including the invariant factor that indicates intrinsic bait intention; the causal factor which reflects deceptive patterns in a certain scenario, and non-causal noise. By eliminating the noise that causes bias, we can use invariant and causal factors to build a robust model with good generalization ability. Experiments on three popular datasets show the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07673v1-abstract-full').style.display = 'none'; document.getElementById('2410.07673v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12961">arXiv:2409.12961</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12961">pdf</a>, <a href="https://arxiv.org/format/2409.12961">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Winston Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12961v2-abstract-short" style="display: inline;"> Visual data comes in various forms, ranging from small icons of just a few pixels to long videos spanning hours. Existing multi-modal LLMs usually standardize these diverse visual inputs to a fixed resolution for visual encoders and yield similar numbers of tokens for LLMs. This approach is non-optimal for multimodal understanding and inefficient for processing inputs with long and short visual co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12961v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12961v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12961v2-abstract-full" style="display: none;"> Visual data comes in various forms, ranging from small icons of just a few pixels to long videos spanning hours. Existing multi-modal LLMs usually standardize these diverse visual inputs to a fixed resolution for visual encoders and yield similar numbers of tokens for LLMs. This approach is non-optimal for multimodal understanding and inefficient for processing inputs with long and short visual contents. To solve the problem, we propose Oryx, a unified multimodal architecture for the spatial-temporal understanding of images, videos, and multi-view 3D scenes. Oryx offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths through two core innovations: 1) a pre-trained OryxViT model that can encode images at any resolution into LLM-friendly visual representations; 2) a dynamic compressor module that supports 1x to 16x compression on visual tokens by request. These design features enable Oryx to accommodate extremely long visual contexts, such as videos, with lower resolution and high compression while maintaining high recognition precision for tasks like document understanding with native resolution and no compression. Beyond the architectural improvements, enhanced data curation and specialized training on long-context retrieval and spatial-aware data help Oryx achieve strong capabilities in image, video, and 3D multimodal understanding simultaneously. Our work is open-sourced at https://github.com/Oryx-mllm/Oryx. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12961v2-abstract-full').style.display = 'none'; document.getElementById('2409.12961v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15050">arXiv:2408.15050</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.15050">pdf</a>, <a href="https://arxiv.org/format/2408.15050">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised Topic Taxonomy Discovery in the Box Embedding Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yuyin Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hegang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+P">Pengbo Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yanghui Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+H">Haoran Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F+L">Fu Lee Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15050v1-abstract-short" style="display: inline;"> Topic taxonomy discovery aims at uncovering topics of different abstraction levels and constructing hierarchical relations between them. Unfortunately, most of prior work can hardly model semantic scopes of words and topics by holding the Euclidean embedding space assumption. What&#39;s worse, they infer asymmetric hierarchical relations by symmetric distances between topic embeddings. As a result, ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15050v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15050v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15050v1-abstract-full" style="display: none;"> Topic taxonomy discovery aims at uncovering topics of different abstraction levels and constructing hierarchical relations between them. Unfortunately, most of prior work can hardly model semantic scopes of words and topics by holding the Euclidean embedding space assumption. What&#39;s worse, they infer asymmetric hierarchical relations by symmetric distances between topic embeddings. As a result, existing methods suffer from problems of low-quality topics at high abstraction levels and inaccurate hierarchical relations. To alleviate these problems, this paper develops a Box embedding-based Topic Model (BoxTM) that maps words and topics into the box embedding space, where the asymmetric metric is defined to properly infer hierarchical relations among topics. Additionally, our BoxTM explicitly infers upper-level topics based on correlation between specific topics through recursive clustering on topic boxes. Finally, extensive experiments validate high-quality of the topic taxonomy learned by BoxTM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15050v1-abstract-full').style.display = 'none'; document.getElementById('2408.15050v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to be published in TACL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00754">arXiv:2408.00754</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00754">pdf</a>, <a href="https://arxiv.org/format/2408.00754">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Coarse Correspondences Boost Spatial-Temporal Reasoning in Multimodal Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Benlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yiqin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zixian Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yansong Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+L">Luming Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei-Chiu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00754v2-abstract-short" style="display: inline;"> Multimodal language models (MLLMs) are increasingly being applied in real-world environments, necessitating their ability to interpret 3D spaces and comprehend temporal dynamics. Current methods often rely on specialized architectural designs or task-specific fine-tuning to achieve this. We introduce Coarse Correspondences, a simple lightweight method that enhances MLLMs&#39; spatial-temporal reasonin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00754v2-abstract-full').style.display = 'inline'; document.getElementById('2408.00754v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00754v2-abstract-full" style="display: none;"> Multimodal language models (MLLMs) are increasingly being applied in real-world environments, necessitating their ability to interpret 3D spaces and comprehend temporal dynamics. Current methods often rely on specialized architectural designs or task-specific fine-tuning to achieve this. We introduce Coarse Correspondences, a simple lightweight method that enhances MLLMs&#39; spatial-temporal reasoning with 2D images as input, without modifying the architecture or requiring task-specific fine-tuning. Our method uses a lightweight tracking model to identify primary object correspondences between frames in a video or across different image viewpoints, and then conveys this information to MLLMs through visual prompting. We demonstrate that this simple training-free approach brings substantial gains to GPT4-V/O consistently on four benchmarks that require spatial-temporal reasoning, including +20.5\% improvement on ScanQA, +9.7\% on OpenEQA&#39;s episodic memory subset, +6.0\% on the long-form video benchmark EgoSchema, and +11\% on the R2R navigation benchmark. Additionally, we show that Coarse Correspondences can also enhance open-source MLLMs&#39; spatial reasoning (by +6.9\% on ScanQA) when applied in both training and inference and that the improvement can generalize to unseen datasets such as SQA3D (+3.1\%). Taken together, we show that Coarse Correspondences effectively and efficiently boosts models&#39; performance on downstream tasks requiring spatial-temporal reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00754v2-abstract-full').style.display = 'none'; document.getElementById('2408.00754v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://coarse-correspondence.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18121">arXiv:2407.18121</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.18121">pdf</a>, <a href="https://arxiv.org/format/2407.18121">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Inference of Vision Instruction-Following Models with Elastic Cache </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Benlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiahui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18121v1-abstract-short" style="display: inline;"> In the field of instruction-following large vision-language models (LVLMs), the efficient deployment of these models faces challenges, notably due to the high memory demands of their key-value (KV) caches. Conventional cache management strategies for LLMs focus on cache eviction, which often fails to address the specific needs of multimodal instruction-following models. Recognizing this gap, in th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18121v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18121v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18121v1-abstract-full" style="display: none;"> In the field of instruction-following large vision-language models (LVLMs), the efficient deployment of these models faces challenges, notably due to the high memory demands of their key-value (KV) caches. Conventional cache management strategies for LLMs focus on cache eviction, which often fails to address the specific needs of multimodal instruction-following models. Recognizing this gap, in this paper, we introduce Elastic Cache, a novel approach that benefits from applying distinct acceleration methods for instruction encoding and output generation stages. We investigate the metrics of importance in different stages and propose an importance-driven cache merging strategy to prune redundancy caches. Instead of discarding less important caches, our strategy identifies important key/value vectors as anchor points. Surrounding less important caches are then merged with these anchors, enhancing the preservation of contextual information in the KV caches while yielding an arbitrary acceleration ratio. For instruction encoding, we utilize the frequency to evaluate the importance of caches. Regarding output generation, we prioritize tokens based on their distance with an offset, by which both the initial and most recent tokens are retained. Results on a range of LVLMs demonstrate that Elastic Cache not only boosts efficiency but also notably outperforms existing pruning methods in language generation across various tasks. Code is available at https://github.com/liuzuyan/ElasticCache <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18121v1-abstract-full').style.display = 'none'; document.getElementById('2407.18121v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16883">arXiv:2407.16883</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16883">pdf</a>, <a href="https://arxiv.org/format/2407.16883">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Standardized Machine-readable Dataset Documentation Format for Responsible AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+N">Nitisha Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Akhtar%2C+M">Mubashara Akhtar</a>, <a href="/search/cs?searchtype=author&amp;query=Giner-Miguelez%2C+J">Joan Giner-Miguelez</a>, <a href="/search/cs?searchtype=author&amp;query=Shinde%2C+R">Rajat Shinde</a>, <a href="/search/cs?searchtype=author&amp;query=Vanschoren%2C+J">Joaquin Vanschoren</a>, <a href="/search/cs?searchtype=author&amp;query=Vogler%2C+S">Steffen Vogler</a>, <a href="/search/cs?searchtype=author&amp;query=Goswami%2C+S">Sujata Goswami</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yuhan Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Santos%2C+T">Tim Santos</a>, <a href="/search/cs?searchtype=author&amp;query=Oala%2C+L">Luis Oala</a>, <a href="/search/cs?searchtype=author&amp;query=Karamousadakis%2C+M">Michalis Karamousadakis</a>, <a href="/search/cs?searchtype=author&amp;query=Maskey%2C+M">Manil Maskey</a>, <a href="/search/cs?searchtype=author&amp;query=Marcenac%2C+P">Pierre Marcenac</a>, <a href="/search/cs?searchtype=author&amp;query=Conforti%2C+C">Costanza Conforti</a>, <a href="/search/cs?searchtype=author&amp;query=Kuchnik%2C+M">Michael Kuchnik</a>, <a href="/search/cs?searchtype=author&amp;query=Aroyo%2C+L">Lora Aroyo</a>, <a href="/search/cs?searchtype=author&amp;query=Benjelloun%2C+O">Omar Benjelloun</a>, <a href="/search/cs?searchtype=author&amp;query=Simperl%2C+E">Elena Simperl</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16883v1-abstract-short" style="display: inline;"> Data is critical to advancing AI technologies, yet its quality and documentation remain significant challenges, leading to adverse downstream effects (e.g., potential biases) in AI applications. This paper addresses these issues by introducing Croissant-RAI, a machine-readable metadata format designed to enhance the discoverability, interoperability, and trustworthiness of AI datasets. Croissant-R&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16883v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16883v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16883v1-abstract-full" style="display: none;"> Data is critical to advancing AI technologies, yet its quality and documentation remain significant challenges, leading to adverse downstream effects (e.g., potential biases) in AI applications. This paper addresses these issues by introducing Croissant-RAI, a machine-readable metadata format designed to enhance the discoverability, interoperability, and trustworthiness of AI datasets. Croissant-RAI extends the Croissant metadata format and builds upon existing responsible AI (RAI) documentation frameworks, offering a standardized set of attributes and practices to facilitate community-wide adoption. Leveraging established web-publishing practices, such as Schema.org, Croissant-RAI enables dataset users to easily find and utilize RAI metadata regardless of the platform on which the datasets are published. Furthermore, it is seamlessly integrated into major data search engines, repositories, and machine learning frameworks, streamlining the reading and writing of responsible AI metadata within practitioners&#39; existing workflows. Croissant-RAI was developed through a community-led effort. It has been designed to be adaptable to evolving documentation requirements and is supported by a Python library and a visual editor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16883v1-abstract-full').style.display = 'none'; document.getElementById('2407.16883v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08349">arXiv:2407.08349</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08349">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spine Vision X-Ray Image based GUI Planning of Pedicle Screws Using Enhanced YOLOv5 for Vertebrae Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yashwanth Rao</a>, <a href="/search/cs?searchtype=author&amp;query=S%2C+G">Gaurisankar S</a>, <a href="/search/cs?searchtype=author&amp;query=R%2C+D">Durga R</a>, <a href="/search/cs?searchtype=author&amp;query=Purayath%2C+A">Aparna Purayath</a>, <a href="/search/cs?searchtype=author&amp;query=Maik%2C+V">Vivek Maik</a>, <a href="/search/cs?searchtype=author&amp;query=Lakshmanan%2C+M">Manojkumar Lakshmanan</a>, <a href="/search/cs?searchtype=author&amp;query=Sivaprakasm%2C+M">Mohanasankar Sivaprakasm</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08349v1-abstract-short" style="display: inline;"> In this paper, we propose an innovative Graphical User Interface (GUI) aimed at improving preoperative planning and intra-operative guidance for precise spinal screw placement through vertebrae segmentation. The methodology encompasses both front-end and back-end computations. The front end comprises a GUI that allows surgeons to precisely adjust the placement of screws on X-Ray images, thereby im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08349v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08349v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08349v1-abstract-full" style="display: none;"> In this paper, we propose an innovative Graphical User Interface (GUI) aimed at improving preoperative planning and intra-operative guidance for precise spinal screw placement through vertebrae segmentation. The methodology encompasses both front-end and back-end computations. The front end comprises a GUI that allows surgeons to precisely adjust the placement of screws on X-Ray images, thereby improving the simulation of surgical screw insertion in the patient&#39;s spine. On the other hand, the back-end processing involves several steps, including acquiring spinal X-ray images, performing pre-processing techniques to reduce noise, and training a neural network model to achieve real-time segmentation of the vertebrae. The integration of vertebral segmentation in the GUI ensures precise screw placement, reducing complications like nerve injury and ultimately improving surgical outcomes. The Spine-Vision provides a comprehensive solution with innovative features like synchronous AP-LP planning, accurate screw positioning via vertebrae segmentation, effective screw visualization, and dynamic position adjustments. This X-ray image-based GUI workflow emerges as a valuable tool, enhancing precision and safety in spinal screw placement and planning procedures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08349v1-abstract-full').style.display = 'none'; document.getElementById('2407.08349v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17934">arXiv:2405.17934</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.17934">pdf</a>, <a href="https://arxiv.org/format/2405.17934">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Proof of Quality: A Costless Paradigm for Trustless Generative AI Model Inference on Blockchains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yuyang Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+H">Hao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xiaokui Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17934v2-abstract-short" style="display: inline;"> Generative AI models, such as GPT-4 and Stable Diffusion, have demonstrated powerful and disruptive capabilities in natural language and image tasks. However, deploying these models in decentralized environments remains challenging. Unlike traditional centralized deployment, systematically guaranteeing the integrity of AI model services in fully decentralized environments, particularly on trustles&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17934v2-abstract-full').style.display = 'inline'; document.getElementById('2405.17934v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17934v2-abstract-full" style="display: none;"> Generative AI models, such as GPT-4 and Stable Diffusion, have demonstrated powerful and disruptive capabilities in natural language and image tasks. However, deploying these models in decentralized environments remains challenging. Unlike traditional centralized deployment, systematically guaranteeing the integrity of AI model services in fully decentralized environments, particularly on trustless blockchains, is both crucial and difficult. In this paper, we present a new inference paradigm called \emph{proof of quality} (PoQ) to enable the deployment of arbitrarily large generative models on blockchain architecture. Unlike traditional approaches based on validating inference procedures, such as ZKML or OPML, our PoQ paradigm focuses on the outcome quality of model inference. Using lightweight BERT-based cross-encoders as our underlying quality evaluation model, we design and implement PQML, the first practical protocol for real-world NLP generative model inference on blockchains, tailored for popular open-source models such as Llama 3 and Mixtral. Our analysis demonstrates that our protocol is robust against adversarial but rational participants in ecosystems, where lazy or dishonest behavior results in fewer benefits compared to well-behaving participants. The computational overhead of validating the quality evaluation is minimal, allowing quality validators to complete the quality check within a second, even using only a CPU. Preliminary simulation results show that PoQ consensus is generated in milliseconds, 1,000 times faster than any existing scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17934v2-abstract-full').style.display = 'none'; document.getElementById('2405.17934v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15010">arXiv:2404.15010</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15010">pdf</a>, <a href="https://arxiv.org/format/2404.15010">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-3D: Explicit 3D Structure Modeling for Point Cloud Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shuofeng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Haibin Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15010v1-abstract-short" style="display: inline;"> Numerous prior studies predominantly emphasize constructing relation vectors for individual neighborhood points and generating dynamic kernels for each vector and embedding these into high-dimensional spaces to capture implicit local structures. However, we contend that such implicit high-dimensional structure modeling approch inadequately represents the local geometric structure of point clouds d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15010v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15010v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15010v1-abstract-full" style="display: none;"> Numerous prior studies predominantly emphasize constructing relation vectors for individual neighborhood points and generating dynamic kernels for each vector and embedding these into high-dimensional spaces to capture implicit local structures. However, we contend that such implicit high-dimensional structure modeling approch inadequately represents the local geometric structure of point clouds due to the absence of explicit structural information. Hence, we introduce X-3D, an explicit 3D structure modeling approach. X-3D functions by capturing the explicit local structural information within the input 3D space and employing it to produce dynamic kernels with shared weights for all neighborhood points within the current local region. This modeling approach introduces effective geometric prior and significantly diminishes the disparity between the local structure of the embedding space and the original input point cloud, thereby improving the extraction of local features. Experiments show that our method can be used on a variety of methods and achieves state-of-the-art performance on segmentation, classification, detection tasks with lower extra computational cost, such as \textbf{90.7\%} on ScanObjectNN for classification, \textbf{79.2\%} on S3DIS 6 fold and \textbf{74.3\%} on S3DIS Area 5 for segmentation, \textbf{76.3\%} on ScanNetV2 for segmentation and \textbf{64.5\%} mAP , \textbf{46.9\%} mAP on SUN RGB-D and \textbf{69.0\%} mAP , \textbf{51.1\%} mAP on ScanNetV2 . Our code is available at \href{https://github.com/sunshuofeng/X-3D}{https://github.com/sunshuofeng/X-3D}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15010v1-abstract-full').style.display = 'none'; document.getElementById('2404.15010v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The IEEE/CVF Conference on Computer Vision and Pattern Recognition 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12966">arXiv:2403.12966</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12966">pdf</a>, <a href="https://arxiv.org/format/2403.12966">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12966v2-abstract-short" style="display: inline;"> In the realm of vision-language understanding, the proficiency of models in interpreting and reasoning over visual content has become a cornerstone for numerous applications. However, it is challenging for the visual encoder in Large Vision-Language Models (LVLMs) to extract useful features tailored to questions that aid the language model&#39;s response. Furthermore, a common practice among existing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12966v2-abstract-full').style.display = 'inline'; document.getElementById('2403.12966v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12966v2-abstract-full" style="display: none;"> In the realm of vision-language understanding, the proficiency of models in interpreting and reasoning over visual content has become a cornerstone for numerous applications. However, it is challenging for the visual encoder in Large Vision-Language Models (LVLMs) to extract useful features tailored to questions that aid the language model&#39;s response. Furthermore, a common practice among existing LVLMs is to utilize lower-resolution images, which restricts the ability for visual recognition. Our work introduces the Chain-of-Spot (CoS) method, which we describe as Interactive Reasoning, a novel approach that enhances feature extraction by focusing on key regions of interest (ROI) within the image, corresponding to the posed questions or instructions. This technique allows LVLMs to access more detailed visual information without altering the original image resolution, thereby offering multi-granularity image features. By integrating Chain-of-Spot with instruct-following LLaVA-1.5 models, the process of image reasoning consistently improves performance across a wide range of multimodal datasets and benchmarks without bells and whistles and achieves new state-of-the-art results. Our empirical findings demonstrate a significant improvement in LVLMs&#39; ability to understand and reason about visual content, paving the way for more sophisticated visual instruction-following applications. Code and models are available at https://github.com/dongyh20/Chain-of-Spot <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12966v2-abstract-full').style.display = 'none'; document.getElementById('2403.12966v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://sites.google.com/view/chain-of-spot/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13286">arXiv:2312.13286</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.13286">pdf</a>, <a href="https://arxiv.org/format/2312.13286">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Multimodal Models are In-Context Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Quan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Y">Yufeng Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaosong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qiying Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Z">Zhengxiong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yueze Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingjing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinlong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13286v2-abstract-short" style="display: inline;"> The human ability to easily solve multimodal tasks in context (i.e., with only a few demonstrations or simple instructions), is what current multimodal systems have largely struggled to imitate. In this work, we demonstrate that the task-agnostic in-context learning capabilities of large multimodal models can be significantly enhanced by effective scaling-up. We introduce Emu2, a generative multim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13286v2-abstract-full').style.display = 'inline'; document.getElementById('2312.13286v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13286v2-abstract-full" style="display: none;"> The human ability to easily solve multimodal tasks in context (i.e., with only a few demonstrations or simple instructions), is what current multimodal systems have largely struggled to imitate. In this work, we demonstrate that the task-agnostic in-context learning capabilities of large multimodal models can be significantly enhanced by effective scaling-up. We introduce Emu2, a generative multimodal model with 37 billion parameters, trained on large-scale multimodal sequences with a unified autoregressive objective. Emu2 exhibits strong multimodal in-context learning abilities, even emerging to solve tasks that require on-the-fly reasoning, such as visual prompting and object-grounded generation. The model sets a new record on multiple multimodal understanding tasks in few-shot settings. When instruction-tuned to follow specific instructions, Emu2 further achieves new state-of-the-art on challenging tasks such as question answering benchmarks for large multimodal models and open-ended subject-driven generation. These achievements demonstrate that Emu2 can serve as a base model and general-purpose interface for a wide range of multimodal tasks. Code and models are publicly available to facilitate future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13286v2-abstract-full').style.display = 'none'; document.getElementById('2312.13286v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024. Project page: https://baaivision.github.io/emu2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.06655">arXiv:2312.06655</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.06655">pdf</a>, <a href="https://arxiv.org/format/2312.06655">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sherpa3D: Boosting High-Fidelity Text-to-3D Generation via Coarse 3D Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fangfu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Diankun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Y">Yueqi Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.06655v1-abstract-short" style="display: inline;"> Recently, 3D content creation from text prompts has demonstrated remarkable progress by utilizing 2D and 3D diffusion models. While 3D diffusion models ensure great multi-view consistency, their ability to generate high-quality and diverse 3D assets is hindered by the limited 3D data. In contrast, 2D diffusion models find a distillation approach that achieves excellent generalization and rich deta&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.06655v1-abstract-full').style.display = 'inline'; document.getElementById('2312.06655v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.06655v1-abstract-full" style="display: none;"> Recently, 3D content creation from text prompts has demonstrated remarkable progress by utilizing 2D and 3D diffusion models. While 3D diffusion models ensure great multi-view consistency, their ability to generate high-quality and diverse 3D assets is hindered by the limited 3D data. In contrast, 2D diffusion models find a distillation approach that achieves excellent generalization and rich details without any 3D data. However, 2D lifting methods suffer from inherent view-agnostic ambiguity thereby leading to serious multi-face Janus issues, where text prompts fail to provide sufficient guidance to learn coherent 3D results. Instead of retraining a costly viewpoint-aware model, we study how to fully exploit easily accessible coarse 3D knowledge to enhance the prompts and guide 2D lifting optimization for refinement. In this paper, we propose Sherpa3D, a new text-to-3D framework that achieves high-fidelity, generalizability, and geometric consistency simultaneously. Specifically, we design a pair of guiding strategies derived from the coarse 3D prior generated by the 3D diffusion model: a structural guidance for geometric fidelity and a semantic guidance for 3D coherence. Employing the two types of guidance, the 2D diffusion model enriches the 3D content with diversified and high-quality results. Extensive experiments show the superiority of our Sherpa3D over the state-of-the-art text-to-3D methods in terms of quality and 3D consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.06655v1-abstract-full').style.display = 'none'; document.getElementById('2312.06655v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://liuff19.github.io/Sherpa3D/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04784">arXiv:2312.04784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.04784">pdf</a>, <a href="https://arxiv.org/format/2312.04784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reality&#39;s Canvas, Language&#39;s Brush: Crafting 3D Avatars from Monocular Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yuchen Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Pellitero%2C+E+P">Eduardo Perez Pellitero</a>, <a href="/search/cs?searchtype=author&amp;query=Busam%2C+B">Benjamin Busam</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yiren Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jifei Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04784v2-abstract-short" style="display: inline;"> Recent advancements in 3D avatar generation excel with multi-view supervision for photorealistic models. However, monocular counterparts lag in quality despite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is a fully-differentiable pipeline that learns high-fidelity 3D human avatars from just a single RGB video. A pose-conditioned deformable NeRF is optimized to volumetrical&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04784v2-abstract-full').style.display = 'inline'; document.getElementById('2312.04784v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04784v2-abstract-full" style="display: none;"> Recent advancements in 3D avatar generation excel with multi-view supervision for photorealistic models. However, monocular counterparts lag in quality despite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is a fully-differentiable pipeline that learns high-fidelity 3D human avatars from just a single RGB video. A pose-conditioned deformable NeRF is optimized to volumetrically represent a human subject in canonical T-pose. The canonical representation is then leveraged to efficiently associate neural textures using 2D-3D correspondences. This enables the separation of diffused color generation and lighting correction branches that jointly compose an RGB prediction. The design allows to control intermediate results for human pose, body shape, texture, and lighting with text prompts. An image-conditioned diffusion model thereby helps to animate appearance and pose of the 3D avatar to create video sequences with previously unseen human motion. Extensive experiments show that ReCaLaB outperforms previous monocular approaches in terms of image quality for image synthesis tasks. Moreover, natural language offers an intuitive user interface for creative manipulation of 3D human avatars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04784v2-abstract-full').style.display = 'none'; document.getElementById('2312.04784v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Video link: https://youtu.be/Oz83z1es2J4</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11857">arXiv:2309.11857</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.11857">pdf</a>, <a href="https://arxiv.org/format/2309.11857">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TCOVIS: Temporally Consistent Online Video Instance Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junlong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bingyao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.11857v1-abstract-short" style="display: inline;"> In recent years, significant progress has been made in video instance segmentation (VIS), with many offline and online methods achieving state-of-the-art performance. While offline methods have the advantage of producing temporally consistent predictions, they are not suitable for real-time scenarios. Conversely, online methods are more practical, but maintaining temporal consistency remains a cha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11857v1-abstract-full').style.display = 'inline'; document.getElementById('2309.11857v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11857v1-abstract-full" style="display: none;"> In recent years, significant progress has been made in video instance segmentation (VIS), with many offline and online methods achieving state-of-the-art performance. While offline methods have the advantage of producing temporally consistent predictions, they are not suitable for real-time scenarios. Conversely, online methods are more practical, but maintaining temporal consistency remains a challenging task. In this paper, we propose a novel online method for video instance segmentation, called TCOVIS, which fully exploits the temporal information in a video clip. The core of our method consists of a global instance assignment strategy and a spatio-temporal enhancement module, which improve the temporal consistency of the features from two aspects. Specifically, we perform global optimal matching between the predictions and ground truth across the whole video clip, and supervise the model with the global optimal objective. We also capture the spatial feature and aggregate it with the semantic feature between frames, thus realizing the spatio-temporal enhancement. We evaluate our method on four widely adopted VIS benchmarks, namely YouTube-VIS 2019/2021/2022 and OVIS, and achieve state-of-the-art performance on all benchmarks without bells-and-whistles. For instance, on YouTube-VIS 2021, TCOVIS achieves 49.5 AP and 61.3 AP with ResNet-50 and Swin-L backbones, respectively. Code is available at https://github.com/jun-long-li/TCOVIS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11857v1-abstract-full').style.display = 'none'; document.getElementById('2309.11857v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures. This paper has been accepted for ICCV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.05221">arXiv:2308.05221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.05221">pdf</a>, <a href="https://arxiv.org/format/2308.05221">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Alexa, play with robot: Introducing the First Alexa Prize SimBot Challenge on Embodied AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+H">Hangjie Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Ball%2C+L">Leslie Ball</a>, <a href="/search/cs?searchtype=author&amp;query=Thattai%2C+G">Govind Thattai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Desheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Lucy Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Q">Qiaozi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Shakiah%2C+S">Suhaila Shakiah</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiaofeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Padmakumar%2C+A">Aishwarya Padmakumar</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bofei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+C">Cadence Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Guthy%2C+D">Dinakar Guthy</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhatme%2C+G">Gaurav Sukhatme</a>, <a href="/search/cs?searchtype=author&amp;query=Arumugam%2C+K">Karthika Arumugam</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+M">Matthew Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Ipek%2C+O">Osman Ipek</a>, <a href="/search/cs?searchtype=author&amp;query=Lange%2C+P">Patrick Lange</a>, <a href="/search/cs?searchtype=author&amp;query=Khanna%2C+R">Rohan Khanna</a>, <a href="/search/cs?searchtype=author&amp;query=Pansare%2C+S">Shreyas Pansare</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+V">Vasu Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Flagg%2C+C">Cris Flagg</a>, <a href="/search/cs?searchtype=author&amp;query=Pressel%2C+D">Daniel Pressel</a>, <a href="/search/cs?searchtype=author&amp;query=Vaz%2C+L">Lavina Vaz</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+L">Luke Dai</a> , et al. (17 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.05221v1-abstract-short" style="display: inline;"> The Alexa Prize program has empowered numerous university students to explore, experiment, and showcase their talents in building conversational agents through challenges like the SocialBot Grand Challenge and the TaskBot Challenge. As conversational agents increasingly appear in multimodal and embodied contexts, it is important to explore the affordances of conversational interaction augmented wi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05221v1-abstract-full').style.display = 'inline'; document.getElementById('2308.05221v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.05221v1-abstract-full" style="display: none;"> The Alexa Prize program has empowered numerous university students to explore, experiment, and showcase their talents in building conversational agents through challenges like the SocialBot Grand Challenge and the TaskBot Challenge. As conversational agents increasingly appear in multimodal and embodied contexts, it is important to explore the affordances of conversational interaction augmented with computer vision and physical embodiment. This paper describes the SimBot Challenge, a new challenge in which university teams compete to build robot assistants that complete tasks in a simulated physical environment. This paper provides an overview of the SimBot Challenge, which included both online and offline challenge phases. We describe the infrastructure and support provided to the teams including Alexa Arena, the simulated environment, and the ML toolkit provided to teams to accelerate their building of vision and language models. We summarize the approaches the participating teams took to overcome research challenges and extract key lessons learned. Finally, we provide analysis of the performance of the competing SimBots during the competition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05221v1-abstract-full').style.display = 'none'; document.getElementById('2308.05221v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.14971">arXiv:2307.14971</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.14971">pdf</a>, <a href="https://arxiv.org/format/2307.14971">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.14971v2-abstract-short" style="display: inline;"> With the overwhelming trend of mask image modeling led by MAE, generative pre-training has shown a remarkable potential to boost the performance of fundamental models in 2D vision. However, in 3D vision, the over-reliance on Transformer-based backbones and the unordered nature of point clouds have restricted the further development of generative pre-training. In this paper, we propose a novel 3D-t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.14971v2-abstract-full').style.display = 'inline'; document.getElementById('2307.14971v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.14971v2-abstract-full" style="display: none;"> With the overwhelming trend of mask image modeling led by MAE, generative pre-training has shown a remarkable potential to boost the performance of fundamental models in 2D vision. However, in 3D vision, the over-reliance on Transformer-based backbones and the unordered nature of point clouds have restricted the further development of generative pre-training. In this paper, we propose a novel 3D-to-2D generative pre-training method that is adaptable to any point cloud model. We propose to generate view images from different instructed poses via the cross-attention mechanism as the pre-training scheme. Generating view images has more precise supervision than its point cloud counterpart, thus assisting 3D backbones to have a finer comprehension of the geometrical structure and stereoscopic relations of the point cloud. Experimental results have proved the superiority of our proposed 3D-to-2D generative pre-training over previous pre-training methods. Our method is also effective in boosting the performance of architecture-oriented approaches, achieving state-of-the-art performance when fine-tuning on ScanObjectNN classification and ShapeNetPart segmentation tasks. Code is available at https://github.com/wangzy22/TAP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.14971v2-abstract-full').style.display = 'none'; document.getElementById('2307.14971v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023, project page: https://tap.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.02153">arXiv:2303.02153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.02153">pdf</a>, <a href="https://arxiv.org/format/2303.02153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing Text-to-Image Diffusion Models for Visual Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Benlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.02153v1-abstract-short" style="display: inline;"> Diffusion models (DMs) have become the new trend of generative models and have demonstrated a powerful ability of conditional synthesis. Among those, text-to-image diffusion models pre-trained on large-scale image-text pairs are highly controllable by customizable prompts. Unlike the unconditional generative models that focus on low-level attributes and details, text-to-image diffusion models cont&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02153v1-abstract-full').style.display = 'inline'; document.getElementById('2303.02153v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.02153v1-abstract-full" style="display: none;"> Diffusion models (DMs) have become the new trend of generative models and have demonstrated a powerful ability of conditional synthesis. Among those, text-to-image diffusion models pre-trained on large-scale image-text pairs are highly controllable by customizable prompts. Unlike the unconditional generative models that focus on low-level attributes and details, text-to-image diffusion models contain more high-level knowledge thanks to the vision-language pre-training. In this paper, we propose VPD (Visual Perception with a pre-trained Diffusion model), a new framework that exploits the semantic information of a pre-trained text-to-image diffusion model in visual perception tasks. Instead of using the pre-trained denoising autoencoder in a diffusion-based pipeline, we simply use it as a backbone and aim to study how to take full advantage of the learned knowledge. Specifically, we prompt the denoising decoder with proper textual inputs and refine the text features with an adapter, leading to a better alignment to the pre-trained stage and making the visual contents interact with the text prompts. We also propose to utilize the cross-attention maps between the visual features and the text features to provide explicit guidance. Compared with other pre-training methods, we show that vision-language pre-trained diffusion models can be faster adapted to downstream visual perception tasks using the proposed VPD. Extensive experiments on semantic segmentation, referring image segmentation and depth estimation demonstrates the effectiveness of our method. Notably, VPD attains 0.254 RMSE on NYUv2 depth estimation and 73.3% oIoU on RefCOCO-val referring image segmentation, establishing new records on these two benchmarks. Code is available at https://github.com/wl-zhao/VPD <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02153v1-abstract-full').style.display = 'none'; document.getElementById('2303.02153v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://vpd.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.01586">arXiv:2303.01586</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.01586">pdf</a>, <a href="https://arxiv.org/format/2303.01586">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Alexa Arena: A User-Centric Interactive Platform for Embodied AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Q">Qiaozi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Thattai%2C+G">Govind Thattai</a>, <a href="/search/cs?searchtype=author&amp;query=Shakiah%2C+S">Suhaila Shakiah</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiaofeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Pansare%2C+S">Shreyas Pansare</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+V">Vasu Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhatme%2C+G">Gaurav Sukhatme</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+H">Hangjie Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bofei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+D">Desheng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Lucy Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Arumugam%2C+K">Karthika Arumugam</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shui Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+M">Matthew Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Guthy%2C+D">Dinakar Guthy</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+C">Cadence Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Khanna%2C+R">Rohan Khanna</a>, <a href="/search/cs?searchtype=author&amp;query=Ipek%2C+O">Osman Ipek</a>, <a href="/search/cs?searchtype=author&amp;query=Ball%2C+L">Leslie Ball</a>, <a href="/search/cs?searchtype=author&amp;query=Bland%2C+K">Kate Bland</a>, <a href="/search/cs?searchtype=author&amp;query=Rocker%2C+H">Heather Rocker</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yadunandana Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+M">Michael Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanadan%2C+R">Reza Ghanadan</a>, <a href="/search/cs?searchtype=author&amp;query=Mandal%2C+A">Arindam Mandal</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.01586v2-abstract-short" style="display: inline;"> We introduce Alexa Arena, a user-centric simulation platform for Embodied AI (EAI) research. Alexa Arena provides a variety of multi-room layouts and interactable objects, for the creation of human-robot interaction (HRI) missions. With user-friendly graphics and control mechanisms, Alexa Arena supports the development of gamified robotic tasks readily accessible to general human users, thus openi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01586v2-abstract-full').style.display = 'inline'; document.getElementById('2303.01586v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.01586v2-abstract-full" style="display: none;"> We introduce Alexa Arena, a user-centric simulation platform for Embodied AI (EAI) research. Alexa Arena provides a variety of multi-room layouts and interactable objects, for the creation of human-robot interaction (HRI) missions. With user-friendly graphics and control mechanisms, Alexa Arena supports the development of gamified robotic tasks readily accessible to general human users, thus opening a new venue for high-efficiency HRI data collection and EAI system evaluation. Along with the platform, we introduce a dialog-enabled instruction-following benchmark and provide baseline results for it. We make Alexa Arena publicly available to facilitate research in building generalizable and assistive embodied agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01586v2-abstract-full').style.display = 'none'; document.getElementById('2303.01586v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.04867">arXiv:2302.04867</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.04867">pdf</a>, <a href="https://arxiv.org/format/2302.04867">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Lujia Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.04867v4-abstract-short" style="display: inline;"> Diffusion probabilistic models (DPMs) have demonstrated a very promising ability in high-resolution image synthesis. However, sampling from a pre-trained DPM is time-consuming due to the multiple evaluations of the denoising network, making it more and more important to accelerate the sampling of DPMs. Despite recent progress in designing fast samplers, existing methods still cannot generate satis&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.04867v4-abstract-full').style.display = 'inline'; document.getElementById('2302.04867v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.04867v4-abstract-full" style="display: none;"> Diffusion probabilistic models (DPMs) have demonstrated a very promising ability in high-resolution image synthesis. However, sampling from a pre-trained DPM is time-consuming due to the multiple evaluations of the denoising network, making it more and more important to accelerate the sampling of DPMs. Despite recent progress in designing fast samplers, existing methods still cannot generate satisfying images in many applications where fewer steps (e.g., $&lt;$10) are favored. In this paper, we develop a unified corrector (UniC) that can be applied after any existing DPM sampler to increase the order of accuracy without extra model evaluations, and derive a unified predictor (UniP) that supports arbitrary order as a byproduct. Combining UniP and UniC, we propose a unified predictor-corrector framework called UniPC for the fast sampling of DPMs, which has a unified analytical form for any order and can significantly improve the sampling quality over previous methods, especially in extremely few steps. We evaluate our methods through extensive experiments including both unconditional and conditional sampling using pixel-space and latent-space DPMs. Our UniPC can achieve 3.87 FID on CIFAR10 (unconditional) and 7.51 FID on ImageNet 256$\times$256 (conditional) with only 10 function evaluations. Code is available at https://github.com/wl-zhao/UniPC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.04867v4-abstract-full').style.display = 'none'; document.getElementById('2302.04867v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2023. Project page: https://unipc.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.04545">arXiv:2301.04545</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.04545">pdf</a>, <a href="https://arxiv.org/format/2301.04545">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AdaPoinTr: Diverse Point Cloud Completion with Adaptive Geometry-Aware Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.04545v1-abstract-short" style="display: inline;"> In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a new model, called PoinTr, which adopts a Transformer encoder-decoder architecture for point cloud completion. By representing the point cloud as a set of unordered groups of points with position embeddings, we convert the input data to a sequence of point proxies and emp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.04545v1-abstract-full').style.display = 'inline'; document.getElementById('2301.04545v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.04545v1-abstract-full" style="display: none;"> In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a new model, called PoinTr, which adopts a Transformer encoder-decoder architecture for point cloud completion. By representing the point cloud as a set of unordered groups of points with position embeddings, we convert the input data to a sequence of point proxies and employ the Transformers for generation. To facilitate Transformers to better leverage the inductive bias about 3D geometric structures of point clouds, we further devise a geometry-aware block that models the local geometric relationships explicitly. The migration of Transformers enables our model to better learn structural knowledge and preserve detailed information for point cloud completion. Taking a step towards more complicated and diverse situations, we further propose AdaPoinTr by developing an adaptive query generation mechanism and designing a novel denoising task during completing a point cloud. Coupling these two techniques enables us to train the model efficiently and effectively: we reduce training time (by 15x or more) and improve completion performance (over 20%). We also show our method can be extended to the scene-level point cloud completion scenario by designing a new geometry-enhanced semantic scene completion framework. Extensive experiments on the existing and newly-proposed datasets demonstrate the effectiveness of our method, which attains 6.53 CD on PCN, 0.81 CD on ShapeNet-55 and 0.392 MMD on real-world KITTI, surpassing other work by a large margin and establishing new state-of-the-arts on various benchmarks. Most notably, AdaPoinTr can achieve such promising performance with higher throughputs and fewer FLOPs compared with the previous best methods in practice. The code and datasets are available at https://github.com/yuxumin/PoinTr <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.04545v1-abstract-full').style.display = 'none'; document.getElementById('2301.04545v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extension of our ICCV 2021 work: arXiv:2108.08839 . Code is available at https://github.com/yuxumin/PoinTr</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.04638">arXiv:2212.04638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.04638">pdf</a>, <a href="https://arxiv.org/format/2212.04638">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FLAG3D: A 3D Fitness Activity Dataset with Language Instruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yansong Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jinpeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Aoyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+W">Wenxun Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.04638v2-abstract-short" style="display: inline;"> With the continuously thriving popularity around the world, fitness activity analytic has become an emerging research topic in computer vision. While a variety of new tasks and algorithms have been proposed recently, there are growing hunger for data resources involved in high-quality data, fine-grained labels, and diverse environments. In this paper, we present FLAG3D, a large-scale 3D fitness ac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04638v2-abstract-full').style.display = 'inline'; document.getElementById('2212.04638v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.04638v2-abstract-full" style="display: none;"> With the continuously thriving popularity around the world, fitness activity analytic has become an emerging research topic in computer vision. While a variety of new tasks and algorithms have been proposed recently, there are growing hunger for data resources involved in high-quality data, fine-grained labels, and diverse environments. In this paper, we present FLAG3D, a large-scale 3D fitness activity dataset with language instruction containing 180K sequences of 60 categories. FLAG3D features the following three aspects: 1) accurate and dense 3D human pose captured from advanced MoCap system to handle the complex activity and large movement, 2) detailed and professional language instruction to describe how to perform a specific activity, 3) versatile video resources from a high-tech MoCap system, rendering software, and cost-effective smartphones in natural environments. Extensive experiments and in-depth analysis show that FLAG3D contributes great research value for various challenges, such as cross-domain human action recognition, dynamic human mesh recovery, and language-guided human action generation. Our dataset and source code are publicly available at https://andytang15.github.io/FLAG3D. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04638v2-abstract-full').style.display = 'none'; document.getElementById('2212.04638v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.01253">arXiv:2210.01253</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.01253">pdf</a>, <a href="https://arxiv.org/format/2210.01253">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PLOT: Prompt Learning with Optimal Transport for Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+W">Weiran Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiangchen Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinyue Li</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.01253v2-abstract-short" style="display: inline;"> With the increasing attention to large vision-language models such as CLIP, there has been a significant amount of effort dedicated to building efficient prompts. Unlike conventional methods of only learning one single prompt, we propose to learn multiple comprehensive prompts to describe diverse characteristics of categories such as intrinsic attributes or extrinsic contexts. However, directly ma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.01253v2-abstract-full').style.display = 'inline'; document.getElementById('2210.01253v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.01253v2-abstract-full" style="display: none;"> With the increasing attention to large vision-language models such as CLIP, there has been a significant amount of effort dedicated to building efficient prompts. Unlike conventional methods of only learning one single prompt, we propose to learn multiple comprehensive prompts to describe diverse characteristics of categories such as intrinsic attributes or extrinsic contexts. However, directly matching each prompt to the same visual feature is problematic, as it pushes the prompts to converge to one point. To solve this problem, we propose to apply optimal transport to match the vision and text modalities. Specifically, we first model images and the categories with visual and textual feature sets. Then, we apply a two-stage optimization strategy to learn the prompts. In the inner loop, we optimize the optimal transport distance to align visual features and prompts by the Sinkhorn algorithm, while in the outer loop, we learn the prompts by this distance from the supervised data. Extensive experiments are conducted on the few-shot recognition task and the improvement demonstrates the superiority of our method. The code is available at https://github.com/CHENGY12/PLOT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.01253v2-abstract-full').style.display = 'none'; document.getElementById('2210.01253v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2023, Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.05555">arXiv:2209.05555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.05555">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> An Embedding-Based Grocery Search Model at Instacart </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yuqing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Na%2C+T">Taesik Na</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xiao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Manchanda%2C+S">Saurav Manchanda</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Young Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhihong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+G">Guanghua Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Vasiete%2C+E">Esther Vasiete</a>, <a href="/search/cs?searchtype=author&amp;query=Tenneti%2C+T">Tejaswi Tenneti</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haixun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.05555v1-abstract-short" style="display: inline;"> The key to e-commerce search is how to best utilize the large yet noisy log data. In this paper, we present our embedding-based model for grocery search at Instacart. The system learns query and product representations with a two-tower transformer-based encoder architecture. To tackle the cold-start problem, we focus on content-based features. To train the model efficiently on noisy data, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05555v1-abstract-full').style.display = 'inline'; document.getElementById('2209.05555v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.05555v1-abstract-full" style="display: none;"> The key to e-commerce search is how to best utilize the large yet noisy log data. In this paper, we present our embedding-based model for grocery search at Instacart. The system learns query and product representations with a two-tower transformer-based encoder architecture. To tackle the cold-start problem, we focus on content-based features. To train the model efficiently on noisy data, we propose a self-adversarial learning method and a cascade training method. AccOn an offline human evaluation dataset, we achieve 10% relative improvement in RECALL@20, and for online A/B testing, we achieve 4.1% cart-adds per search (CAPS) and 1.5% gross merchandise value (GMV) improvement. We describe how we train and deploy the embedding based search model and give a detailed analysis of the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05555v1-abstract-full').style.display = 'none'; document.getElementById('2209.05555v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGIR eCom, July 15, 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.02812">arXiv:2208.02812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.02812">pdf</a>, <a href="https://arxiv.org/format/2208.02812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> P2P: Tuning Pre-trained Image Models for Point Cloud Analysis with Point-to-Pixel Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.02812v2-abstract-short" style="display: inline;"> Nowadays, pre-training big models on large-scale datasets has become a crucial topic in deep learning. The pre-trained models with high representation ability and transferability achieve a great success and dominate many downstream tasks in natural language processing and 2D vision. However, it is non-trivial to promote such a pretraining-tuning paradigm to the 3D vision, given the limited trainin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.02812v2-abstract-full').style.display = 'inline'; document.getElementById('2208.02812v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.02812v2-abstract-full" style="display: none;"> Nowadays, pre-training big models on large-scale datasets has become a crucial topic in deep learning. The pre-trained models with high representation ability and transferability achieve a great success and dominate many downstream tasks in natural language processing and 2D vision. However, it is non-trivial to promote such a pretraining-tuning paradigm to the 3D vision, given the limited training data that are relatively inconvenient to collect. In this paper, we provide a new perspective of leveraging pre-trained 2D knowledge in 3D domain to tackle this problem, tuning pre-trained image models with the novel Point-to-Pixel prompting for point cloud analysis at a minor parameter cost. Following the principle of prompting engineering, we transform point clouds into colorful images with geometry-preserved projection and geometry-aware coloring to adapt to pre-trained image models, whose weights are kept frozen during the end-to-end optimization of point cloud analysis tasks. We conduct extensive experiments to demonstrate that cooperating with our proposed Point-to-Pixel Prompting, better pre-trained image model will lead to consistently better performance in 3D vision. Enjoying prosperous development from image pre-training field, our method attains 89.3% accuracy on the hardest setting of ScanObjectNN, surpassing conventional point cloud models with much fewer trainable parameters. Our framework also exhibits very competitive performance on ModelNet classification and ShapeNet Part Segmentation. Code is available at https://github.com/wangzy22/P2P. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.02812v2-abstract-full').style.display = 'none'; document.getElementById('2208.02812v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2022, project page: https://p2p.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.14284">arXiv:2207.14284</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.14284">pdf</a>, <a href="https://arxiv.org/format/2207.14284">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HorNet: Efficient High-Order Spatial Interactions with Recursive Gated Convolutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yansong Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+S">Ser-Nam Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.14284v3-abstract-short" style="display: inline;"> Recent progress in vision Transformers exhibits great success in various tasks driven by the new spatial modeling mechanism based on dot-product self-attention. In this paper, we show that the key ingredients behind the vision Transformers, namely input-adaptive, long-range and high-order spatial interactions, can also be efficiently implemented with a convolution-based framework. We present the R&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14284v3-abstract-full').style.display = 'inline'; document.getElementById('2207.14284v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.14284v3-abstract-full" style="display: none;"> Recent progress in vision Transformers exhibits great success in various tasks driven by the new spatial modeling mechanism based on dot-product self-attention. In this paper, we show that the key ingredients behind the vision Transformers, namely input-adaptive, long-range and high-order spatial interactions, can also be efficiently implemented with a convolution-based framework. We present the Recursive Gated Convolution ($\textit{g}^\textit{n}$Conv) that performs high-order spatial interactions with gated convolutions and recursive designs. The new operation is highly flexible and customizable, which is compatible with various variants of convolution and extends the two-order interactions in self-attention to arbitrary orders without introducing significant extra computation. $\textit{g}^\textit{n}$Conv can serve as a plug-and-play module to improve various vision Transformers and convolution-based models. Based on the operation, we construct a new family of generic vision backbones named HorNet. Extensive experiments on ImageNet classification, COCO object detection and ADE20K semantic segmentation show HorNet outperform Swin Transformers and ConvNeXt by a significant margin with similar overall architecture and training configurations. HorNet also shows favorable scalability to more training data and larger model sizes. Apart from the effectiveness in visual encoders, we also show $\textit{g}^\textit{n}$Conv can be applied to task-specific decoders and consistently improve dense prediction performance with less computation. Our results demonstrate that $\textit{g}^\textit{n}$Conv can be a new basic module for visual modeling that effectively combines the merits of both vision Transformers and CNNs. Code is available at https://github.com/raoyongming/HorNet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14284v3-abstract-full').style.display = 'none'; document.getElementById('2207.14284v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://hornet.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.01580">arXiv:2207.01580</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.01580">pdf</a>, <a href="https://arxiv.org/format/2207.01580">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Spatial Sparsification for Efficient Vision Transformers and Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.01580v2-abstract-short" style="display: inline;"> In this paper, we present a new approach for model acceleration by exploiting spatial sparsity in visual data. We observe that the final prediction in vision Transformers is only based on a subset of the most informative tokens, which is sufficient for accurate image recognition. Based on this observation, we propose a dynamic token sparsification framework to prune redundant tokens progressively&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01580v2-abstract-full').style.display = 'inline'; document.getElementById('2207.01580v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.01580v2-abstract-full" style="display: none;"> In this paper, we present a new approach for model acceleration by exploiting spatial sparsity in visual data. We observe that the final prediction in vision Transformers is only based on a subset of the most informative tokens, which is sufficient for accurate image recognition. Based on this observation, we propose a dynamic token sparsification framework to prune redundant tokens progressively and dynamically based on the input to accelerate vision Transformers. Specifically, we devise a lightweight prediction module to estimate the importance score of each token given the current features. The module is added to different layers to prune redundant tokens hierarchically. While the framework is inspired by our observation of the sparse attention in vision Transformers, we find the idea of adaptive and asymmetric computation can be a general solution for accelerating various architectures. We extend our method to hierarchical models including CNNs and hierarchical vision Transformers as well as more complex dense prediction tasks that require structured feature maps by formulating a more generic dynamic spatial sparsification framework with progressive sparsification and asymmetric computation for different spatial locations. By applying lightweight fast paths to less informative features and using more expressive slow paths to more important locations, we can maintain the structure of feature maps while significantly reducing the overall computations. Extensive experiments demonstrate the effectiveness of our framework on various modern architectures and different visual recognition tasks. Our results clearly demonstrate that dynamic spatial sparsification offers a new and more effective dimension for model acceleration. Code is available at https://github.com/raoyongming/DynamicViT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01580v2-abstract-full').style.display = 'none'; document.getElementById('2207.01580v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to T-PAMI. Journal version of our NeurIPS 2021 work: arXiv:2106.02034. Code is available at https://github.com/raoyongming/DynamicViT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.11228">arXiv:2206.11228</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.11228">pdf</a>, <a href="https://arxiv.org/format/2206.11228">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adversarially trained neural representations may already be as robust as corresponding biological neural representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+M+J">Michael J. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Leclerc%2C+G">Guillaume Leclerc</a>, <a href="/search/cs?searchtype=author&amp;query=Dapello%2C+J">Joel Dapello</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yug Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Madry%2C+A">Aleksander Madry</a>, <a href="/search/cs?searchtype=author&amp;query=DiCarlo%2C+J+J">James J. DiCarlo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.11228v1-abstract-short" style="display: inline;"> Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.11228v1-abstract-full').style.display = 'inline'; document.getElementById('2206.11228v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.11228v1-abstract-full" style="display: none;"> Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.11228v1-abstract-full').style.display = 'none'; document.getElementById('2206.11228v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, ICML2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04916">arXiv:2206.04916</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.04916">pdf</a>, <a href="https://arxiv.org/format/2206.04916">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PatchComplete: Learning Multi-Resolution Patch Priors for 3D Shape Completion on Unseen Categories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yuchen Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+Y">Yinyu Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+A">Angela Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04916v2-abstract-short" style="display: inline;"> While 3D shape representations enable powerful reasoning in many visual and perception applications, learning 3D shape priors tends to be constrained to the specific categories trained on, leading to an inefficient learning process, particularly for general applications with unseen categories. Thus, we propose PatchComplete, which learns effective shape priors based on multi-resolution local patch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04916v2-abstract-full').style.display = 'inline'; document.getElementById('2206.04916v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04916v2-abstract-full" style="display: none;"> While 3D shape representations enable powerful reasoning in many visual and perception applications, learning 3D shape priors tends to be constrained to the specific categories trained on, leading to an inefficient learning process, particularly for general applications with unseen categories. Thus, we propose PatchComplete, which learns effective shape priors based on multi-resolution local patches, which are often more general than full shapes (e.g., chairs and tables often both share legs) and thus enable geometric reasoning about unseen class categories. To learn these shared substructures, we learn multi-resolution patch priors across all train categories, which are then associated to input partial shape observations by attention across the patch priors, and finally decoded into a complete shape reconstruction. Such patch-based priors avoid overfitting to specific train categories and enable reconstruction on entirely unseen categories at test time. We demonstrate the effectiveness of our approach on synthetic ShapeNet data as well as challenging real-scanned objects from ScanNet, which include noise and clutter, improving over state of the art in novel-category shape completion by 19.3% in chamfer distance on ShapeNet, and 9.0% for ScanNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04916v2-abstract-full').style.display = 'none'; document.getElementById('2206.04916v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Video link: https://www.youtube.com/watch?v=Ch1rvw2D_Kc ; Project page: https://yuchenrao.github.io/projects/patchComplete/patchComplete.html ; Accepted to NeurIPS&#39;22</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.13490">arXiv:2205.13490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.13490">pdf</a>, <a href="https://arxiv.org/format/2205.13490">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SemAffiNet: Semantic-Affine Transformation for Point Cloud Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.13490v1-abstract-short" style="display: inline;"> Conventional point cloud semantic segmentation methods usually employ an encoder-decoder architecture, where mid-level features are locally aggregated to extract geometric information. However, the over-reliance on these class-agnostic local geometric representations may raise confusion between local parts from different categories that are similar in appearance or spatially adjacent. To address t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13490v1-abstract-full').style.display = 'inline'; document.getElementById('2205.13490v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.13490v1-abstract-full" style="display: none;"> Conventional point cloud semantic segmentation methods usually employ an encoder-decoder architecture, where mid-level features are locally aggregated to extract geometric information. However, the over-reliance on these class-agnostic local geometric representations may raise confusion between local parts from different categories that are similar in appearance or spatially adjacent. To address this issue, we argue that mid-level features can be further enhanced with semantic information, and propose semantic-affine transformation that transforms features of mid-level points belonging to different categories with class-specific affine parameters. Based on this technique, we propose SemAffiNet for point cloud semantic segmentation, which utilizes the attention mechanism in the Transformer module to implicitly and explicitly capture global structural knowledge within local parts for overall comprehension of each category. We conduct extensive experiments on the ScanNetV2 and NYUv2 datasets, and evaluate semantic-affine transformation on various 3D point cloud and 2D image segmentation baselines, where both qualitative and quantitative results demonstrate the superiority and generalization ability of our proposed approach. Code is available at https://github.com/wangzy22/SemAffiNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13490v1-abstract-full').style.display = 'none'; document.getElementById('2205.13490v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.03646">arXiv:2204.03646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.03646">pdf</a>, <a href="https://arxiv.org/format/2204.03646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FineDiving: A Fine-grained Dataset for Procedure-aware Action Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jinglin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.03646v1-abstract-short" style="display: inline;"> Most existing action quality assessment methods rely on the deep features of an entire video to predict the score, which is less reliable due to the non-transparent inference process and poor interpretability. We argue that understanding both high-level semantics and internal temporal structures of actions in competitive sports videos is the key to making predictions accurate and interpretable. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03646v1-abstract-full').style.display = 'inline'; document.getElementById('2204.03646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.03646v1-abstract-full" style="display: none;"> Most existing action quality assessment methods rely on the deep features of an entire video to predict the score, which is less reliable due to the non-transparent inference process and poor interpretability. We argue that understanding both high-level semantics and internal temporal structures of actions in competitive sports videos is the key to making predictions accurate and interpretable. Towards this goal, we construct a new fine-grained dataset, called FineDiving, developed on diverse diving events with detailed annotations on action procedures. We also propose a procedure-aware approach for action quality assessment, learned by a new Temporal Segmentation Attention module. Specifically, we propose to parse pairwise query and exemplar action instances into consecutive steps with diverse semantic and temporal correspondences. The procedure-aware cross-attention is proposed to learn embeddings between query and exemplar steps to discover their semantic, spatial, and temporal correspondences, and further serve for fine-grained contrastive regression to derive a reliable scoring mechanism. Extensive experiments demonstrate that our approach achieves substantial improvements over state-of-the-art methods with better interpretability. The dataset and code are available at \url{https://github.com/xujinglin/FineDiving}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03646v1-abstract-full').style.display = 'none'; document.getElementById('2204.03646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Computer Vision and Pattern Recognition 2022 (Oral presentation)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.03636">arXiv:2204.03636</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.03636">pdf</a>, <a href="https://arxiv.org/format/2204.03636">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SurroundDepth: Entangling Surrounding Views for Self-Supervised Multi-Camera Depth Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Linqing Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+G">Guan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.03636v3-abstract-short" style="display: inline;"> Depth estimation from images serves as the fundamental step of 3D perception for autonomous driving and is an economical alternative to expensive depth sensors like LiDAR. The temporal photometric constraints enables self-supervised depth estimation without labels, further facilitating its application. However, most existing methods predict the depth solely based on each monocular image and ignore&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03636v3-abstract-full').style.display = 'inline'; document.getElementById('2204.03636v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.03636v3-abstract-full" style="display: none;"> Depth estimation from images serves as the fundamental step of 3D perception for autonomous driving and is an economical alternative to expensive depth sensors like LiDAR. The temporal photometric constraints enables self-supervised depth estimation without labels, further facilitating its application. However, most existing methods predict the depth solely based on each monocular image and ignore the correlations among multiple surrounding cameras, which are typically available for modern self-driving vehicles. In this paper, we propose a SurroundDepth method to incorporate the information from multiple surrounding views to predict depth maps across cameras. Specifically, we employ a joint network to process all the surrounding views and propose a cross-view transformer to effectively fuse the information from multiple views. We apply cross-view self-attention to efficiently enable the global interactions between multi-camera feature maps. Different from self-supervised monocular depth estimation, we are able to predict real-world scales given multi-camera extrinsic matrices. To achieve this goal, we adopt the two-frame structure-from-motion to extract scale-aware pseudo depths to pretrain the models. Further, instead of predicting the ego-motion of each individual camera, we estimate a universal ego-motion of the vehicle and transfer it to each view to achieve multi-view ego-motion consistency. In experiments, our method achieves the state-of-the-art performance on the challenging multi-camera depth estimation datasets DDAD and nuScenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03636v3-abstract-full').style.display = 'none'; document.getElementById('2204.03636v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CoRL 2022. Project page: https://surrounddepth.ivg-research.xyz Code: https://github.com/weiyithu/SurroundDepth</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.14956">arXiv:2203.14956</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.14956">pdf</a>, <a href="https://arxiv.org/format/2203.14956">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LiDAR Distillation: Bridging the Beam-Induced Domain Gap for 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zibu Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.14956v2-abstract-short" style="display: inline;"> In this paper, we propose the LiDAR Distillation to bridge the domain gap induced by different LiDAR beams for 3D object detection. In many real-world applications, the LiDAR points used by mass-produced robots and vehicles usually have fewer beams than that in large-scale public datasets. Moreover, as the LiDARs are upgraded to other product models with different beam amount, it becomes challengi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14956v2-abstract-full').style.display = 'inline'; document.getElementById('2203.14956v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.14956v2-abstract-full" style="display: none;"> In this paper, we propose the LiDAR Distillation to bridge the domain gap induced by different LiDAR beams for 3D object detection. In many real-world applications, the LiDAR points used by mass-produced robots and vehicles usually have fewer beams than that in large-scale public datasets. Moreover, as the LiDARs are upgraded to other product models with different beam amount, it becomes challenging to utilize the labeled data captured by previous versions&#39; high-resolution sensors. Despite the recent progress on domain adaptive 3D detection, most methods struggle to eliminate the beam-induced domain gap. We find that it is essential to align the point cloud density of the source domain with that of the target domain during the training process. Inspired by this discovery, we propose a progressive framework to mitigate the beam-induced domain shift. In each iteration, we first generate low-beam pseudo LiDAR by downsampling the high-beam point clouds. Then the teacher-student framework is employed to distill rich information from the data with more beams. Extensive experiments on Waymo, nuScenes and KITTI datasets with three different LiDAR-based detectors demonstrate the effectiveness of our LiDAR Distillation. Notably, our approach does not increase any additional computation cost for inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14956v2-abstract-full').style.display = 'none'; document.getElementById('2203.14956v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2022. Code is available at https://github.com/weiyithu/LiDAR-Distillation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.14101">arXiv:2203.14101</a> <span>&nbsp;&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Roadmap for Big Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Sha Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shuai Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Leng%2C+J">Jiahong Leng</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yangxiao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaozhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jifan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+X">Xin Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zhou Shao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jiaao He</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yizhao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Ming Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+C">Cong Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yisen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+M">Mingsheng Long</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+T">Tianyu Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+P">Peng Cui</a> , et al. (75 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.14101v4-abstract-short" style="display: inline;"> With the rapid development of deep learning, training Big Models (BMs) for multiple downstream tasks becomes a popular paradigm. Researchers have achieved various outcomes in the construction of BMs and the BM application in many fields. At present, there is a lack of research work that sorts out the overall progress of BMs and guides the follow-up research. In this paper, we cover not only the BM&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14101v4-abstract-full').style.display = 'inline'; document.getElementById('2203.14101v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.14101v4-abstract-full" style="display: none;"> With the rapid development of deep learning, training Big Models (BMs) for multiple downstream tasks becomes a popular paradigm. Researchers have achieved various outcomes in the construction of BMs and the BM application in many fields. At present, there is a lack of research work that sorts out the overall progress of BMs and guides the follow-up research. In this paper, we cover not only the BM technologies themselves but also the prerequisites for BM training and applications with BMs, dividing the BM review into four parts: Resource, Models, Key Technologies and Application. We introduce 16 specific BM-related topics in those four parts, they are Data, Knowledge, Computing System, Parallel Training System, Language Model, Vision Model, Multi-modal Model, Theory&amp;Interpretability, Commonsense Reasoning, Reliability&amp;Security, Governance, Evaluation, Machine Translation, Text Generation, Dialogue and Protein Research. In each topic, we summarize clearly the current studies and propose some future research directions. At the end of this paper, we conclude the further development of BMs in a more general view. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14101v4-abstract-full').style.display = 'none'; document.getElementById('2203.14101v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This report has been withdrawn by the authors due to critical issues in Section 2.3.1 of Article 2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13777">arXiv:2203.13777</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.13777">pdf</a>, <a href="https://arxiv.org/format/2203.13777">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Trajectory Prediction via Motion Indeterminacy Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+T">Tianpei Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junlong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chunze Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13777v1-abstract-short" style="display: inline;"> Human behavior has the nature of indeterminacy, which requires the pedestrian trajectory prediction system to model the multi-modality of future motion states. Unlike existing stochastic trajectory prediction methods which usually use a latent variable to represent multi-modality, we explicitly simulate the process of human motion variation from indeterminate to determinate. In this paper, we pres&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13777v1-abstract-full').style.display = 'inline'; document.getElementById('2203.13777v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13777v1-abstract-full" style="display: none;"> Human behavior has the nature of indeterminacy, which requires the pedestrian trajectory prediction system to model the multi-modality of future motion states. Unlike existing stochastic trajectory prediction methods which usually use a latent variable to represent multi-modality, we explicitly simulate the process of human motion variation from indeterminate to determinate. In this paper, we present a new framework to formulate the trajectory prediction task as a reverse process of motion indeterminacy diffusion (MID), in which we progressively discard indeterminacy from all the walkable areas until reaching the desired trajectory. This process is learned with a parameterized Markov chain conditioned by the observed trajectories. We can adjust the length of the chain to control the degree of indeterminacy and balance the diversity and determinacy of the predictions. Specifically, we encode the history behavior information and the social interactions as a state embedding and devise a Transformer-based diffusion model to capture the temporal dependencies of trajectories. Extensive experiments on the human trajectory prediction benchmarks including the Stanford Drone and ETH/UCY datasets demonstrate the superiority of our method. Code is available at https://github.com/gutianpei/MID. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13777v1-abstract-full').style.display = 'none'; document.getElementById('2203.13777v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.05238">arXiv:2203.05238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.05238">pdf</a>, <a href="https://arxiv.org/format/2203.05238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Back to Reality: Weakly-supervised 3D Object Detection with Shape-guided Label Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xiuwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yu Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.05238v3-abstract-short" style="display: inline;"> In this paper, we propose a weakly-supervised approach for 3D object detection, which makes it possible to train a strong 3D detector with position-level annotations (i.e. annotations of object centers). In order to remedy the information loss from box annotations to centers, our method, namely Back to Reality (BR), makes use of synthetic 3D shapes to convert the weak labels into fully-annotated v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05238v3-abstract-full').style.display = 'inline'; document.getElementById('2203.05238v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.05238v3-abstract-full" style="display: none;"> In this paper, we propose a weakly-supervised approach for 3D object detection, which makes it possible to train a strong 3D detector with position-level annotations (i.e. annotations of object centers). In order to remedy the information loss from box annotations to centers, our method, namely Back to Reality (BR), makes use of synthetic 3D shapes to convert the weak labels into fully-annotated virtual scenes as stronger supervision, and in turn utilizes the perfect virtual labels to complement and refine the real labels. Specifically, we first assemble 3D shapes into physically reasonable virtual scenes according to the coarse scene layout extracted from position-level annotations. Then we go back to reality by applying a virtual-to-real domain adaptation method, which refine the weak labels and additionally supervise the training of detector with the virtual scenes. Furthermore, we propose a more challenging benckmark for indoor 3D object detection with more diversity in object sizes to better show the potential of BR. With less than 5% of the labeling labor, we achieve comparable detection performance with some popular fully-supervised approaches on the widely used ScanNet dataset. Code is available at: https://github.com/wyf-ACCEPT/BackToReality <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05238v3-abstract-full').style.display = 'none'; document.getElementById('2203.05238v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.12053">arXiv:2112.12053</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.12053">pdf</a>, <a href="https://arxiv.org/format/2112.12053">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Partial (MVP) Point Cloud Challenge 2021 on Completion and Registration: Methods and Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+L">Liang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zhongang Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingye Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xiaoyuan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+K">Kexue Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Manning Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yali Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Junsheng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+X">Xin Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+P">Peng Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu-Shen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zhizhong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yuanjie Yan</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+J">Junyi An</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lifa Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Changwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dongrui Liu</a> , et al. (4 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.12053v1-abstract-short" style="display: inline;"> As real-scanned point clouds are mostly partial due to occlusions and viewpoints, reconstructing complete 3D shapes based on incomplete observations becomes a fundamental problem for computer vision. With a single incomplete point cloud, it becomes the partial point cloud completion problem. Given multiple different observations, 3D reconstruction can be addressed by performing partial-to-partial&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.12053v1-abstract-full').style.display = 'inline'; document.getElementById('2112.12053v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.12053v1-abstract-full" style="display: none;"> As real-scanned point clouds are mostly partial due to occlusions and viewpoints, reconstructing complete 3D shapes based on incomplete observations becomes a fundamental problem for computer vision. With a single incomplete point cloud, it becomes the partial point cloud completion problem. Given multiple different observations, 3D reconstruction can be addressed by performing partial-to-partial point cloud registration. Recently, a large-scale Multi-View Partial (MVP) point cloud dataset has been released, which consists of over 100,000 high-quality virtual-scanned partial point clouds. Based on the MVP dataset, this paper reports methods and results in the Multi-View Partial Point Cloud Challenge 2021 on Completion and Registration. In total, 128 participants registered for the competition, and 31 teams made valid submissions. The top-ranked solutions will be analyzed, and then we will discuss future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.12053v1-abstract-full').style.display = 'none'; document.getElementById('2112.12053v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 13 figures, ICCV2021 Workshop Technique Report, the codebase webpage: https://github.com/paul007pl/MVP_Benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.01518">arXiv:2112.01518</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.01518">pdf</a>, <a href="https://arxiv.org/format/2112.01518">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DenseCLIP: Language-Guided Dense Prediction with Context-Aware Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yansong Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+G">Guan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.01518v2-abstract-short" style="display: inline;"> Recent progress has shown that large-scale pre-training using contrastive image-text pairs can be a promising alternative for high-quality visual representation learning from natural language supervision. Benefiting from a broader source of supervision, this new paradigm exhibits impressive transferability to downstream classification tasks and datasets. However, the problem of transferring the kn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.01518v2-abstract-full').style.display = 'inline'; document.getElementById('2112.01518v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.01518v2-abstract-full" style="display: none;"> Recent progress has shown that large-scale pre-training using contrastive image-text pairs can be a promising alternative for high-quality visual representation learning from natural language supervision. Benefiting from a broader source of supervision, this new paradigm exhibits impressive transferability to downstream classification tasks and datasets. However, the problem of transferring the knowledge learned from image-text pairs to more complex dense prediction tasks has barely been visited. In this work, we present a new framework for dense prediction by implicitly and explicitly leveraging the pre-trained knowledge from CLIP. Specifically, we convert the original image-text matching problem in CLIP to a pixel-text matching problem and use the pixel-text score maps to guide the learning of dense prediction models. By further using the contextual information from the image to prompt the language model, we are able to facilitate our model to better exploit the pre-trained knowledge. Our method is model-agnostic, which can be applied to arbitrary dense prediction systems and various pre-trained visual backbones including both CLIP models and ImageNet pre-trained models. Extensive experiments demonstrate the superior performance of our methods on semantic segmentation, object detection, and instance segmentation tasks. Code is available at https://github.com/raoyongming/DenseCLIP <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.01518v2-abstract-full').style.display = 'none'; document.getElementById('2112.01518v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR2022. Project page: https://denseclip.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.14819">arXiv:2111.14819</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.14819">pdf</a>, <a href="https://arxiv.org/format/2111.14819">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+L">Lulu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.14819v2-abstract-short" style="display: inline;"> We present Point-BERT, a new paradigm for learning Transformers to generalize the concept of BERT to 3D point cloud. Inspired by BERT, we devise a Masked Point Modeling (MPM) task to pre-train point cloud Transformers. Specifically, we first divide a point cloud into several local point patches, and a point cloud Tokenizer with a discrete Variational AutoEncoder (dVAE) is designed to generate disc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14819v2-abstract-full').style.display = 'inline'; document.getElementById('2111.14819v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.14819v2-abstract-full" style="display: none;"> We present Point-BERT, a new paradigm for learning Transformers to generalize the concept of BERT to 3D point cloud. Inspired by BERT, we devise a Masked Point Modeling (MPM) task to pre-train point cloud Transformers. Specifically, we first divide a point cloud into several local point patches, and a point cloud Tokenizer with a discrete Variational AutoEncoder (dVAE) is designed to generate discrete point tokens containing meaningful local information. Then, we randomly mask out some patches of input point clouds and feed them into the backbone Transformers. The pre-training objective is to recover the original point tokens at the masked locations under the supervision of point tokens obtained by the Tokenizer. Extensive experiments demonstrate that the proposed BERT-style pre-training strategy significantly improves the performance of standard point cloud Transformers. Equipped with our pre-training strategy, we show that a pure Transformer architecture attains 93.8% accuracy on ModelNet40 and 83.1% accuracy on the hardest setting of ScanObjectNN, surpassing carefully designed point cloud models with much fewer hand-made designs. We also demonstrate that the representations learned by Point-BERT transfer well to new tasks and domains, where our models largely advance the state-of-the-art of few-shot point cloud classification task. The code and pre-trained models are available at https://github.com/lulutang0608/Point-BERT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14819v2-abstract-full').style.display = 'none'; document.getElementById('2111.14819v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2022, Project page: https://point-bert.ivg-research.xyz</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.14094">arXiv:2111.14094</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.14094">pdf</a>, <a href="https://arxiv.org/format/2111.14094">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Topic Driven Adaptive Network for Cross-Domain Sentiment Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yicheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Y">Yiqiao Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qingyuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F+L">Fu Lee Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yanghui Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.14094v2-abstract-short" style="display: inline;"> Cross-domain sentiment classification has been a hot spot these years, which aims to learn a reliable classifier using labeled data from a source domain and evaluate it on a target domain. In this vein, most approaches utilized domain adaptation that maps data from different domains into a common feature space. To further improve the model performance, several methods targeted to mine domain-speci&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14094v2-abstract-full').style.display = 'inline'; document.getElementById('2111.14094v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.14094v2-abstract-full" style="display: none;"> Cross-domain sentiment classification has been a hot spot these years, which aims to learn a reliable classifier using labeled data from a source domain and evaluate it on a target domain. In this vein, most approaches utilized domain adaptation that maps data from different domains into a common feature space. To further improve the model performance, several methods targeted to mine domain-specific information were proposed. However, most of them only utilized a limited part of domain-specific information. In this study, we first develop a method of extracting domain-specific words based on the topic information derived from topic models. Then, we propose a Topic Driven Adaptive Network (TDAN) for cross-domain sentiment classification. The network consists of two sub-networks: a semantics attention network and a domain-specific word attention network, the structures of which are based on transformers. These sub-networks take different forms of input and their outputs are fused as the feature vector. Experiments validate the effectiveness of our TDAN on sentiment classification across domains. Case studies also indicate that topic models have the potential to add value to cross-domain sentiment classification by discovering interpretable and low-dimensional subspaces. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14094v2-abstract-full').style.display = 'none'; document.getElementById('2111.14094v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.12530">arXiv:2109.12530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.12530">pdf</a>, <a href="https://arxiv.org/format/2109.12530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TPAMI.2021.3114428">10.1109/TPAMI.2021.3114428 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Structure-Preserving Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Cheng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.12530v1-abstract-short" style="display: inline;"> Structures matter in single image super-resolution (SISR). Benefiting from generative adversarial networks (GANs), recent studies have promoted the development of SISR by recovering photo-realistic images. However, there are still undesired structural distortions in the recovered images. In this paper, we propose a structure-preserving super-resolution (SPSR) method to alleviate the above issue wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.12530v1-abstract-full').style.display = 'inline'; document.getElementById('2109.12530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.12530v1-abstract-full" style="display: none;"> Structures matter in single image super-resolution (SISR). Benefiting from generative adversarial networks (GANs), recent studies have promoted the development of SISR by recovering photo-realistic images. However, there are still undesired structural distortions in the recovered images. In this paper, we propose a structure-preserving super-resolution (SPSR) method to alleviate the above issue while maintaining the merits of GAN-based methods to generate perceptual-pleasant details. Firstly, we propose SPSR with gradient guidance (SPSR-G) by exploiting gradient maps of images to guide the recovery in two aspects. On the one hand, we restore high-resolution gradient maps by a gradient branch to provide additional structure priors for the SR process. On the other hand, we propose a gradient loss to impose a second-order restriction on the super-resolved images, which helps generative networks concentrate more on geometric structures. Secondly, since the gradient maps are handcrafted and may only be able to capture limited aspects of structural information, we further extend SPSR-G by introducing a learnable neural structure extractor (NSE) to unearth richer local structures and provide stronger supervision for SR. We propose two self-supervised structure learning methods, contrastive prediction and solving jigsaw puzzles, to train the NSEs. Our methods are model-agnostic, which can be potentially used for off-the-shelf SR networks. Experimental results on five benchmark datasets show that the proposed methods outperform state-of-the-art perceptual-driven SR methods under LPIPS, PSNR, and SSIM metrics. Visual results demonstrate the superiority of our methods in restoring structures while generating natural SR images. Code is available at https://github.com/Maclory/SPSR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.12530v1-abstract-full').style.display = 'none'; document.getElementById('2109.12530v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by T-PAMI. Journal version of arXiv:2003.13081 (CVPR 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.01129">arXiv:2109.01129</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.01129">pdf</a>, <a href="https://arxiv.org/format/2109.01129">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shaohui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.01129v3-abstract-short" style="display: inline;"> In this work, we present a new multi-view depth estimation method that utilizes both conventional reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01129v3-abstract-full').style.display = 'inline'; document.getElementById('2109.01129v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.01129v3-abstract-full" style="display: none;"> In this work, we present a new multi-view depth estimation method that utilizes both conventional reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels in indoor scenes. The key to our approach is to utilize the learning-based priors to guide the optimization process of NeRF. Our system firstly adapts a monocular depth network over the target scene by finetuning on its sparse SfM+MVS reconstruction from COLMAP. Then, we show that the shape-radiance ambiguity of NeRF still exists in indoor environments and propose to address the issue by employing the adapted depth priors to monitor the sampling process of volume rendering. Finally, a per-pixel confidence map acquired by error computation on the rendered image can be used to further improve the depth quality. Experiments show that our proposed framework significantly outperforms state-of-the-art methods on indoor scenes, with surprising findings presented on the effectiveness of correspondence-based optimization and NeRF-based optimization over the adapted depth priors. In addition, we show that the guided optimization scheme does not sacrifice the original synthesis capability of neural radiance fields, improving the rendering quality on both seen and novel views. Code is available at https://github.com/weiyithu/NerfingMVS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01129v3-abstract-full').style.display = 'none'; document.getElementById('2109.01129v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ICCV 2021 (Oral). Project page: https://weiyithu.github.io/NerfingMVS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.08839">arXiv:2108.08839</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.08839">pdf</a>, <a href="https://arxiv.org/format/2108.08839">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PoinTr: Diverse Point Cloud Completion with Geometry-Aware Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.08839v1-abstract-short" style="display: inline;"> Point clouds captured in real-world applications are often incomplete due to the limited sensor resolution, single viewpoint, and occlusion. Therefore, recovering the complete point clouds from partial ones becomes an indispensable task in many practical applications. In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a ne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08839v1-abstract-full').style.display = 'inline'; document.getElementById('2108.08839v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.08839v1-abstract-full" style="display: none;"> Point clouds captured in real-world applications are often incomplete due to the limited sensor resolution, single viewpoint, and occlusion. Therefore, recovering the complete point clouds from partial ones becomes an indispensable task in many practical applications. In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a new model, called PoinTr that adopts a transformer encoder-decoder architecture for point cloud completion. By representing the point cloud as a set of unordered groups of points with position embeddings, we convert the point cloud to a sequence of point proxies and employ the transformers for point cloud generation. To facilitate transformers to better leverage the inductive bias about 3D geometric structures of point clouds, we further devise a geometry-aware block that models the local geometric relationships explicitly. The migration of transformers enables our model to better learn structural knowledge and preserve detailed information for point cloud completion. Furthermore, we propose two more challenging benchmarks with more diverse incomplete point clouds that can better reflect the real-world scenarios to promote future research. Experimental results show that our method outperforms state-of-the-art methods by a large margin on both the new benchmarks and the existing ones. Code is available at https://github.com/yuxumin/PoinTr <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08839v1-abstract-full').style.display = 'none'; document.getElementById('2108.08839v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2021 (Oral Presentation)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.08728">arXiv:2108.08728</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.08728">pdf</a>, <a href="https://arxiv.org/format/2108.08728">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Counterfactual Attention Learning for Fine-Grained Visual Categorization and Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.08728v2-abstract-short" style="display: inline;"> Attention mechanism has demonstrated great potential in fine-grained visual recognition tasks. In this paper, we present a counterfactual attention learning method to learn more effective attention based on causal inference. Unlike most existing methods that learn visual attention based on conventional likelihood, we propose to learn the attention with counterfactual causality, which provides a to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08728v2-abstract-full').style.display = 'inline'; document.getElementById('2108.08728v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.08728v2-abstract-full" style="display: none;"> Attention mechanism has demonstrated great potential in fine-grained visual recognition tasks. In this paper, we present a counterfactual attention learning method to learn more effective attention based on causal inference. Unlike most existing methods that learn visual attention based on conventional likelihood, we propose to learn the attention with counterfactual causality, which provides a tool to measure the attention quality and a powerful supervisory signal to guide the learning process. Specifically, we analyze the effect of the learned visual attention on network prediction through counterfactual intervention and maximize the effect to encourage the network to learn more useful attention for fine-grained image recognition. Empirically, we evaluate our method on a wide range of fine-grained recognition tasks where attention plays a crucial role, including fine-grained image categorization, person re-identification, and vehicle re-identification. The consistent improvement on all benchmarks demonstrates the effectiveness of our method. Code is available at https://github.com/raoyongming/CAL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08728v2-abstract-full').style.display = 'none'; document.getElementById('2108.08728v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.07797">arXiv:2108.07797</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.07797">pdf</a>, <a href="https://arxiv.org/format/2108.07797">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Group-aware Contrastive Regression for Action Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenliang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.07797v1-abstract-short" style="display: inline;"> Assessing action quality is challenging due to the subtle differences between videos and large variations in scores. Most existing approaches tackle this problem by regressing a quality score from a single video, suffering a lot from the large inter-video score variations. In this paper, we show that the relations among videos can provide important clues for more accurate action quality assessment&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07797v1-abstract-full').style.display = 'inline'; document.getElementById('2108.07797v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.07797v1-abstract-full" style="display: none;"> Assessing action quality is challenging due to the subtle differences between videos and large variations in scores. Most existing approaches tackle this problem by regressing a quality score from a single video, suffering a lot from the large inter-video score variations. In this paper, we show that the relations among videos can provide important clues for more accurate action quality assessment during both training and inference. Specifically, we reformulate the problem of action quality assessment as regressing the relative scores with reference to another video that has shared attributes (e.g., category and difficulty), instead of learning unreferenced scores. Following this formulation, we propose a new Contrastive Regression (CoRe) framework to learn the relative scores by pair-wise comparison, which highlights the differences between videos and guides the models to learn the key hints for assessment. In order to further exploit the relative information between two videos, we devise a group-aware regression tree to convert the conventional score regression into two easier sub-problems: coarse-to-fine classification and regression in small intervals. To demonstrate the effectiveness of CoRe, we conduct extensive experiments on three mainstream AQA datasets including AQA-7, MTL-AQA and JIGSAWS. Our approach outperforms previous methods by a large margin and establishes new state-of-the-art on all three benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07797v1-abstract-full').style.display = 'none'; document.getElementById('2108.07797v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.07794">arXiv:2108.07794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.07794">pdf</a>, <a href="https://arxiv.org/format/2108.07794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RandomRooms: Unsupervised Pre-training from Synthetic Shapes and Randomized Layouts for 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+Y">Yongming Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Benlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiwen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+C">Cho-Jui Hsieh</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.07794v1-abstract-short" style="display: inline;"> 3D point cloud understanding has made great progress in recent years. However, one major bottleneck is the scarcity of annotated real datasets, especially compared to 2D object detection tasks, since a large amount of labor is involved in annotating the real scans of a scene. A promising solution to this problem is to make better use of the synthetic dataset, which consists of CAD object models, t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07794v1-abstract-full').style.display = 'inline'; document.getElementById('2108.07794v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.07794v1-abstract-full" style="display: none;"> 3D point cloud understanding has made great progress in recent years. However, one major bottleneck is the scarcity of annotated real datasets, especially compared to 2D object detection tasks, since a large amount of labor is involved in annotating the real scans of a scene. A promising solution to this problem is to make better use of the synthetic dataset, which consists of CAD object models, to boost the learning on real datasets. This can be achieved by the pre-training and fine-tuning procedure. However, recent work on 3D pre-training exhibits failure when transfer features learned on synthetic objects to other real-world applications. In this work, we put forward a new method called RandomRooms to accomplish this objective. In particular, we propose to generate random layouts of a scene by making use of the objects in the synthetic CAD dataset and learn the 3D scene representation by applying object-level contrastive learning on two random scenes generated from the same set of synthetic objects. The model pre-trained in this way can serve as a better initialization when later fine-tuning on the 3D object detection task. Empirically, we show consistent improvement in downstream 3D detection tasks on several base models, especially when less training data are used, which strongly demonstrates the effectiveness and generalization of our method. Benefiting from the rich semantic knowledge and diverse objects from synthetic data, our method establishes the new state-of-the-art on widely-used 3D detection benchmarks ScanNetV2 and SUN RGB-D. We expect our attempt to provide a new perspective for bridging object and scene-level 3D understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07794v1-abstract-full').style.display = 'none'; document.getElementById('2108.07794v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.06113">arXiv:2108.06113</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.06113">pdf</a>, <a href="https://arxiv.org/format/2108.06113">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1117/1.JEI.30.5.053013">10.1117/1.JEI.30.5.053013 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> UMFA: A photorealistic style transfer method based on U-Net and multi-layer feature aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D+Y">D. Y. Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X+J">X. J. Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">H. Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kittler%2C+J">J. Kittler</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T+Y">T. Y. Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.06113v1-abstract-short" style="display: inline;"> In this paper, we propose a photorealistic style transfer network to emphasize the natural effect of photorealistic image stylization. In general, distortion of the image content and lacking of details are two typical issues in the style transfer field. To this end, we design a novel framework employing the U-Net structure to maintain the rich spatial clues, with a multi-layer feature aggregation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.06113v1-abstract-full').style.display = 'inline'; document.getElementById('2108.06113v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.06113v1-abstract-full" style="display: none;"> In this paper, we propose a photorealistic style transfer network to emphasize the natural effect of photorealistic image stylization. In general, distortion of the image content and lacking of details are two typical issues in the style transfer field. To this end, we design a novel framework employing the U-Net structure to maintain the rich spatial clues, with a multi-layer feature aggregation (MFA) method to simultaneously provide the details obtained by the shallow layers in the stylization processing. In particular, an encoder based on the dense block and a decoder form a symmetrical structure of U-Net are jointly staked to realize an effective feature extraction and image reconstruction. Besides, a transfer module based on MFA and &#34;adaptive instance normalization&#34; (AdaIN) is inserted in the skip connection positions to achieve the stylization. Accordingly, the stylized image possesses the texture of a real photo and preserves rich content details without introducing any mask or post-processing steps. The experimental results on public datasets demonstrate that our method achieves a more faithful structural similarity with a lower style loss, reflecting the effectiveness and merit of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.06113v1-abstract-full').style.display = 'none'; document.getElementById('2108.06113v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Rao%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10