Search | arXiv e-print repository
Showing 1–50 of 54 results for author: Lai, B

Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Lai%2C+B
1. arXiv:2502.13389 [pdf, other] (cs.AI)

   Reasoning with Reinforced Functional Token Tuning

   Authors: Kongcheng Zhang, Qi Yao, Baisheng Lai, Jiaxing Huang, Wenkai Fang, Dacheng Tao, Mingli Song, Shunyu Liu

   Abstract: In this work, we propose Reinforced Functional Token Tuning (RFTT), a novel reinforced fine-tuning framework that empowers Large Language Models (LLMs) with self-play learn-to-reason capabilities. Unlike prior prompt-driven reasoning efforts, RFTT embeds a rich set of learnable functional tokens (e.g., <analyze>, <verify>, <refine>) directly into the model vocabulary, enabling chain-of-thought construction with diverse human-like reasoning behaviors. Specifically, RFTT comprises two phases: (1) supervised fine-tuning performs prompt-driven tree search to obtain self-generated training data annotated with functional tokens, which warms up the model to learn these tokens for reasoning; and (2) online reinforcement learning further allows the model to explore different reasoning pathways through functional token sampling without relying on prompts, thereby facilitating effective self-improvement for functional reasoning. Extensive experiments demonstrate the superiority of the proposed RFTT on mathematical benchmarks, significantly boosting Qwen-2.5-7B-Instruct (70.6% to 79.8%) and LLaMA-3.1-8B-Instruct (32.2% to 60.2%) on the MATH dataset. Moreover, the performance of RFTT consistently improves with more search rollouts at inference time. Our code is available at https://github.com/sastpg/RFTT.

   Submitted 18 February, 2025; originally announced February 2025.
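The vocabulary step this abstract describes (embedding learnable functional tokens directly into the model vocabulary) can be sketched with the Hugging Face transformers API. This is a minimal illustration under assumptions, not the authors' implementation (that is at the linked repository); the model name is one of the models the abstract reports on, and the token list covers only the three examples it gives.

```python
# Minimal sketch of registering learnable functional tokens, as in the
# RFTT abstract. Not the authors' code; see https://github.com/sastpg/RFTT.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"  # one of the models evaluated in the paper
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Only the three example tokens named in the abstract; the full set is larger.
functional_tokens = ["<analyze>", "<verify>", "<refine>"]
num_added = tokenizer.add_tokens(functional_tokens, special_tokens=True)

# Grow the embedding matrix so each new token gets a trainable row.
model.resize_token_embeddings(len(tokenizer))
print(f"Added {num_added} functional tokens; vocab size is now {len(tokenizer)}.")
```

The new embedding rows start as random vectors; the supervised fine-tuning phase the abstract describes is what gives them meaning.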
2. arXiv:2501.04336 [pdf, other] (cs.CV)

   Building a Mind Palace: Structuring Environment-Grounded Semantic Graphs for Effective Long Video Analysis with LLMs

   Authors: Zeyi Huang, Yuyang Ji, Xiaofang Wang, Nikhil Mehta, Tong Xiao, Donghyun Lee, Sigmund Vanvalkenburgh, Shengxin Zha, Bolin Lai, Licheng Yu, Ning Zhang, Yong Jae Lee, Miao Liu

   Abstract: Long-form video understanding with Large Vision Language Models is challenged by the need to analyze temporally dispersed yet spatially concentrated key moments within limited context windows. In this work, we introduce VideoMindPalace, a new framework inspired by the "Mind Palace", which organizes critical video moments into a topologically structured semantic graph. VideoMindPalace organizes key information through (i) hand-object tracking and interaction, (ii) clustered activity zones representing specific areas of recurring activities, and (iii) environment layout mapping, allowing natural language parsing by LLMs to provide grounded insights on spatio-temporal and 3D context. In addition, we propose the Video MindPalace Benchmark (VMB) to assess human-like reasoning, including spatial localization, temporal reasoning, and layout-aware sequential understanding. Evaluated on VMB and established video QA datasets, including EgoSchema, NExT-QA, IntentQA, and the Active Memories Benchmark, VideoMindPalace demonstrates notable gains in spatio-temporal coherence and human-aligned reasoning, advancing long-form video analysis capabilities in VLMs.

   Submitted 8 January, 2025; originally announced January 2025.

3. arXiv:2501.00243 [pdf, other] (cs.CV)

   Cross-Layer Cache Aggregation for Token Reduction in Ultra-Fine-Grained Image Recognition

   Authors: Edwin Arkel Rios, Jansen Christopher Yuanda, Vincent Leon Ghanz, Cheng-Wei Yu, Bo-Cheng Lai, Min-Chun Hu

   Abstract: Ultra-fine-grained image recognition (UFGIR) is a challenging task that involves classifying images within a macro-category. While traditional FGIR deals with classifying different species, UFGIR goes beyond by classifying sub-categories within a species, such as cultivars of a plant. In recent times the usage of Vision Transformer-based backbones has allowed methods to obtain outstanding recognition performance in this task, but this comes at a significant computational cost, especially since the task benefits from incorporating higher-resolution images. Techniques such as token reduction have therefore emerged to reduce the computational cost. However, dropping tokens leads to loss of essential information for fine-grained categories, especially as the token keep rate is reduced. Therefore, to counteract the loss of information brought by token reduction, we propose a novel Cross-Layer Aggregation Classification Head and a Cross-Layer Cache mechanism to recover and access information from previous layers in later locations. Extensive experiments covering more than 2000 runs across diverse settings, including 5 datasets, 9 backbones, 7 token reduction methods, 5 keep rates, and 2 image sizes, demonstrate the effectiveness of the proposed plug-and-play modules and allow us to push the boundaries of accuracy vs. cost for UFGIR by reducing the kept tokens to ratios as low as 10% while maintaining accuracy competitive with state-of-the-art models. Code is available at: https://github.com/arkel23/CLCA

   Submitted 30 December, 2024; originally announced January 2025.

   Comments: Accepted to ICASSP 2025. Main: 5 pages, 4 figures, 1 table
   ACM Class: I.2; I.4
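As a rough picture of what entry 3 describes (tokens dropped at a keep rate, with a cache that lets a later stage recover earlier-layer information), here is a hedged sketch: the keep-by-CLS-similarity criterion, the cache contents, and the mean aggregation are all assumptions chosen for illustration, not the paper's actual CLCA design (see the linked repository for that).

```python
# Hedged sketch of token reduction plus a cross-layer cache: each layer keeps
# only the top-k tokens (scored here by CLS similarity, one common criterion)
# and caches its CLS token so a final head can aggregate across layers.
# Illustrative only; the real design is at https://github.com/arkel23/CLCA.
import torch

def keep_top_tokens(tokens: torch.Tensor, scores: torch.Tensor, keep_rate: float):
    """tokens: (B, N, D) patch tokens; scores: (B, N) importance scores."""
    k = max(1, int(tokens.shape[1] * keep_rate))
    idx = scores.topk(k, dim=1).indices                       # (B, k)
    idx = idx.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])  # (B, k, D)
    return tokens.gather(1, idx)

B, N, D, num_layers = 2, 196, 384, 12
cls = torch.randn(B, 1, D)
patches = torch.randn(B, N, D)
cache = []
for _ in range(num_layers):
    # ... a transformer block would update cls and patches here ...
    scores = (patches @ cls.transpose(1, 2)).squeeze(-1)  # CLS-patch similarity
    patches = keep_top_tokens(patches, scores, keep_rate=0.7)
    cache.append(cls.squeeze(1))                          # cache per-layer CLS

# Cross-layer aggregation (a plain mean here) feeding the classification head.
aggregated = torch.stack(cache, dim=1).mean(dim=1)        # (B, D)
```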
4. arXiv:2412.04317 [pdf, other] (cs.CV)

   FlashSloth: Lightning Multimodal Large Language Models via Embedded Visual Compression

   Authors: Bo Tong, Bokai Lai, Yiyi Zhou, Gen Luo, Yunhang Shen, Ke Li, Xiaoshuai Sun, Rongrong Ji

   Abstract: Despite a big leap forward in capability, multimodal large language models (MLLMs) tend to behave like a sloth in practical use, i.e., slow response and large latency. Recent efforts are devoted to building tiny MLLMs for better efficiency, but the plethora of visual tokens they still use limits their actual speedup. In this paper, we propose a powerful and fast tiny MLLM called FlashSloth. Different from previous efforts, FlashSloth focuses on improving the descriptive power of visual tokens in the process of compressing their redundant semantics. In particular, FlashSloth introduces embedded visual compression designs to capture both visually salient and instruction-related image information, so as to achieve superior multimodal performance with fewer visual tokens. Extensive experiments are conducted to validate the proposed FlashSloth, which is also comprehensively compared against a set of tiny but strong MLLMs, e.g., InternVL2, MiniCPM-V2 and Qwen2-VL. The experimental results show that compared with these advanced tiny MLLMs, our FlashSloth can greatly reduce the number of visual tokens, training memory and computational complexity while retaining high performance on various VL tasks.

   Submitted 5 December, 2024; originally announced December 2024.
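The "fewer visual tokens" idea in the FlashSloth entry is commonly realized by letting a small set of learnable queries cross-attend over the patch features. The sketch below shows that generic recipe, not FlashSloth's specific embedded visual compression design; all dimensions are assumed.

```python
# Generic sketch of visual token compression via learnable queries
# cross-attending over image patch features (a common recipe; not the
# specific embedded visual compression used by FlashSloth).
import torch
import torch.nn as nn

class QueryCompressor(nn.Module):
    def __init__(self, dim: int = 768, num_queries: int = 64, num_heads: int = 8):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(1, num_queries, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, patch_tokens: torch.Tensor) -> torch.Tensor:
        """patch_tokens: (B, N, D) -> compressed: (B, num_queries, D)."""
        q = self.queries.expand(patch_tokens.shape[0], -1, -1)
        compressed, _ = self.attn(q, patch_tokens, patch_tokens)
        return compressed

# 576 patch tokens (e.g. a 24x24 grid) squeezed down to 64 visual tokens.
x = torch.randn(2, 576, 768)
print(QueryCompressor()(x).shape)  # torch.Size([2, 64, 768])
```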
5. arXiv:2412.01027 [pdf, other] (cs.CV)

   Unleashing In-context Learning of Autoregressive Models for Few-shot Image Manipulation

   Authors: Bolin Lai, Felix Juefei-Xu, Miao Liu, Xiaoliang Dai, Nikhil Mehta, Chenguang Zhu, Zeyi Huang, James M. Rehg, Sangmin Lee, Ning Zhang, Tong Xiao

   Abstract: Text-guided image manipulation has experienced notable advancement in recent years. In order to mitigate linguistic ambiguity, few-shot learning with visual examples has been applied for instructions that are underrepresented in the training set, or difficult to describe purely in language. However, learning from visual prompts requires strong reasoning capability, which diffusion models struggle with. To address this issue, we introduce a novel multi-modal autoregressive model, dubbed InstaManip, that can instantly learn a new image manipulation operation from textual and visual guidance via in-context learning, and apply it to new query images. Specifically, we propose an innovative group self-attention mechanism to break down the in-context learning process into two separate stages -- learning and applying, which simplifies the complex problem into two easier tasks. We also introduce a relation regularization method to further disentangle image transformation features from irrelevant contents in exemplar images. Extensive experiments suggest that our method surpasses previous few-shot image manipulation models by a notable margin (at least 19% in human evaluation). We also find our model can be further boosted by increasing the number or diversity of exemplar images.

   Submitted 2 December, 2024; v1 submitted 1 December, 2024; originally announced December 2024.

   Comments: 18 pages, 16 figures, 5 tables

6. arXiv:2410.14045 [pdf, other] (cs.CV, cs.LG)

   Human Action Anticipation: A Survey

   Authors: Bolin Lai, Sam Toyer, Tushar Nagarajan, Rohit Girdhar, Shengxin Zha, James M. Rehg, Kris Kitani, Kristen Grauman, Ruta Desai, Miao Liu

   Abstract: Predicting future human behavior is an increasingly popular topic in computer vision, driven by the interest in applications such as autonomous vehicles, digital assistants and human-robot interactions. The literature on behavior prediction spans various tasks, including action anticipation, activity forecasting, intent prediction, goal prediction, and so on. Our survey aims to tie together this fragmented literature, covering recent technical innovations as well as the development of new large-scale datasets for model training and evaluation. We also summarize the widely-used metrics for different tasks and provide a comprehensive performance comparison of existing approaches on eleven action anticipation datasets. This survey serves not only as a reference for contemporary methodologies in action anticipation, but also as a guideline for future research directions in this evolving landscape.

   Submitted 17 October, 2024; originally announced October 2024.

   Comments: 30 pages, 9 figures, 12 tables
7. arXiv:2409.15316 [pdf, other] (cs.HC)

   Towards Social AI: A Survey on Understanding Social Interactions

   Authors: Sangmin Lee, Minzhi Li, Bolin Lai, Wenqi Jia, Fiona Ryan, Xu Cao, Ozgur Kara, Bikram Boote, Weiyan Shi, Diyi Yang, James M. Rehg

   Abstract: Social interactions form the foundation of human societies. Artificial intelligence has made significant progress in certain areas, but enabling machines to seamlessly understand social interactions remains an open challenge. It is important to address this gap by endowing machines with social capabilities. We identify three key capabilities needed for effective social understanding: 1) understanding multimodal social cues, 2) understanding multi-party dynamics, and 3) understanding beliefs. Building upon these foundations, we classify and review existing machine learning works on social understanding from the perspectives of verbal, non-verbal, and multimodal social cues. The verbal branch focuses on understanding linguistic signals such as speaker intent, dialogue sentiment, and commonsense reasoning. The non-verbal branch addresses techniques for perceiving social meaning from visual behaviors such as body gestures, gaze patterns, and facial expressions. The multimodal branch covers approaches that integrate verbal and non-verbal multimodal cues to holistically interpret social interactions such as recognizing emotions, conversational dynamics, and social situations. By reviewing the scope and limitations of current approaches and benchmarks, we aim to clarify the development trajectory and illuminate the path towards more comprehensive intelligence for social understanding. We hope this survey will spur further research interest and insights into this area.

   Submitted 30 September, 2024; v1 submitted 5 September, 2024; originally announced September 2024.

8. arXiv:2409.11051 [pdf, other] (cs.CV)

   Down-Sampling Inter-Layer Adapter for Parameter and Computation Efficient Ultra-Fine-Grained Image Recognition

   Authors: Edwin Arkel Rios, Femiloye Oyerinde, Min-Chun Hu, Bo-Cheng Lai

   Abstract: Ultra-fine-grained image recognition (UFGIR) categorizes objects with extremely small differences between classes, such as distinguishing between cultivars within the same species, as opposed to species-level classification in fine-grained image recognition (FGIR). The difficulty of this task is exacerbated by the scarcity of samples per category. To tackle these challenges we introduce a novel approach employing down-sampling inter-layer adapters in a parameter-efficient setting, where the backbone parameters are frozen and we only fine-tune a small set of additional modules. By integrating dual-branch down-sampling, we significantly reduce the number of parameters and floating-point operations (FLOPs) required, making our method highly efficient. Comprehensive experiments on ten datasets demonstrate that our approach obtains outstanding accuracy-cost performance, highlighting its potential for practical applications in resource-constrained environments. In particular, our method increases the average accuracy by at least 6.8% compared to other methods in the parameter-efficient setting, while requiring at least 123x fewer trainable parameters than current state-of-the-art UFGIR methods and reducing FLOPs by 30% on average compared to other methods.

   Submitted 17 September, 2024; originally announced September 2024.

   Comments: Accepted to ECCV 2024 Workshop on Efficient Deep Learning for Foundation Models (EFM). Main: 13 pages, 3 figures, 2 tables. Appendix: 3 pages, 1 table. Total: 16 pages, 3 figures, 4 tables
   MSC Class: I.2; I.4
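The parameter-efficient setup in entry 8 (frozen backbone, a small set of fine-tuned modules, down-sampling to cut FLOPs) follows a familiar adapter pattern. The sketch below is a generic single-branch version under assumed dimensions, not the paper's dual-branch design.

```python
# Generic bottleneck adapter with spatial down-sampling, illustrating the
# frozen-backbone / small-trainable-module setup the abstract describes.
# Dimensions and structure are assumptions, not the paper's dual-branch design.
import torch
import torch.nn as nn

class DownSamplingAdapter(nn.Module):
    def __init__(self, dim: int = 768, bottleneck: int = 64, pool: int = 2):
        super().__init__()
        self.pool = nn.AvgPool1d(kernel_size=pool, stride=pool)  # fewer tokens
        self.down = nn.Linear(dim, bottleneck)                   # fewer channels
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        """tokens: (B, N, D) -> (B, N // pool, D)."""
        x = self.pool(tokens.transpose(1, 2)).transpose(1, 2)
        return x + self.up(self.act(self.down(x)))  # residual bottleneck

# Typical parameter-efficient setup: freeze the backbone, train adapters only.
backbone = nn.Identity()  # stand-in for a frozen ViT
for p in backbone.parameters():
    p.requires_grad = False
adapter = DownSamplingAdapter()
print(adapter(torch.randn(2, 196, 768)).shape)  # torch.Size([2, 98, 768])
```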
9. arXiv:2407.12891 [pdf, other] (cs.CV)

   Global-Local Similarity for Efficient Fine-Grained Image Recognition with Vision Transformers

   Authors: Edwin Arkel Rios, Min-Chun Hu, Bo-Cheng Lai

   Abstract: Fine-grained recognition involves the classification of images from subordinate macro-categories, and it is challenging due to small inter-class differences. To overcome this, most methods perform discriminative feature selection enabled by a feature extraction backbone followed by a high-level feature refinement step. Recently, many studies have shown the potential of vision transformers as a backbone for fine-grained recognition, but using their attention mechanism to select discriminative tokens can be computationally expensive. In this work, we propose a novel and computationally inexpensive metric to identify discriminative regions in an image. We compare the similarity between the global representation of an image given by the CLS token, a learnable token used by transformers for classification, and the local representation of individual patches. We select the regions with the highest similarity to obtain crops, which are forwarded through the same transformer encoder. Finally, high-level features of the original and cropped representations are further refined together in order to make more robust predictions. Through extensive experimental evaluation we demonstrate the effectiveness of our proposed method, obtaining favorable results in terms of accuracy across a variety of datasets. Furthermore, our method achieves these results at a much lower computational cost compared to the alternatives. Code and checkpoints are available at: https://github.com/arkel23/GLSim

   Submitted 17 July, 2024; originally announced July 2024.

   Comments: Main: 12 pages, 5 figures, 5 tables. Appendix: 9 pages, 9 figures, 10 tables. Total: 21 pages, 14 figures, 15 tables
   ACM Class: I.2; I.4
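The selection metric in the GLSim abstract, similarity between the CLS token's global representation and each patch's local representation, with the highest-similarity regions kept as crops, reduces to a few lines of tensor code. A minimal sketch under assumed shapes; the released code is at the linked repository.

```python
# Minimal sketch of the CLS-to-patch similarity selection the GLSim
# abstract describes: score patches by cosine similarity to the global
# CLS representation and keep the top-k as candidate crop regions.
# Shapes are assumed; see https://github.com/arkel23/GLSim for the real code.
import torch
import torch.nn.functional as F

def select_discriminative_patches(cls_token, patch_tokens, k=8):
    """cls_token: (B, D); patch_tokens: (B, N, D) -> indices (B, k)."""
    sim = F.cosine_similarity(cls_token.unsqueeze(1), patch_tokens, dim=-1)  # (B, N)
    return sim.topk(k, dim=1).indices

cls_token = torch.randn(2, 384)
patch_tokens = torch.randn(2, 196, 384)
idx = select_discriminative_patches(cls_token, patch_tokens)
print(idx.shape)  # torch.Size([2, 8]); map indices back to the 14x14 grid for crops
```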
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12891v1-abstract-full').style.display = 'none'; document.getElementById('2407.12891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Main: 12 pages, 5 figures, 5 tables. Appendix: 9 pages, 9 figures, 10 tables. Total: 21 pages, 14 figures, 15 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17126">arXiv:2406.17126</a> <span> [<a href="https://arxiv.org/pdf/2406.17126">pdf</a>, <a href="https://arxiv.org/format/2406.17126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MM-SpuBench: Towards Better Understanding of Spurious Biases in Multimodal LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+G">Guangtao Zheng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunsheng Ma</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. Rehg</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aidong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17126v1-abstract-short" style="display: inline;"> Spurious bias, a tendency to use spurious correlations between non-essential input attributes and target variables for predictions, has revealed a severe robustness pitfall in deep learning models trained on single modality data. Multimodal Large Language Models (MLLMs), which integrate both vision and language models, have demonstrated strong capability in joint vision-language understanding. How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17126v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17126v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17126v1-abstract-full" style="display: none;"> Spurious bias, a tendency to use spurious correlations between non-essential input attributes and target variables for predictions, has revealed a severe robustness pitfall in deep learning models trained on single modality data. Multimodal Large Language Models (MLLMs), which integrate both vision and language models, have demonstrated strong capability in joint vision-language understanding. However, whether spurious biases are prevalent in MLLMs remains under-explored. 
We mitigate this gap by analyzing the spurious biases in a multimodal setting, uncovering the specific test data patterns that can manifest this problem when biases in the vision model cascade into the alignment between visual and text tokens in MLLMs. To better understand this problem, we introduce MM-SpuBench, a comprehensive visual question-answering (VQA) benchmark designed to evaluate MLLMs' reliance on nine distinct categories of spurious correlations from five open-source image datasets. The VQA dataset is built from human-understandable concept information (attributes). Leveraging this benchmark, we conduct a thorough evaluation of current state-of-the-art MLLMs. Our findings illuminate the persistence of the reliance on spurious correlations from these models and underscore the urge for new methodologies to mitigate spurious biases. To support the MLLM robustness research, we release our VQA benchmark at https://huggingface.co/datasets/mmbench/MM-SpuBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17126v1-abstract-full').style.display = 'none'; document.getElementById('2406.17126v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10424">arXiv:2406.10424</a> <span> [<a href="https://arxiv.org/pdf/2406.10424">pdf</a>, <a href="https://arxiv.org/format/2406.10424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> What is the Visual Cognition Gap between Humans and Multimodal LLMs? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunsheng Ma</a>, <a href="/search/cs?searchtype=author&query=Heintz%2C+J">Joerg Heintz</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jintai Chen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jianguo Cao</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. Rehg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10424v1-abstract-short" style="display: inline;"> Recently, Multimodal Large Language Models (MLLMs) have shown great promise in language-guided perceptual tasks such as recognition, segmentation, and object detection. However, their effectiveness in addressing visual cognition problems that require high-level reasoning is not well-established. 
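Since entry 10 releases its benchmark on the Hugging Face Hub, an evaluation would presumably start by loading it. A sketch assuming the repository loads with the standard datasets API; split and column names are not documented here, so inspect them before relying on any.

```python
# Hedged sketch: loading the released MM-SpuBench VQA benchmark with the
# standard datasets library. The repo id comes from the abstract; whether it
# loads this way, and what splits/columns it exposes, are assumptions.
from datasets import load_dataset

ds = load_dataset("mmbench/MM-SpuBench")
print(ds)  # inspect available splits and columns before relying on names
```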
11. arXiv:2406.10424 [pdf, other] (cs.CV, cs.AI)

    What is the Visual Cognition Gap between Humans and Multimodal LLMs?

    Authors: Xu Cao, Bolin Lai, Wenqian Ye, Yunsheng Ma, Joerg Heintz, Jintai Chen, Jianguo Cao, James M. Rehg

    Abstract: Recently, Multimodal Large Language Models (MLLMs) have shown great promise in language-guided perceptual tasks such as recognition, segmentation, and object detection. However, their effectiveness in addressing visual cognition problems that require high-level reasoning is not well-established. One such challenge is abstract visual reasoning (AVR) -- the cognitive ability to discern relationships among patterns in a set of images and extrapolate to predict subsequent patterns. This skill is crucial during the early neurodevelopmental stages of children. Inspired by the AVR tasks in Raven's Progressive Matrices (RPM) and the Wechsler Intelligence Scale for Children (WISC), we propose a new dataset, MaRs-VQA, and a new benchmark, VCog-Bench, containing three datasets to evaluate the zero-shot AVR capability of MLLMs and compare their performance with existing human intelligence investigations. Our comparative experiments with different open-source and closed-source MLLMs on the VCog-Bench revealed a gap between MLLMs and human intelligence, highlighting the visual cognitive limitations of current MLLMs. We believe that the public release of VCog-Bench, consisting of MaRs-VQA, and the inference pipeline will drive progress toward the next generation of MLLMs with human-like visual cognition abilities.

    Submitted 14 June, 2024; originally announced June 2024.

    Comments: 14 pages, 4 figures, the appendix will be updated soon
    MSC Class: 68T01
arXiv:2403.12999 (https://arxiv.org/abs/2403.12999) [pdf] - cs.RO (Robotics), cs.AI (Artificial Intelligence), cs.CL (Computation and Language), cs.LG (Machine Learning)

Title: Prompt Selection and Augmentation for Few Examples Code Generation in Large Language Model and its Application in Robotics Control

Authors: On Tai Wu, Frodo Kin Sun Chan, Zunhao Zhang, Yan Nei Law, Benny Drescher, Edmond Shiao Bun Lai

Abstract: Few-shot prompting and step-by-step reasoning have enhanced the capabilities of Large Language Models (LLMs) in tackling complex tasks, including code generation. In this paper, we introduce a prompt selection and augmentation algorithm aimed at improving mathematical reasoning and robot arm operations. Our approach incorporates a multi-stage example augmentation scheme combined with an example selection scheme. This algorithm improves LLM performance by selecting a set of examples that increase diversity, minimize redundancy, and increase relevance to the question. When combined with Program-of-Thought prompting, our algorithm demonstrates an improvement in performance on the GSM8K and SVAMP benchmarks, with increases of 0.3% and 1.1% respectively. Furthermore, in simulated tabletop environments, our algorithm surpasses the Code-as-Policies approach by achieving a 3.4% increase in successful task completions and a decrease of over 70% in the number of examples used. Its ability to discard examples that contribute little to solving the problem reduces the inference time of an LLM-powered robotics system. This algorithm also offers important benefits for industrial process automation by streamlining the development and deployment process, reducing manual programming effort, and enhancing code reusability.

Submitted 11 March, 2024; originally announced March 2024.

Comments: 17 pages, 4 figures
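The selection objective in this abstract (more diversity and relevance, less redundancy) can be approximated by a simple greedy, embedding-based picker. The scoring weights and the cosine-similarity heuristic below are illustrative assumptions, not the authors' algorithm.

```python
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def select_examples(question_vec, example_vecs, k=4, w_rel=1.0, w_red=0.5):
    """Greedy selection: each step adds the example most relevant to the
    question, penalized by its similarity to examples already chosen.
    (A stand-in for the paper's selection scheme, not a reproduction.)"""
    chosen, candidates = [], list(range(len(example_vecs)))
    while candidates and len(chosen) < k:
        def score(i):
            rel = cosine(example_vecs[i], question_vec)
            red = max((cosine(example_vecs[i], example_vecs[j]) for j in chosen),
                      default=0.0)
            return w_rel * rel - w_red * red
        best = max(candidates, key=score)
        chosen.append(best)
        candidates.remove(best)
    return chosen
```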
arXiv:2403.04430 (https://arxiv.org/abs/2403.04430) [pdf, other] - cs.LG (Machine Learning), cs.DC (Distributed, Parallel, and Cluster Computing), cs.NI (Networking and Internet Architecture)

Title: On-demand Quantization for Green Federated Generative Diffusion in Mobile Edge Networks

Authors: Bingkun Lai, Jiayi He, Jiawen Kang, Gaolei Li, Minrui Xu, Tao Zhang, Shengli Xie

Abstract: Generative Artificial Intelligence (GAI) shows remarkable productivity and creativity in mobile edge networks, such as the metaverse and the Industrial Internet of Things. Federated learning is a promising technique for effectively training GAI models in mobile edge networks due to its data distribution. However, there is a notable issue with communication consumption when training large GAI models like generative diffusion models in mobile edge networks. Additionally, the substantial energy consumption associated with training diffusion-based models, along with the limited resources of edge devices and the complexities of network environments, poses challenges for improving the training efficiency of GAI models. To address this challenge, we propose an on-demand quantized energy-efficient federated diffusion approach for mobile edge networks. Specifically, we first design a dynamic quantized federated diffusion training scheme that considers the various demands of edge devices. Then, we study an energy efficiency problem based on specific quantization requirements. Numerical results show that our proposed method significantly reduces system energy consumption and transmitted model size compared to both baseline federated diffusion and fixed quantized federated diffusion methods, while effectively maintaining reasonable quality and diversity of the generated data.

Submitted 7 March, 2024; originally announced March 2024.
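A minimal sketch of the bit-width-adaptive update quantization that such a scheme rests on: each client uniformly quantizes its model delta to a requested bit-width before upload. The uniform quantizer and function names are my own illustrative choices, not the paper's training scheme.

```python
import numpy as np

def quantize_update(delta, bits):
    """Uniformly quantize a weight update to `bits` bits per value.
    Returns integer codes plus the (scale, offset) needed to dequantize."""
    lo, hi = float(delta.min()), float(delta.max())
    levels = (1 << bits) - 1
    scale = (hi - lo) / levels if hi > lo else 1.0
    codes = np.round((delta - lo) / scale).astype(np.uint32)
    return codes, scale, lo

def dequantize_update(codes, scale, lo):
    return codes.astype(np.float32) * scale + lo

# e.g. a client with a tight uplink budget might request 4 bits,
# a well-provisioned one 8 bits -- the "on-demand" part.
delta = np.random.randn(1000).astype(np.float32)
codes, scale, lo = quantize_update(delta, bits=4)
recovered = dequantize_update(codes, scale, lo)
print("max abs error:", np.abs(recovered - delta).max())
```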
arXiv:2403.02090 (https://arxiv.org/abs/2403.02090) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition), cs.CL (Computation and Language), cs.LG (Machine Learning)

Title: Modeling Multimodal Social Interactions: New Challenges and Baselines with Densely Aligned Representations

Authors: Sangmin Lee, Bolin Lai, Fiona Ryan, Bikram Boote, James M. Rehg

Abstract: Understanding social interactions involving both verbal and non-verbal cues is essential for effectively interpreting social situations. However, most prior works on multimodal social cues focus predominantly on single-person behaviors or rely on holistic visual representations that are not aligned to utterances in multi-party environments. Consequently, they are limited in modeling the intricate dynamics of multi-party interactions. In this paper, we introduce three new challenging tasks to model the fine-grained dynamics between multiple people: speaking target identification, pronoun coreference resolution, and mentioned player prediction. We contribute extensive data annotations to curate these new challenges in social deduction game settings. Furthermore, we propose a novel multimodal baseline that leverages densely aligned language-visual representations by synchronizing visual features with their corresponding utterances. This facilitates concurrently capturing verbal and non-verbal cues pertinent to social reasoning. Experiments demonstrate the effectiveness of the proposed approach with densely aligned multimodal representations in modeling fine-grained social interactions. Project website: https://sangmin-git.github.io/projects/MMSI.

Submitted 29 April, 2024; v1 submitted 4 March, 2024; originally announced March 2024.

Comments: CVPR 2024 Oral
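One plausible reading of "synchronizing visual features with their corresponding utterances" is to pool the frame features that fall inside each utterance's time span. The sketch below does exactly that under assumed shapes and field names; it is not the authors' released code.

```python
import numpy as np

def align_features_to_utterances(frame_feats, fps, utterances):
    """frame_feats: (T, D) per-frame visual features at `fps` frames/sec.
    utterances: list of dicts with assumed keys 'start' and 'end' (seconds).
    Returns one mean-pooled visual vector per utterance."""
    aligned = []
    for utt in utterances:
        i0 = int(utt["start"] * fps)
        i1 = max(i0 + 1, int(utt["end"] * fps))
        aligned.append(frame_feats[i0:i1].mean(axis=0))
    return np.stack(aligned)  # (num_utterances, D)
```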
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.08910">arXiv:2402.08910</a> <span> [<a href="https://arxiv.org/pdf/2402.08910">pdf</a>, <a href="https://arxiv.org/format/2402.08910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning-based Bone Quality Classification Method for Spinal Metastasis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+S">Shiqi Peng</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+G">Guangyu Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ya Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan-Feng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hui Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.08910v1-abstract-short" style="display: inline;"> Spinal metastasis is the most common disease in bone metastasis and may cause pain, instability and neurological injuries. Early detection of spinal metastasis is critical for accurate staging and optimal treatment. The diagnosis is usually facilitated with Computed Tomography (CT) scans, which requires considerable efforts from well-trained radiologists. In this paper, we explore a learning-based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.08910v1-abstract-full').style.display = 'inline'; document.getElementById('2402.08910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.08910v1-abstract-full" style="display: none;"> Spinal metastasis is the most common disease in bone metastasis and may cause pain, instability and neurological injuries. Early detection of spinal metastasis is critical for accurate staging and optimal treatment. The diagnosis is usually facilitated with Computed Tomography (CT) scans, which requires considerable efforts from well-trained radiologists. In this paper, we explore a learning-based automatic bone quality classification method for spinal metastasis based on CT images. We simultaneously take the posterolateral spine involvement classification task into account, and employ multi-task learning (MTL) technique to improve the performance. MTL acts as a form of inductive bias which helps the model generalize better on each task by sharing representations between related tasks. Based on the prior knowledge that the mixed type can be viewed as both blastic and lytic, we model the task of bone quality classification as two binary classification sub-tasks, i.e., whether blastic and whether lytic, and leverage a multiple layer perceptron to combine their predictions. 
To make the model more robust and generalize better, self-paced learning is adopted to gradually introduce samples, from easy to more complex, into the training process. The proposed learning-based method is evaluated on a proprietary spinal metastasis CT dataset. Our method significantly outperforms a 121-layer DenseNet classifier, improving sensitivity by +12.54%, +7.23% and +29.06% for blastic, mixed and lytic lesions at slice level, and by +12.33%, +23.21% and +34.25% at vertebrae level.

Submitted 13 February, 2024; originally announced February 2024.
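The decomposition described above (a "whether blastic" head and a "whether lytic" head whose outputs a small MLP fuses into the final bone-quality label) might look roughly like this in PyTorch; the backbone stand-in and layer sizes are illustrative assumptions.

```python
import torch
import torch.nn as nn

class BoneQualityNet(nn.Module):
    """Shared encoder, two binary heads (blastic / lytic), and an MLP
    combiner producing a 3-way prediction (blastic / lytic / mixed).
    Sizes are illustrative, not the paper's configuration."""
    def __init__(self, feat_dim=512):
        super().__init__()
        self.encoder = nn.Sequential(  # stand-in for a CNN backbone
            nn.Flatten(), nn.LazyLinear(feat_dim), nn.ReLU())
        self.blastic_head = nn.Linear(feat_dim, 1)
        self.lytic_head = nn.Linear(feat_dim, 1)
        self.combiner = nn.Sequential(
            nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 3))

    def forward(self, x):
        h = self.encoder(x)
        blastic = self.blastic_head(h)   # logit: is the lesion blastic?
        lytic = self.lytic_head(h)       # logit: is the lesion lytic?
        quality = self.combiner(torch.cat([blastic, lytic], dim=1))
        return blastic, lytic, quality
```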
arXiv:2402.08892 (https://arxiv.org/abs/2402.08892) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition), cs.LG (Machine Learning)

Title: Weakly Supervised Segmentation of Vertebral Bodies with Iterative Slice-propagation

Authors: Shiqi Peng, Bolin Lai, Guangyu Yao, Xiaoyun Zhang, Ya Zhang, Yan-Feng Wang, Hui Zhao

Abstract: Vertebral body (VB) segmentation is an important preliminary step towards medical visual diagnosis for spinal diseases. However, most previous works require pixel/voxel-wise strong supervision, which is expensive, tedious and time-consuming for experts to annotate. In this paper, we propose a Weakly supervised Iterative Spinal Segmentation (WISS) method that leverages only four corner landmark weak labels on a single sagittal slice to achieve automatic volumetric segmentation of VBs from CT images. WISS first segments VBs on an annotated sagittal slice in an iterative self-training manner. This self-training method alternates between training and refining labels in the training set. Then WISS proceeds to segment the whole VBs slice by slice with a slice-propagation method to obtain volumetric segmentations. We evaluate the performance of WISS on a private spinal metastases CT dataset and a public lumbar CT dataset. On the first dataset, WISS achieves distinct improvements with two different backbones. On the second dataset, WISS achieves dice coefficients of 91.7% and 83.7% for mid-sagittal slices and 3D CT volumes, respectively, saving substantial labeling costs while sacrificing only a little segmentation performance.

Submitted 13 February, 2024; originally announced February 2024.

Comments: arXiv admin note: text overlap with arXiv:1412.7062 by other authors
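The slice-propagation step can be summarized as: use the mask predicted on one slice to seed its neighbor, and sweep outward from the annotated mid-sagittal slice. A sketch under that assumption, with a hypothetical `segment_slice` model in place of WISS's actual networks:

```python
def propagate_segmentation(volume, mid_index, segment_slice):
    """Sweep from an annotated mid-sagittal slice outward, seeding each
    slice's prediction with its neighbor's mask. `volume` is a list of
    2-D slices; `segment_slice(image, prior_mask)` is a stand-in for a
    trained single-slice segmentation model."""
    masks = {mid_index: segment_slice(volume[mid_index], prior_mask=None)}
    for idx in range(mid_index + 1, len(volume)):   # sweep toward one side
        masks[idx] = segment_slice(volume[idx], masks[idx - 1])
    for idx in range(mid_index - 1, -1, -1):        # sweep toward the other
        masks[idx] = segment_slice(volume[idx], masks[idx + 1])
    return [masks[i] for i in range(len(volume))]
```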
arXiv:2402.04274 (https://arxiv.org/abs/2402.04274) [pdf, other] - q-bio.NC (Neurons and Cognition), cs.LG (Machine Learning), cs.NE (Neural and Evolutionary Computing)

Title: FPGA Deployment of LFADS for Real-time Neuroscience Experiments

Authors: Xiaohan Liu, ChiJui Chen, YanLun Huang, LingChi Yang, Elham E Khoda, Yihui Chen, Scott Hauck, Shih-Chieh Hsu, Bo-Cheng Lai

Abstract: Large-scale recordings of neural activity are providing new opportunities to study neural population dynamics. A powerful method for analyzing such high-dimensional measurements is to deploy an algorithm to learn the low-dimensional latent dynamics. LFADS (Latent Factor Analysis via Dynamical Systems) is a deep learning method for inferring latent dynamics from high-dimensional neural spiking data recorded simultaneously in single trials. This method has shown remarkable performance in modeling complex brain signals, with an average inference latency in the millisecond range. As our capacity for simultaneously recording many neurons increases exponentially, it is becoming crucial to support low-latency inference for these algorithms. To improve the real-time processing ability of LFADS, we introduce an efficient implementation of LFADS models on Field Programmable Gate Arrays (FPGAs). Our implementation shows an inference latency of 41.97 μs for processing the data of a single trial on a Xilinx U55C.

Submitted 2 February, 2024; originally announced February 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 8 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Fast Machine Learning for Science, ICCAD 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.12063">arXiv:2312.12063</a> <span> [<a href="https://arxiv.org/pdf/2312.12063">pdf</a>, <a href="https://arxiv.org/format/2312.12063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Resource-efficient Generative Mobile Edge Networks in 6G Era: Fundamentals, Framework and Case Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bingkun Lai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Jinbo Wen</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiangtian Nie</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+C">Changyan Yi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengli Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.12063v1-abstract-short" style="display: inline;"> As the next-generation wireless communication system, Sixth-Generation (6G) technologies are emerging, enabling various mobile edge networks that can revolutionize wireless communication and connectivity. By integrating Generative Artificial Intelligence (GAI) with mobile edge networks, generative mobile edge networks possess immense potential to enhance the intelligence and efficiency of wireless… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12063v1-abstract-full').style.display = 'inline'; document.getElementById('2312.12063v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.12063v1-abstract-full" style="display: none;"> As the next-generation wireless communication system, Sixth-Generation (6G) technologies are emerging, enabling various mobile edge networks that can revolutionize wireless communication and connectivity. By integrating Generative Artificial Intelligence (GAI) with mobile edge networks, generative mobile edge networks possess immense potential to enhance the intelligence and efficiency of wireless communication networks. In this article, we propose the concept of generative mobile edge networks and overview widely adopted GAI technologies and their applications in mobile edge networks. We then discuss the potential challenges faced by generative mobile edge networks in resource-constrained scenarios. 
To address these challenges, we develop a universal resource-efficient generative incentive mechanism framework, in which we design resource-efficient methods for network overhead reduction, formulate appropriate incentive mechanisms for the resource allocation problem, and utilize Generative Diffusion Models (GDMs) to find the optimal incentive mechanism solutions. Furthermore, we conduct a case study on resource-constrained mobile edge networks, employing model partition for efficient AI task offloading and proposing a GDM-based Stackelberg model to motivate edge devices to contribute computing resources for mobile edge intelligence. Finally, we propose several open directions that could contribute to the future popularity of generative mobile edge networks.

Submitted 19 December, 2023; originally announced December 2023.

arXiv:2312.03849 (https://arxiv.org/abs/2312.03849) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition)

Title: LEGO: Learning EGOcentric Action Frame Generation via Visual Instruction Tuning

Authors: Bolin Lai, Xiaoliang Dai, Lawrence Chen, Guan Pang, James M. Rehg, Miao Liu

Abstract: Generating instructional images of human daily actions from an egocentric viewpoint serves as a key step towards efficient skill transfer. In this paper, we introduce a novel problem -- egocentric action frame generation.
The goal is to synthesize an image depicting an action in the user's context (i.e., an action frame) by conditioning on a user prompt and an input egocentric image. Notably, existing egocentric action datasets lack the detailed annotations that describe the execution of actions. Additionally, existing diffusion-based image manipulation models are sub-optimal in controlling the state transition of an action in egocentric image pixel space because of the domain gap. To this end, we propose to Learn EGOcentric (LEGO) action frame generation via visual instruction tuning. First, we introduce a prompt enhancement scheme to generate enriched action descriptions from a visual large language model (VLLM) by visual instruction tuning. Then we propose a novel method to leverage image and text embeddings from the VLLM as additional conditioning to improve the performance of a diffusion model. We validate our model on two egocentric datasets -- Ego4D and Epic-Kitchens. Our experiments show substantial improvement over prior image manipulation models in both quantitative and qualitative evaluation. We also conduct detailed ablation studies and analysis to provide insights into our method. More details of the dataset and code are available on the website (https://bolinlai.github.io/Lego_EgoActGen/).

Submitted 22 March, 2024; v1 submitted 6 December, 2023; originally announced December 2023.

Comments: 34 pages
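Conditioning a diffusion model on extra embeddings, as the abstract describes, often amounts to projecting the new vectors into the cross-attention width and appending them to the usual text tokens. The wiring below uses made-up dimensions and is not the LEGO architecture.

```python
import torch
import torch.nn as nn

class ExtraConditioning(nn.Module):
    """Project VLLM image/text embeddings into the diffusion model's
    cross-attention width and append them to the text tokens that the
    denoiser attends to. Dimensions are illustrative."""
    def __init__(self, vllm_dim=4096, cond_dim=768):
        super().__init__()
        self.proj = nn.Linear(vllm_dim, cond_dim)

    def forward(self, text_tokens, vllm_embeds):
        # text_tokens: (B, L, cond_dim); vllm_embeds: (B, K, vllm_dim)
        extra = self.proj(vllm_embeds)             # (B, K, cond_dim)
        return torch.cat([text_tokens, extra], 1)  # (B, L+K, cond_dim)
```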
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.09985">arXiv:2308.09985</a> <span> [<a href="https://arxiv.org/pdf/2308.09985">pdf</a>, <a href="https://arxiv.org/format/2308.09985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TNNLS.2024.3384987">10.1109/TNNLS.2024.3384987 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> HICL: Hashtag-Driven In-Context Learning for Social Media Natural Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hanzhuo Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chunpu Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqun Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zeyang Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zeyu Chen</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Baohua Lai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09985v1-abstract-short" style="display: inline;"> Natural language understanding (NLU) is integral to various social media applications. However, existing NLU models rely heavily on context for semantic learning, resulting in compromised performance when faced with short and noisy social media content. To address this issue, we leverage in-context learning (ICL), wherein language models learn to make inferences by conditioning on a handful of dem… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09985v1-abstract-full').style.display = 'inline'; document.getElementById('2308.09985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09985v1-abstract-full" style="display: none;"> Natural language understanding (NLU) is integral to various social media applications. However, existing NLU models rely heavily on context for semantic learning, resulting in compromised performance when faced with short and noisy social media content. To address this issue, we leverage in-context learning (ICL), wherein language models learn to make inferences by conditioning on a handful of demonstrations to enrich the context and propose a novel hashtag-driven in-context learning (HICL) framework. Concretely, we pre-train a model #Encoder, which employs #hashtags (user-annotated topic labels) to drive BERT-based pre-training through contrastive learning. Our objective here is to enable #Encoder to gain the ability to incorporate topic-related semantic information, which allows it to retrieve topic-related posts to enrich contexts and enhance social media NLU with noisy contexts. 
To further integrate the retrieved context with the source text, we employ a gradient-based method to identify trigger terms useful for fusing information from the two sources. For empirical studies, we collected 45M tweets to set up an in-context NLU benchmark, and the experimental results on seven downstream tasks show that HICL substantially advances the previous state-of-the-art results. Furthermore, we conducted extensive analyses and found that: (1) combining the source input with a top-retrieved post from #Encoder is more effective than using semantically similar posts; and (2) trigger words largely help in merging context from the source and retrieved posts.

Submitted 19 August, 2023; originally announced August 2023.

Comments: https://github.com/albertan017/HICL

Journal ref: 10.1109/TNNLS.2024.3384987
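A common way to realize a "gradient-based method to identify trigger terms" is input-gradient saliency over token embeddings. The sketch below ranks tokens that way for a BERT-style classifier; it is a generic recipe with assumed HuggingFace-style objects, not the HICL code.

```python
import torch

def trigger_term_scores(model, embeddings, attention_mask, label):
    """Rank tokens by the L2 norm of d(loss)/d(embedding) per token.
    `embeddings`: (1, L, D) token-embedding tensor; `model` is assumed to
    accept HuggingFace-style `inputs_embeds` and return `.logits`."""
    embeddings = embeddings.detach().clone().requires_grad_(True)
    logits = model(inputs_embeds=embeddings,
                   attention_mask=attention_mask).logits
    loss = torch.nn.functional.cross_entropy(logits, torch.tensor([label]))
    loss.backward()
    return embeddings.grad.norm(dim=-1).squeeze(0)  # (L,) saliency per token
```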
arXiv:2307.15975 (https://arxiv.org/abs/2307.15975) [pdf, ps, other] - cs.GT (Computer Science and Game Theory), cs.DC (Distributed, Parallel, and Cluster Computing), cs.LG (Machine Learning)

Title: Blockchain-empowered Federated Learning for Healthcare Metaverses: User-centric Incentive Mechanism with Optimal Data Freshness

Authors: Jiawen Kang, Jinbo Wen, Dongdong Ye, Bingkun Lai, Tianhao Wu, Zehui Xiong, Jiangtian Nie, Dusit Niyato, Yang Zhang, Shengli Xie

Abstract: Given the revolutionary role of metaverses, healthcare metaverses are emerging as a transformative force, creating intelligent healthcare systems that offer immersive and personalized services. Healthcare metaverses allow for effective decision-making and data analytics for users. However, critical challenges remain in building healthcare metaverses, such as the risk of sensitive data leakage, issues with sensing-data security and freshness, and concerns around incentivizing data sharing. In this paper, we first design a user-centric privacy-preserving framework based on decentralized Federated Learning (FL) for healthcare metaverses. To further improve privacy protection, a cross-chain empowered FL framework is utilized to enhance sensing-data security. This framework uses a hierarchical cross-chain architecture with a main chain and multiple subchains to perform decentralized, privacy-preserving, and secure data training in both virtual and physical spaces. Moreover, we utilize Age of Information (AoI) as an effective data-freshness metric and propose an AoI-based contract theory model under Prospect Theory (PT) to motivate sensing-data sharing in a user-centric manner. This model exploits PT to better capture the subjective utility of the service provider. Finally, our numerical results demonstrate the effectiveness of the proposed schemes for healthcare metaverses.

Submitted 29 July, 2023; originally announced July 2023.
arXiv:2306.11330 (https://arxiv.org/abs/2306.11330) [pdf, other] - cs.AR (Hardware Architecture), cs.LG (Machine Learning), hep-ex (High Energy Physics - Experiment)

Title: Low Latency Edge Classification GNN for Particle Trajectory Tracking on FPGAs

Authors: Shi-Yu Huang, Yun-Chen Yang, Yu-Ru Su, Bo-Cheng Lai, Javier Duarte, Scott Hauck, Shih-Chieh Hsu, Jin-Xuan Hu, Mark S. Neubauer

Abstract: In-time particle trajectory reconstruction in the Large Hadron Collider is challenging due to the high collision rate and numerous particle hits. Using a GNN (Graph Neural Network) on an FPGA has enabled superior accuracy with flexible trajectory classification. However, existing GNN architectures have inefficient resource usage and insufficient parallelism for edge classification. This paper introduces a resource-efficient GNN architecture on FPGAs for low-latency particle tracking. The modular architecture facilitates design scalability to support large graphs. Leveraging the geometric properties of hit detectors further reduces graph complexity and resource usage. Our results on a Xilinx UltraScale+ VU9P demonstrate 1625x and 1574x performance improvements over CPU and GPU respectively.

Submitted 27 June, 2023; v1 submitted 20 June, 2023; originally announced June 2023.
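Edge classification for tracking, as described above, typically scores each candidate edge from the embeddings of its two endpoint hits. A minimal PyTorch version with assumed feature sizes (the paper's contribution is the FPGA architecture, which this does not capture):

```python
import torch
import torch.nn as nn

class EdgeClassifier(nn.Module):
    """Embed hit features, then score each edge (pair of hits) as
    track/noise from the concatenated endpoint embeddings."""
    def __init__(self, hit_dim=3, hidden=64):
        super().__init__()
        self.node_net = nn.Sequential(nn.Linear(hit_dim, hidden), nn.ReLU())
        self.edge_net = nn.Sequential(
            nn.Linear(2 * hidden, hidden), nn.ReLU(), nn.Linear(hidden, 1))

    def forward(self, hits, edge_index):
        # hits: (N, hit_dim) detector-hit features
        # edge_index: (2, E) endpoint indices of candidate edges
        h = self.node_net(hits)
        src, dst = edge_index
        return self.edge_net(torch.cat([h[src], h[dst]], dim=1)).squeeze(-1)
```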
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11330v2-abstract-full').style.display = 'none'; document.getElementById('2306.11330v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.03907">arXiv:2305.03907</a> <span> [<a href="https://arxiv.org/pdf/2305.03907">pdf</a>, <a href="https://arxiv.org/format/2305.03907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Listen to Look into the Future: Audio-Visual Egocentric Gaze Anticipation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Ryan%2C+F">Fiona Ryan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+W">Wenqi Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Miao Liu</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. Rehg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.03907v3-abstract-short" style="display: inline;"> Egocentric gaze anticipation serves as a key building block for the emerging capability of Augmented Reality. Notably, gaze behavior is driven by both visual cues and audio signals during daily activities. Motivated by this observation, we introduce the first model that leverages both the video and audio modalities for egocentric gaze anticipation. Specifically, we propose a Contrastive Spatial-Te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.03907v3-abstract-full').style.display = 'inline'; document.getElementById('2305.03907v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.03907v3-abstract-full" style="display: none;"> Egocentric gaze anticipation serves as a key building block for the emerging capability of Augmented Reality. Notably, gaze behavior is driven by both visual cues and audio signals during daily activities. Motivated by this observation, we introduce the first model that leverages both the video and audio modalities for egocentric gaze anticipation. Specifically, we propose a Contrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two modules to separately capture audio-visual correlations in spatial and temporal dimensions, and applies a contrastive loss on the re-weighted audio-visual features from fusion modules for representation learning. We conduct extensive ablation studies and thorough analysis using two egocentric video datasets: Ego4D and Aria, to validate our model design. We demonstrate the audio improves the performance by +2.5% and +2.4% on the two datasets. Our model also outperforms the prior state-of-the-art methods by at least +1.9% and +1.6%. 
Moreover, we provide visualizations of the gaze anticipation results and additional insights into audio-visual representation learning. The code and data split are available on our website (https://bolinlai.github.io/CSTS-EgoGazeAnticipation/).

Submitted 22 March, 2024; v1 submitted 5 May, 2023; originally announced May 2023.

Comments: 30 pages
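The contrastive term mentioned above is, in spirit, an InfoNCE-style loss that pulls matched audio and visual features from the same clip together and pushes mismatched pairs apart. A generic symmetric version with an assumed temperature, not the exact CSTS objective:

```python
import torch
import torch.nn.functional as F

def audio_visual_nce(audio, video, temperature=0.07):
    """audio, video: (B, D) pooled features from the same B clips.
    Matched pairs (same row) are positives; all other pairs negatives."""
    a = F.normalize(audio, dim=1)
    v = F.normalize(video, dim=1)
    logits = a @ v.t() / temperature            # (B, B) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```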
arXiv:2212.08279 (https://arxiv.org/abs/2212.08279) [pdf, other] - cs.LG (Machine Learning), cs.CL (Computation and Language), cs.CV (Computer Vision and Pattern Recognition)

Title: Werewolf Among Us: A Multimodal Dataset for Modeling Persuasion Behaviors in Social Deduction Games

Authors: Bolin Lai, Hongxin Zhang, Miao Liu, Aryan Pariani, Fiona Ryan, Wenqi Jia, Shirley Anugrah Hayati, James M. Rehg, Diyi Yang

Abstract: Persuasion modeling is a key building block for conversational agents. Existing works in this direction are limited to analyzing textual dialogue corpora. We argue that visual signals also play an important role in understanding human persuasive behaviors. In this paper, we introduce the first multimodal dataset for modeling persuasion behaviors. Our dataset includes 199 dialogue transcriptions and videos captured in a multi-player social deduction game setting, 26,647 utterance-level annotations of persuasion strategy, and game-level annotations of deduction game outcomes. We provide extensive experiments to show how dialogue context and visual signals benefit persuasion strategy prediction. We also explore the generalization ability of language models for persuasion modeling and the role of persuasion strategies in predicting social deduction game outcomes. Our dataset, code, and models can be found at https://persuasion-deductiongame.socialai-data.org.

Submitted 15 December, 2022; originally announced December 2022.

Comments: 17 pages
arXiv:2210.10984 (https://arxiv.org/abs/2210.10984) [pdf, other] - cs.CV (Computer Vision and Pattern Recognition)

Title: RAIS: Robust and Accurate Interactive Segmentation via Continual Learning

Authors: Yuying Hao, Yi Liu, Juncai Peng, Haoyi Xiong, Guowei Chen, Shiyu Tang, Zeyu Chen, Baohua Lai

Abstract: Interactive image segmentation aims at segmenting a target region through human-computer interaction. Recent works based on deep learning have achieved excellent performance, but most of them focus on improving accuracy on the training set and ignore potential improvement on the test set. In the inference phase, they tend to perform well on domains similar to the training set and lack adaptability to domain shift, so they require more user effort to obtain satisfactory results. In this work, we propose RAIS, a robust and accurate architecture for interactive segmentation with continual learning, where the model can learn from both the training and test sets. For efficient learning on the test set, we propose a novel optimization strategy that updates global and local parameters with a basic segmentation module and an adaptation module, respectively. Moreover, extensive experiments on several benchmarks show that our method can handle data distribution shifts and achieves state-of-the-art performance compared with recent interactive segmentation methods. Our method also shows its robustness on remote sensing and medical imaging datasets, where the data domains differ completely between training and testing.

Submitted 19 October, 2022; originally announced October 2022.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.08788">arXiv:2210.08788</a> <span> [<a href="https://arxiv.org/pdf/2210.08788">pdf</a>, <a href="https://arxiv.org/format/2210.08788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EISeg: An Efficient Interactive Segmentation Tool based on PaddlePaddle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yuying Hao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yizhou Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Lin Han</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Juncai Peng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shiyu Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guowei Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zewu Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zeyu Chen</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Baohua Lai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.08788v2-abstract-short" style="display: inline;"> In recent years, the rapid development of deep learning has brought great advancements to image and video segmentation methods based on neural networks. However, to unleash the full potential of such models, large numbers of high-quality annotated images are necessary for model training. Currently, many widely used open-source image segmentation software relies heavily on manual annotation which i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.08788v2-abstract-full').style.display = 'inline'; document.getElementById('2210.08788v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.08788v2-abstract-full" style="display: none;"> In recent years, the rapid development of deep learning has brought great advancements to image and video segmentation methods based on neural networks. However, to unleash the full potential of such models, large numbers of high-quality annotated images are necessary for model training. Currently, many widely used open-source image segmentation software relies heavily on manual annotation which is tedious and time-consuming. In this work, we introduce EISeg, an Efficient Interactive SEGmentation annotation tool that can drastically improve image segmentation annotation efficiency, generating highly accurate segmentation masks with only a few clicks. We also provide various domain-specific models for remote sensing, medical imaging, industrial quality inspections, human segmentation, and temporal aware models for video segmentation. The source code for our algorithm and user interface are available at: https://github.com/PaddlePaddle/PaddleSeg. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.08788v2-abstract-full').style.display = 'none'; document.getElementById('2210.08788v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.04464">arXiv:2208.04464</a> <span> [<a href="https://arxiv.org/pdf/2208.04464">pdf</a>, <a href="https://arxiv.org/format/2208.04464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> In the Eye of Transformer: Global-Local Correlation for Egocentric Gaze Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Miao Liu</a>, <a href="/search/cs?searchtype=author&query=Ryan%2C+F">Fiona Ryan</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. Rehg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.04464v3-abstract-short" style="display: inline;"> In this paper, we present the first transformer-based model to address the challenging problem of egocentric gaze estimation. We observe that the connection between the global scene context and local visual information is vital for localizing the gaze fixation from egocentric video frames. To this end, we design the transformer encoder to embed the global context as one additional visual token and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.04464v3-abstract-full').style.display = 'inline'; document.getElementById('2208.04464v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.04464v3-abstract-full" style="display: none;"> In this paper, we present the first transformer-based model to address the challenging problem of egocentric gaze estimation. We observe that the connection between the global scene context and local visual information is vital for localizing the gaze fixation from egocentric video frames. To this end, we design the transformer encoder to embed the global context as one additional visual token and further propose a novel Global-Local Correlation (GLC) module to explicitly model the correlation of the global token and each local token. We validate our model on two egocentric video datasets - EGTEA Gaze+ and Ego4D. Our detailed ablation studies demonstrate the benefits of our method. In addition, our approach exceeds previous state-of-the-arts by a large margin. 
arXiv:2208.04464 [cs.CV] (https://arxiv.org/abs/2208.04464)
In the Eye of Transformer: Global-Local Correlation for Egocentric Gaze Estimation
Authors: Bolin Lai, Miao Liu, Fiona Ryan, James M. Rehg
Abstract: In this paper, we present the first transformer-based model to address the challenging problem of egocentric gaze estimation. We observe that the connection between the global scene context and local visual information is vital for localizing the gaze fixation from egocentric video frames. To this end, we design the transformer encoder to embed the global context as one additional visual token and further propose a novel Global-Local Correlation (GLC) module to explicitly model the correlation of the global token and each local token. We validate our model on two egocentric video datasets, EGTEA Gaze+ and Ego4D. Our detailed ablation studies demonstrate the benefits of our method. In addition, our approach exceeds previous state-of-the-art methods by a large margin. We also provide additional visualizations to support our claim that global-local correlation serves as a key representation for predicting gaze fixation from egocentric videos. More details can be found on our website (https://bolinlai.github.io/GLC-EgoGazeEst).
Submitted 15 October, 2024; v1 submitted 8 August, 2022; originally announced August 2022.
Comments: 23 pages
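The abstract's central idea, correlating one global token against every local patch token, can be sketched in a few lines. This is only a rough reading of the mechanism; how the global token is built (mean pooling here) and how the scores are applied are assumptions, not the paper's GLC module:

```python
import torch

# Sketch of the global-local correlation idea: an extra "global" token is
# correlated with every local (patch) token, and the scores reweight the
# local features. Mean pooling for the global token is an assumption.

def global_local_correlation(local_tokens):        # (B, N, D)
    g = local_tokens.mean(dim=1, keepdim=True)     # (B, 1, D) global token
    scores = local_tokens @ g.transpose(1, 2)      # (B, N, 1) correlation
    scores = scores / local_tokens.shape[-1] ** 0.5
    weights = torch.softmax(scores, dim=1)
    return local_tokens * (1.0 + weights)          # emphasize correlated tokens

tokens = torch.randn(2, 196, 64)                   # e.g. 14x14 patches
out = global_local_correlation(tokens)
print(out.shape)                                   # torch.Size([2, 196, 64])
```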
arXiv:2206.03001 [cs.CV] (https://arxiv.org/abs/2206.03001)
PP-OCRv3: More Attempts for the Improvement of Ultra Lightweight OCR System
Authors: Chenxia Li, Weiwei Liu, Ruoyu Guo, Xiaoting Yin, Kaitao Jiang, Yongkun Du, Yuning Du, Lingfeng Zhu, Baohua Lai, Xiaoguang Hu, Dianhai Yu, Yanjun Ma
Abstract: Optical character recognition (OCR) technology has been widely used in various scenes, as shown in Figure 1. Designing a practical OCR system is still a meaningful but challenging task. In previous work, considering both efficiency and accuracy, we proposed a practical ultra-lightweight OCR system (PP-OCR) and an optimized version, PP-OCRv2. To further improve the performance of PP-OCRv2, a more robust OCR system, PP-OCRv3, is proposed in this paper. PP-OCRv3 upgrades the text detection and text recognition models in nine aspects based on PP-OCRv2. For the text detector, we introduce a PAN module with a large receptive field named LK-PAN, an FPN module with a residual attention mechanism named RSE-FPN, and a DML distillation strategy. For the text recognizer, the base model is replaced from CRNN to SVTR, and we introduce the lightweight text recognition network SVTR-LCNet, guided training of CTC by attention, the data augmentation strategy TextConAug, a better pre-trained model via self-supervised TextRotNet, UDML, and UIM to accelerate the model and improve its accuracy. Experiments on real data show that the hmean of PP-OCRv3 is 5% higher than that of PP-OCRv2 at comparable inference speed. All the above-mentioned models are open-sourced, and the code is available in the GitHub repository PaddleOCR, which is powered by PaddlePaddle.
Submitted 14 June, 2022; v1 submitted 7 June, 2022; originally announced June 2022.
Comments: arXiv admin note: text overlap with arXiv:2109.03144
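Among the strategies listed, DML (deep mutual learning) distillation is the most self-contained: two peer models train together, each adding a KL term toward the other's softened predictions. A minimal sketch of that loss follows; the toy logits, temperature, and weighting are illustrative assumptions, not PP-OCRv3's configuration:

```python
import torch
import torch.nn.functional as F

# Sketch of DML-style mutual distillation: each peer is trained with its own
# cross-entropy plus a KL term toward the other's softened output.

def dml_loss(logits_a, logits_b, targets, t=1.0, alpha=1.0):
    ce = F.cross_entropy(logits_a, targets) + F.cross_entropy(logits_b, targets)
    kl_a = F.kl_div(F.log_softmax(logits_a / t, dim=1),
                    F.softmax(logits_b.detach() / t, dim=1),
                    reduction="batchmean") * t * t
    kl_b = F.kl_div(F.log_softmax(logits_b / t, dim=1),
                    F.softmax(logits_a.detach() / t, dim=1),
                    reduction="batchmean") * t * t
    return ce + alpha * (kl_a + kl_b)

a = torch.randn(8, 10, requires_grad=True)   # peer A logits
b = torch.randn(8, 10, requires_grad=True)   # peer B logits
y = torch.randint(0, 10, (8,))
loss = dml_loss(a, b, y)
loss.backward()
```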
<a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiwen Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaoguang Hu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dianhai Yu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yanjun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.02681v1-abstract-short" style="display: inline;"> Real-world applications have high demands for semantic segmentation methods. Although semantic segmentation has made remarkable leap-forwards with deep learning, the performance of real-time methods is not satisfactory. In this work, we propose PP-LiteSeg, a novel lightweight model for the real-time semantic segmentation task. Specifically, we present a Flexible and Lightweight Decoder (FLD) to re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02681v1-abstract-full').style.display = 'inline'; document.getElementById('2204.02681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.02681v1-abstract-full" style="display: none;"> Real-world applications have high demands for semantic segmentation methods. Although semantic segmentation has made remarkable leap-forwards with deep learning, the performance of real-time methods is not satisfactory. In this work, we propose PP-LiteSeg, a novel lightweight model for the real-time semantic segmentation task. Specifically, we present a Flexible and Lightweight Decoder (FLD) to reduce computation overhead of previous decoder. To strengthen feature representations, we propose a Unified Attention Fusion Module (UAFM), which takes advantage of spatial and channel attention to produce a weight and then fuses the input features with the weight. Moreover, a Simple Pyramid Pooling Module (SPPM) is proposed to aggregate global context with low computation cost. Extensive evaluations demonstrate that PP-LiteSeg achieves a superior trade-off between accuracy and speed compared to other methods. On the Cityscapes test set, PP-LiteSeg achieves 72.0% mIoU/273.6 FPS and 77.5% mIoU/102.6 FPS on NVIDIA GTX 1080Ti. Source code and models are available at PaddleSeg: https://github.com/PaddlePaddle/PaddleSeg. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02681v1-abstract-full').style.display = 'none'; document.getElementById('2204.02681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
arXiv:2204.00826 [cs.CV] (https://arxiv.org/abs/2204.00826)
Online Convolutional Re-parameterization
Authors: Mu Hu, Junyi Feng, Jiashen Hua, Baisheng Lai, Jianqiang Huang, Xiaojin Gong, Xiansheng Hua
Abstract: Structural re-parameterization has drawn increasing attention in various computer vision tasks. It aims at improving the performance of deep models without introducing any inference-time cost. Though efficient during inference, such models rely heavily on complicated training-time blocks to achieve high accuracy, leading to large extra training cost. In this paper, we present online convolutional re-parameterization (OREPA), a two-stage pipeline that aims to reduce the huge training overhead by squeezing the complex training-time block into a single convolution. To achieve this goal, we introduce a linear scaling layer for better optimizing the online blocks. Assisted by the reduced training cost, we also explore more effective re-param components. Compared with state-of-the-art re-param models, OREPA is able to save training-time memory cost by about 70% and accelerate training by around 2x. Meanwhile, equipped with OREPA, the models outperform previous methods on ImageNet by up to +0.6%. We also conduct experiments on object detection and semantic segmentation and show consistent improvements on the downstream tasks. Code is available at https://github.com/JUGGHM/OREPA_CVPR2022.
Submitted 2 April, 2022; originally announced April 2022.
Comments: Accepted by CVPR 2022
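The linear-algebra step that makes re-parameterization possible, collapsing a multi-branch linear block into one convolution, is easy to verify numerically. The toy below merges two parallel 3x3 convolutions (one followed by a channel-wise scaling layer) into a single kernel; OREPA's contribution is performing this collapse online during training, which this sketch does not reproduce:

```python
import torch
import torch.nn.functional as F

# Because convolution is linear, parallel branches (and channel-wise scaling)
# fold into a single kernel with identical output.

x = torch.randn(1, 8, 16, 16)
w1 = torch.randn(8, 8, 3, 3)
w2 = torch.randn(8, 8, 3, 3)
scale = torch.randn(8)                      # linear scaling layer on branch 2

y_branches = F.conv2d(x, w1, padding=1) \
           + scale.view(1, -1, 1, 1) * F.conv2d(x, w2, padding=1)

w_merged = w1 + scale.view(-1, 1, 1, 1) * w2   # fold scaling into the kernel
y_merged = F.conv2d(x, w_merged, padding=1)

print(torch.allclose(y_branches, y_merged, atol=1e-5))   # True
```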
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00826v1-abstract-full').style.display = 'none'; document.getElementById('2204.00826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.16250">arXiv:2203.16250</a> <span> [<a href="https://arxiv.org/pdf/2203.16250">pdf</a>, <a href="https://arxiv.org/format/2203.16250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PP-YOLOE: An evolved version of YOLO </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shangliang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinxin Wang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+W">Wenyu Lv</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Q">Qinyao Chang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+C">Cheng Cui</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+K">Kaipeng Deng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanzhong Wang</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Q">Qingqing Dang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shengyu Wei</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuning Du</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Baohua Lai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.16250v3-abstract-short" style="display: inline;"> In this report, we present PP-YOLOE, an industrial state-of-the-art object detector with high performance and friendly deployment. We optimize on the basis of the previous PP-YOLOv2, using anchor-free paradigm, more powerful backbone and neck equipped with CSPRepResStage, ET-head and dynamic label assignment algorithm TAL. We provide s/m/l/x models for different practice scenarios. As a result, PP… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16250v3-abstract-full').style.display = 'inline'; document.getElementById('2203.16250v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.16250v3-abstract-full" style="display: none;"> In this report, we present PP-YOLOE, an industrial state-of-the-art object detector with high performance and friendly deployment. We optimize on the basis of the previous PP-YOLOv2, using anchor-free paradigm, more powerful backbone and neck equipped with CSPRepResStage, ET-head and dynamic label assignment algorithm TAL. We provide s/m/l/x models for different practice scenarios. 
arXiv:2201.05820 [cs.CV] (https://arxiv.org/abs/2201.05820) DOI: 10.1109/TIP.2022.3213193 (https://doi.org/10.1109/TIP.2022.3213193)
Offline-Online Associated Camera-Aware Proxies for Unsupervised Person Re-identification
Authors: Menglin Wang, Jiachen Li, Baisheng Lai, Xiaojin Gong, Xian-Sheng Hua
Abstract: Recently, unsupervised person re-identification (Re-ID) has received increasing research attention due to its potential for label-free applications. A promising way to address unsupervised Re-ID is clustering-based, which generates pseudo labels by clustering and uses the pseudo labels to train a Re-ID model iteratively. However, most clustering-based methods take each cluster as a pseudo identity class, neglecting the intra-cluster variance mainly caused by the change of cameras. To address this issue, we propose to split each single cluster into multiple proxies according to camera views. The camera-aware proxies explicitly capture local structures within clusters, by which the intra-ID variance and inter-ID similarity can be better tackled. Assisted by the camera-aware proxies, we design two proxy-level contrastive learning losses based on offline and online association results, respectively. The offline association directly associates proxies according to the clustering and splitting results, while the online strategy dynamically associates proxies in terms of up-to-date features to reduce the noise caused by the delayed update of pseudo labels. The combination of the two losses enables us to train a desirable Re-ID model. Extensive experiments on three person Re-ID datasets and one vehicle Re-ID dataset show that our proposed approach demonstrates competitive performance with state-of-the-art methods. Code will be available at: https://github.com/Terminator8758/O2CAP.
Submitted 1 October, 2022; v1 submitted 15 January, 2022; originally announced January 2022.
Comments: Accepted to TIP
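The proxy construction is concrete enough to sketch: each cluster is split per camera into proxies (mean features), and a proxy-level contrastive loss pulls a sample toward its own camera-aware proxy. The offline/online association logic of the paper is omitted here; the helper names and the InfoNCE form are assumptions for illustration:

```python
import torch
import torch.nn.functional as F

# Sketch: split clusters by camera into proxies, then apply a proxy-level
# InfoNCE loss. Hypothetical helpers; not the paper's O2CAP implementation.

def build_proxies(feats, cluster_ids, cam_ids):
    proxies, proxy_of = [], {}
    for key in sorted(set(zip(cluster_ids, cam_ids))):
        idx = [i for i, k in enumerate(zip(cluster_ids, cam_ids)) if k == key]
        proxy_of[key] = len(proxies)
        proxies.append(feats[idx].mean(0))        # one proxy per (cluster, cam)
    return F.normalize(torch.stack(proxies), dim=1), proxy_of

def proxy_contrastive_loss(feat, proxy_index, proxies, tau=0.07):
    sims = F.normalize(feat, dim=0) @ proxies.T / tau
    return F.cross_entropy(sims.unsqueeze(0), torch.tensor([proxy_index]))

feats = F.normalize(torch.randn(12, 64), dim=1)
clusters = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
cams     = [0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1]
proxies, proxy_of = build_proxies(feats, clusters, cams)
loss = proxy_contrastive_loss(feats[0], proxy_of[(0, 0)], proxies)
```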
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to TIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.07146">arXiv:2112.07146</a> <span> [<a href="https://arxiv.org/pdf/2112.07146">pdf</a>, <a href="https://arxiv.org/format/2112.07146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+L">Lutao Chu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zewu Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shiyu Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guowei Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yuying Hao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Juncai Peng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiliang Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zeyu Chen</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Baohua Lai</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Haoyi Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.07146v1-abstract-short" style="display: inline;"> As the COVID-19 pandemic rampages across the world, the demands of video conferencing surge. To this end, real-time portrait segmentation becomes a popular feature to replace backgrounds of conferencing participants. While feature-rich datasets, models and algorithms have been offered for segmentation that extract body postures from life scenes, portrait segmentation has yet not been well covered… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.07146v1-abstract-full').style.display = 'inline'; document.getElementById('2112.07146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.07146v1-abstract-full" style="display: none;"> As the COVID-19 pandemic rampages across the world, the demands of video conferencing surge. To this end, real-time portrait segmentation becomes a popular feature to replace backgrounds of conferencing participants. While feature-rich datasets, models and algorithms have been offered for segmentation that extract body postures from life scenes, portrait segmentation has yet not been well covered in a video conferencing context. To facilitate the progress in this field, we introduce an open-source solution named PP-HumanSeg. This work is the first to construct a large-scale video portrait dataset that contains 291 videos from 23 conference scenes with 14K fine-labeled frames and extensions to multi-camera teleconferencing. 
Furthermore, we propose a novel Semantic Connectivity-aware Learning (SCL) for semantic segmentation, which introduces a semantic connectivity-aware loss to improve the quality of segmentation results from the perspective of connectivity. And we propose an ultra-lightweight model with SCL for practical portrait segmentation, which achieves the best trade-off between IoU and the speed of inference. Extensive evaluations on our dataset demonstrate the superiority of SCL and our model. The source code is available at https://github.com/PaddlePaddle/PaddleSeg. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.07146v1-abstract-full').style.display = 'none'; document.getElementById('2112.07146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.02828">arXiv:2112.02828</a> <span> [<a href="https://arxiv.org/pdf/2112.02828">pdf</a>, <a href="https://arxiv.org/format/2112.02828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PP-MSVSR: Multi-Stage Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lielin Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Na Wang</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Q">Qingqing Dang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Rui Liu</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Baohua Lai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.02828v1-abstract-short" style="display: inline;"> Different from the Single Image Super-Resolution(SISR) task, the key for Video Super-Resolution(VSR) task is to make full use of complementary information across frames to reconstruct the high-resolution sequence. Since images from different frames with diverse motion and scene, accurately aligning multiple frames and effectively fusing different frames has always been the key research work of VSR… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02828v1-abstract-full').style.display = 'inline'; document.getElementById('2112.02828v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.02828v1-abstract-full" style="display: none;"> Different from the Single Image Super-Resolution(SISR) task, the key for Video Super-Resolution(VSR) task is to make full use of complementary information across frames to reconstruct the high-resolution sequence. 
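To make "segmentation quality from the perspective of connectivity" concrete, the toy below scores a prediction by matching connected components against the ground truth, so a mask broken into fragments scores worse even at similar pixel accuracy. This is a simplified, non-differentiable reading for intuition only; SCL's actual loss is defined so it can be used in training:

```python
import numpy as np
from scipy import ndimage

# Sketch: score each ground-truth connected component by its best IoU with
# a predicted component; fragmentation lowers the score.

def connectivity_score(pred, gt):
    pred_lab, n_pred = ndimage.label(pred)
    gt_lab, n_gt = ndimage.label(gt)
    if n_gt == 0:
        return 1.0 if n_pred == 0 else 0.0
    scores = []
    for g in range(1, n_gt + 1):
        g_mask = gt_lab == g
        best = 0.0
        for p in range(1, n_pred + 1):
            p_mask = pred_lab == p
            inter = np.logical_and(g_mask, p_mask).sum()
            union = np.logical_or(g_mask, p_mask).sum()
            best = max(best, inter / union)
        scores.append(best)
    return float(np.mean(scores))

gt = np.zeros((32, 32), dtype=bool); gt[4:12, 4:12] = True
pred = np.zeros((32, 32), dtype=bool)
pred[4:8, 4:12] = True; pred[10:12, 4:12] = True   # mask broken in two
print(connectivity_score(pred, gt))                 # < 1: fragmentation penalized
```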
arXiv:2112.02828 [cs.CV] (https://arxiv.org/abs/2112.02828)
PP-MSVSR: Multi-Stage Video Super-Resolution
Authors: Lielin Jiang, Na Wang, Qingqing Dang, Rui Liu, Baohua Lai
Abstract: Different from the Single Image Super-Resolution (SISR) task, the key to the Video Super-Resolution (VSR) task is to make full use of complementary information across frames to reconstruct the high-resolution sequence. Since frames differ in motion and scene content, accurately aligning multiple frames and effectively fusing them have always been central research problems for VSR. To utilize the rich complementary information of neighboring frames, we propose in this paper a multi-stage VSR deep architecture, dubbed PP-MSVSR, with a local fusion module, an auxiliary loss, and a re-align module to refine the enhanced result progressively. Specifically, to strengthen the fusion of features across frames during feature propagation, a local fusion module is designed in stage 1 to perform local feature fusion before propagation. Moreover, we introduce an auxiliary loss in stage 2 to make the features obtained by the propagation module retain more information correlated with the HR space, and a re-align module in stage 3 to make full use of the feature information of the previous stage. Extensive experiments substantiate that PP-MSVSR achieves promising performance on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters, and PP-MSVSR-L exceeds all state-of-the-art methods on the REDS4 dataset with a considerable number of parameters. Code and models will be released in PaddleGAN (https://github.com/PaddlePaddle/PaddleGAN).
Submitted 6 December, 2021; originally announced December 2021.
Comments: 8 pages, 6 figures, 3 tables
arXiv:2112.02521 [cs.LG, cs.AI] (https://arxiv.org/abs/2112.02521)
Inf-CP: A Reliable Channel Pruning based on Channel Influence
Authors: Bilan Lai, Haoran Xiang, Furao Shen
Abstract: One of the most effective methods of channel pruning is to trim on the basis of the importance of each neuron. However, measuring the importance of each neuron is an NP-hard problem. Previous works have proposed to trim by considering the statistics of a single layer or of several successive layers of neurons. These works cannot eliminate the influence of different data on the model in the reconstruction error, and currently there is no work proving that the absolute values of the parameters can be directly used as the basis for judging the importance of the weights. A more reasonable approach is to eliminate the differences between batches of data so as to accurately measure the influence of the weights. In this paper, we propose to use ensemble learning to train models on different batches of data, and to use the influence function (a classic technique from robust statistics) to trace the model's predictions back to the training-parameter gradients, so that we can determine the responsibility of each parameter, which we call "influence", in the prediction process. In addition, we theoretically prove that the back-propagation of a deep network is a first-order Taylor approximation of the influence function of the weights. We perform extensive experiments showing that pruning based on the influence function, combined with the idea of ensemble learning, is much more effective than focusing only on error reconstruction. Experiments on CIFAR show that influence-based pruning achieves state-of-the-art results.
Submitted 5 December, 2021; originally announced December 2021.
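The abstract's link between back-propagation and a first-order Taylor approximation suggests a gradient-times-weight channel saliency as the basic building block. The sketch below scores channels that way; it is only that building block under stated assumptions, not the paper's full Inf-CP estimator with influence functions over ensembles of batches:

```python
import torch
import torch.nn as nn

# Sketch of a first-order Taylor ("influence"-style) channel saliency: the
# loss change from removing a channel is approximated by |w . dL/dw| summed
# over that channel's weights.

torch.manual_seed(0)
net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                    nn.Conv2d(8, 4, 3, padding=1))
x = torch.randn(16, 3, 32, 32)
loss = net(x).pow(2).mean()           # stand-in for a real training loss
loss.backward()

w = net[0].weight                     # (8, 3, 3, 3): 8 prunable channels
saliency = (w * w.grad).abs().sum(dim=(1, 2, 3))
prune_order = saliency.argsort()      # least influential channels first
print(prune_order[:3])                # candidates to remove
```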
arXiv:2112.02048 [physics.ins-det, cs.AR, cs.LG, hep-ex, stat.ML] (https://arxiv.org/abs/2112.02048) DOI: 10.3389/fdata.2022.828666 (https://doi.org/10.3389/fdata.2022.828666)
Graph Neural Networks for Charged Particle Tracking on FPGAs
Authors: Abdelrahman Elabd, Vesal Razavimaleki, Shi-Yu Huang, Javier Duarte, Markus Atkinson, Gage DeZoort, Peter Elmer, Scott Hauck, Jin-Xuan Hu, Shih-Chieh Hsu, Bo-Cheng Lai, Mark Neubauer, Isobel Ojalvo, Savannah Thais, Matthew Trahms
Abstract: The determination of charged particle trajectories in collisions at the CERN Large Hadron Collider (LHC) is an important but challenging problem, especially in the high-interaction-density conditions expected during the future high-luminosity phase of the LHC (HL-LHC). Graph neural networks (GNNs) are a type of geometric deep learning algorithm that has been successfully applied to this task by embedding tracker data as a graph (nodes represent hits, while edges represent possible track segments) and classifying the edges as true or fake track segments. However, their study in hardware- or software-based trigger applications has been limited due to their large computational cost. In this paper, we introduce an automated translation workflow, integrated into a broader tool called hls4ml, for converting GNNs into firmware for field-programmable gate arrays (FPGAs). We use this translation tool to implement GNNs for charged particle tracking, trained using the TrackML challenge dataset, on FPGAs with designs targeting different graph sizes, task complexities, and latency/throughput requirements. This work could enable the inclusion of charged particle tracking GNNs at the trigger level for HL-LHC experiments.
Submitted 23 March, 2022; v1 submitted 3 December, 2021; originally announced December 2021.
Comments: 28 pages, 17 figures, 1 table, published version
Journal ref: Front. Big Data 5 (2022) 828666
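The edge-classification formulation in the abstract (hits as nodes, candidate segments as edges, a network scoring each edge as true or fake) can be sketched with a tiny model. Real tracking GNNs, and the hls4ml firmware, use message passing over the graph; the one-pass edge scorer below only illustrates the problem setup:

```python
import torch
import torch.nn as nn

# Sketch: hits are nodes with coordinate features, candidate track segments
# are edges, and an MLP scores each edge from its endpoint features.

class EdgeClassifier(nn.Module):
    def __init__(self, node_dim=3, hidden=32):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2 * node_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, nodes, edge_index):
        src, dst = edge_index                 # (2, E) endpoint indices
        pair = torch.cat([nodes[src], nodes[dst]], dim=1)
        return self.mlp(pair).squeeze(1)      # logit per candidate segment

nodes = torch.randn(100, 3)                   # 100 hits, e.g. (r, phi, z)
edges = torch.randint(0, 100, (2, 400))       # 400 candidate segments
model = EdgeClassifier()
probs = model(nodes, edges).sigmoid()         # P(edge is a true segment)
print(probs.shape)                            # torch.Size([400])
```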
arXiv:2111.00902 [cs.CV] (https://arxiv.org/abs/2111.00902)
PP-PicoDet: A Better Real-Time Object Detector on Mobile Devices
Authors: Guanghua Yu, Qinyao Chang, Wenyu Lv, Chang Xu, Cheng Cui, Wei Ji, Qingqing Dang, Kaipeng Deng, Guanzhong Wang, Yuning Du, Baohua Lai, Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma
Abstract: A better accuracy-efficiency trade-off has been a challenging problem in object detection. In this work, we are dedicated to studying key optimizations and neural network architecture choices for object detection that improve accuracy and efficiency. We investigate the applicability of the anchor-free strategy to lightweight object detection models. We enhance the backbone structure and design a lightweight neck structure, which improves the feature extraction ability of the network. We also improve the label assignment strategy and loss function to make training more stable and efficient. Through these optimizations, we create a new family of real-time object detectors, named PP-PicoDet, which achieves superior performance on object detection for mobile devices. Our models achieve better trade-offs between accuracy and latency than other popular models. PicoDet-S, with only 0.99M parameters, achieves 30.6% mAP, an absolute 4.8% mAP improvement over YOLOX-Nano while reducing mobile CPU inference latency by 55%, and an absolute 7.1% mAP improvement over NanoDet. It reaches 123 FPS (150 FPS using Paddle Lite) on a mobile ARM CPU with an input size of 320. PicoDet-L, with only 3.3M parameters, achieves 40.9% mAP, an absolute 3.7% mAP improvement over YOLOv5s while being 44% faster. As shown in Figure 1, our models far outperform the state-of-the-art results for lightweight object detection. Code and pre-trained models are available at https://github.com/PaddlePaddle/PaddleDetection.
Submitted 1 November, 2021; originally announced November 2021.
Comments: 9 pages, 3 figures, 5 tables
arXiv:2110.08817 [eess.IV, cs.CV] (https://arxiv.org/abs/2110.08817)
A deep learning pipeline for localization, differentiation, and uncertainty estimation of liver lesions using multi-phasic and multi-sequence MRI
Authors: Peng Wang, Yuhsuan Wu, Bolin Lai, Xiao-Yun Zhou, Le Lu, Wendi Liu, Huabang Zhou, Lingyun Huang, Jing Xiao, Adam P. Harrison, Ningyang Jia, Heping Hu
Abstract: Objectives: to propose a fully-automatic computer-aided diagnosis (CAD) solution for liver lesion characterization, with uncertainty estimation. Methods: we enrolled 400 patients who had either liver resection or a biopsy and were diagnosed with either hepatocellular carcinoma (HCC), intrahepatic cholangiocarcinoma, or secondary metastasis, from 2006 to 2019. Each patient was scanned with T1WI, T2WI, T1WI venous phase (T1WI-V), T1WI arterial phase (T1WI-A), and DWI MRI sequences. We propose a fully-automatic deep CAD pipeline that localizes lesions from 3D MRI studies using key-slice parsing and provides a confidence measure for its diagnoses. We evaluate using five-fold cross-validation and compare performance against three radiologists: a senior hepatology radiologist, a junior hepatology radiologist, and an abdominal radiologist. Results: the proposed CAD solution achieves a mean F1 score of 0.62, outperforming the abdominal radiologist (0.47), matching the junior hepatology radiologist (0.61), and underperforming the senior hepatology radiologist (0.68). The CAD system can informatively assess its diagnostic confidence: when evaluating only on the 70% most confident cases, the mean F1 score and the sensitivity at 80% specificity for HCC vs. others are boosted from 0.62 to 0.71 and from 0.84 to 0.92, respectively. Conclusion: the proposed fully-automatic CAD solution can provide good diagnostic performance with informative confidence assessments in finding and discriminating liver lesions from MRI studies.
Submitted 17 October, 2021; originally announced October 2021.
Comments: 18 pages, 6 figures
arXiv:2104.14629 (https://arxiv.org/abs/2104.14629) [cs.CV, cs.AI, cs.LG]
Scalable Semi-supervised Landmark Localization for X-ray Images using Few-shot Deep Adaptive Graph
Authors: Xiao-Yun Zhou, Bolin Lai, Weijian Li, Yirui Wang, Kang Zheng, Fakai Wang, Chihung Lin, Le Lu, Lingyun Huang, Mei Han, Guotong Xie, Jing Xiao, Kuo Chang-Fu, Adam Harrison, Shun Miao
Abstract: Landmark localization plays an important role in medical image analysis. Learning-based methods, including CNNs and GCNs, have demonstrated state-of-the-art performance. However, most of these methods are fully supervised and rely heavily on manual labeling of a large training dataset. In this paper, we propose a semi-supervised extension of the fully-supervised graph-based method DAG, termed few-shot DAG (i.e., five-shot DAG). It first trains a DAG model on the labeled data and then fine-tunes the pre-trained model on the unlabeled data with a teacher-student semi-supervised learning mechanism. In addition to the semi-supervised loss, we propose another loss based on JS divergence to regularize the consistency of the intermediate feature maps. We extensively evaluated our method on pelvis, hand, and chest landmark detection tasks. Our experimental results demonstrate consistent and significant improvements over previous methods.

Submitted 29 April, 2021; originally announced April 2021.
Comments: 10 pages

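One way to read the JS-divergence consistency term: treat corresponding teacher and student feature maps as spatial distributions and penalize their Jensen-Shannon divergence. A minimal sketch under that assumed reading, not the authors' code:

# Assumed reading of a JS-divergence consistency loss between
# teacher and student intermediate feature maps; illustrative only.
import torch
import torch.nn.functional as F

def js_consistency(student_feat, teacher_feat, eps=1e-8):
    """Feature maps of shape (N, C, H, W) -> scalar JS divergence."""
    p = F.softmax(student_feat.flatten(2), dim=-1)  # spatial distribution
    q = F.softmax(teacher_feat.flatten(2), dim=-1)
    m = 0.5 * (p + q)
    kl = lambda a, b: (a * (a.clamp_min(eps).log() - b.clamp_min(eps).log())).sum(-1)
    return 0.5 * (kl(p, m) + kl(q, m)).mean()
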
arXiv:2103.12972 (https://arxiv.org/abs/2103.12972) [cs.CV]
Hetero-Modal Learning and Expansive Consistency Constraints for Semi-Supervised Detection from Multi-Sequence Data
Authors: Bolin Lai, Yuhsuan Wu, Xiao-Yun Zhou, Peng Wang, Le Lu, Lingyun Huang, Mei Han, Jing Xiao, Heping Hu, Adam P. Harrison
Abstract: Lesion detection serves a critical role in early diagnosis and has been well explored in recent years thanks to methodological advances and increased data availability. However, the high cost of annotation hinders the collection of large, completely labeled datasets, motivating semi-supervised detection approaches. In this paper, we introduce mean teacher hetero-modal detection (MTHD), which addresses two important gaps in current semi-supervised detection. First, it is not obvious how to enforce unlabeled consistency constraints across the very different outputs of various detectors, which has led to various compromises in the state of the art. Using an anchor-free framework, MTHD formulates a mean teacher approach without such compromises, enforcing consistency on the soft outputs of object centers and sizes. Second, multi-sequence data is often critical, e.g., for abdominal lesion detection, but unlabeled data is often missing sequences. To deal with this, MTHD incorporates hetero-modal learning in its framework. Unlike prior art, MTHD can incorporate an expansive set of consistency constraints that include geometric transforms and random sequence combinations. We train and evaluate MTHD on liver lesion detection using the largest MR lesion dataset to date (1099 patients with >5000 volumes). MTHD surpasses the best fully-supervised and semi-supervised competitors by 10.1% and 3.5%, respectively, in average sensitivity.

Submitted 23 March, 2021; originally announced March 2021.
Comments: 13 pages

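The mean-teacher backbone of MTHD follows the standard recipe: the teacher's weights are an exponential moving average (EMA) of the student's, and a consistency loss compares soft outputs. A minimal sketch under those assumptions; the dict-style detector outputs with 'center' and 'size' heads are hypothetical, chosen to match the abstract's description:

# Minimal mean-teacher skeleton: EMA weight update plus consistency
# on soft anchor-free outputs (center heatmap, size map). The output
# dict layout is an assumption for illustration.
import torch
import torch.nn.functional as F

@torch.no_grad()
def ema_update(teacher, student, momentum=0.999):
    """teacher <- momentum * teacher + (1 - momentum) * student."""
    for t, s in zip(teacher.parameters(), student.parameters()):
        t.mul_(momentum).add_(s, alpha=1.0 - momentum)

def consistency_loss(student_out, teacher_out):
    """Penalize disagreement on soft center/size predictions."""
    return (F.mse_loss(student_out["center"], teacher_out["center"])
            + F.mse_loss(student_out["size"], teacher_out["size"]))
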
arXiv:2101.08674 (https://arxiv.org/abs/2101.08674) [cs.CV]
DAF:re: A Challenging, Crowd-Sourced, Large-Scale, Long-Tailed Dataset For Anime Character Recognition
Authors: Edwin Arkel Rios, Wen-Huang Cheng, Bo-Cheng Lai
Abstract: In this work we tackle the challenging problem of anime character recognition, where anime refers to animation produced in Japan and to works derived from or inspired by it. For this purpose we present DAF:re (DanbooruAnimeFaces:revamped), a large-scale, crowd-sourced, long-tailed dataset with almost 500K images spread across more than 3000 classes. Additionally, we conduct experiments on DAF:re and similar datasets using a variety of classification models, including CNN-based ResNets and the self-attention-based Vision Transformer (ViT). Our results give new insights into the generalization and transfer-learning properties of ViT models on datasets from domains substantially different from those used for upstream pre-training, including the influence of batch and image size on their training. Additionally, we share our dataset, source code, pre-trained checkpoints, and results as Animesion, the first end-to-end framework for large-scale anime character recognition: https://github.com/arkel23/animesion

Submitted 21 January, 2021; originally announced January 2021.
Comments: 5 pages, 3 figures, 4 tables
ACM Class: I.2; I.4

arXiv:2101.06175 (https://arxiv.org/abs/2101.06175) [cs.CV]
PaddleSeg: A High-Efficient Development Toolkit for Image Segmentation
Authors: Yi Liu, Lutao Chu, Guowei Chen, Zewu Wu, Zeyu Chen, Baohua Lai, Yuying Hao
Abstract: Image segmentation plays an essential role in computer vision and image processing, with applications ranging from medical diagnosis to autonomous driving, and many segmentation algorithms have been proposed to address specific problems. In recent years, the success of deep learning has strongly influenced a wide range of computer vision areas, and modern deep-learning-based approaches to image segmentation have become prevalent. In this article, we introduce PaddleSeg, an efficient development toolkit for image segmentation. The toolkit aims to help both developers and researchers through the whole process of designing segmentation models, training them, optimizing performance and inference speed, and deploying them. PaddleSeg currently supports around 20 popular segmentation models and more than 50 pre-trained models spanning real-time and high-accuracy configurations. With modular components and backbone networks, users can easily build over one hundred models for different requirements. Furthermore, we provide comprehensive benchmarks and evaluations showing that segmentation algorithms trained with our toolkit achieve competitive accuracy, along with various real industrial applications and practical cases based on PaddleSeg. All code and examples of PaddleSeg are available at https://github.com/PaddlePaddle/PaddleSeg.

Submitted 15 January, 2021; originally announced January 2021.

arXiv:2012.10674 (https://arxiv.org/abs/2012.10674) [cs.CV]
Camera-aware Proxies for Unsupervised Person Re-Identification
Authors: Menglin Wang, Baisheng Lai, Jianqiang Huang, Xiaojin Gong, Xian-Sheng Hua
Abstract: This paper tackles the purely unsupervised person re-identification (Re-ID) problem, which requires no annotations. Some previous methods adopt clustering techniques to generate pseudo labels and use the produced labels to train Re-ID models progressively. These methods are relatively simple but effective. However, most clustering-based methods take each cluster as a pseudo identity class, neglecting the large intra-ID variance caused mainly by changes of camera view. To address this issue, we propose to split each single cluster into multiple proxies, where each proxy represents the instances coming from the same camera. These camera-aware proxies enable us to deal with large intra-ID variance and generate more reliable pseudo labels for learning. Based on the camera-aware proxies, we design both intra- and inter-camera contrastive learning components for our Re-ID model to effectively learn the ID discrimination ability within and across cameras. A proxy-balanced sampling strategy is also designed, which further facilitates learning. Extensive experiments on three large-scale Re-ID datasets show that our proposed approach outperforms most unsupervised methods by a significant margin. In particular, on the challenging MSMT17 dataset, we gain 14.3% Rank-1 and 10.2% mAP improvements over the second-best method. Code is available at: https://github.com/Terminator8758/CAP-master

Submitted 5 February, 2021; v1 submitted 19 December, 2020; originally announced December 2020.
Comments: Accepted to AAAI 2021

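The proxy construction itself is simple bookkeeping: every (cluster, camera) pair becomes its own proxy label. An illustrative sketch of that relabeling step, not the released implementation:

# Relabel samples so each (cluster_id, camera_id) pair is a proxy;
# illustrative sketch, not the authors' released code.
def camera_aware_proxies(cluster_ids, camera_ids):
    proxy_of, labels = {}, []
    for cid, cam in zip(cluster_ids, camera_ids):
        key = (cid, cam)
        if key not in proxy_of:
            proxy_of[key] = len(proxy_of)  # next unused proxy index
        labels.append(proxy_of[key])
    return labels, proxy_of

# A cluster seen by two cameras splits into two proxies:
labels, _ = camera_aware_proxies([0, 0, 1, 0], [1, 2, 1, 1])
assert labels == [0, 1, 2, 0]
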
arXiv:2012.06964 (https://arxiv.org/abs/2012.06964) [cs.CV]
Fully-Automated Liver Tumor Localization and Characterization from Multi-Phase MR Volumes Using Key-Slice ROI Parsing: A Physician-Inspired Approach
Authors: Bolin Lai, Yuhsuan Wu, Xiaoyu Bai, Xiao-Yun Zhou, Peng Wang, Jinzheng Cai, Yuankai Huo, Lingyun Huang, Yong Xia, Jing Xiao, Le Lu, Heping Hu, Adam Harrison
Abstract: Using radiological scans to identify liver tumors is crucial for proper patient treatment. This is highly challenging, as top radiologists only achieve F1 scores of roughly 80% (hepatocellular carcinoma (HCC) vs. others) with only moderate inter-rater agreement, even when using multi-phase magnetic resonance (MR) imagery. Thus, there is great impetus for computer-aided diagnosis (CAD) solutions. A critical challenge is to robustly parse a 3D MR volume to localize diagnosable regions of interest (ROIs), especially for edge cases. In this paper, we break down this problem using a key-slice parser (KSP), which emulates physician workflows by first identifying key slices and then localizing their corresponding key ROIs. To achieve robustness, the KSP also uses curve parsing and detection-confidence re-weighting. We evaluate our approach on the largest multi-phase MR liver lesion test dataset to date (430 biopsy-confirmed patients). Experiments demonstrate that our KSP can localize diagnosable ROIs with high reliability: 87% of patients have an average 3D overlap of >= 40% with the ground truth, compared to only 79% using the best tested detector. When coupled with a classifier, we achieve an HCC vs. others F1 score of 0.801, providing fully-automated CAD performance comparable to top human physicians.

Submitted 9 April, 2021; v1 submitted 13 December, 2020; originally announced December 2020.
Comments: 14 pages

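The localization figure quoted above (the share of patients whose average 3D overlap with ground truth reaches 40%) is easy to compute once per-lesion overlaps are in hand; a sketch with hypothetical inputs:

# Share of patients whose mean 3D overlap with ground truth clears a
# threshold, as in "87% of patients with average overlap >= 40%".
# The input structure is hypothetical.
import numpy as np

def patient_coverage(overlaps_per_patient, threshold=0.40):
    """overlaps_per_patient: iterable of per-lesion overlap arrays."""
    means = np.array([np.mean(o) for o in overlaps_per_patient])
    return float((means >= threshold).mean())
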
arXiv:2012.04265 (https://arxiv.org/abs/2012.04265) [cs.CV]
Learning to Generate Content-Aware Dynamic Detectors
Authors: Junyi Feng, Jiashen Hua, Baisheng Lai, Jianqiang Huang, Xi Li, Xian-sheng Hua
Abstract: Model efficiency is crucial for object detection. Most previous works rely on either hand-crafted design or auto-search methods to obtain a static architecture, regardless of differences among inputs. In this paper, we introduce a new perspective on designing efficient detectors: automatically generating a sample-adaptive model architecture on the fly. The proposed method is named content-aware dynamic detectors (CADDet). It first applies a multi-scale densely connected network with dynamic routing as the supernet. Furthermore, we introduce a coarse-to-fine strategy tailored for object detection to guide the learning of dynamic routing, which contains two metrics: 1) a dynamic global budget constraint assigns data-dependent expected budgets to individual samples; 2) a local path-similarity regularization aims to generate more diverse routing paths. With these, our method achieves higher computational efficiency while maintaining good performance. To the best of our knowledge, CADDet is the first work to introduce a dynamic routing mechanism into object detection. Experiments on the MS-COCO dataset demonstrate that CADDet achieves 1.8 higher mAP with 10% fewer FLOPs compared with a vanilla routing strategy. Compared with models built upon similar building blocks, CADDet achieves a 42% FLOPs reduction with competitive mAP.

Submitted 8 December, 2020; originally announced December 2020.
Comments: 10 pages, 7 figures

arXiv:2012.02782 (https://arxiv.org/abs/2012.02782) [cs.LG, cs.CV]
Batch Group Normalization
Authors: Xiao-Yun Zhou, Jiacheng Sun, Nanyang Ye, Xu Lan, Qijun Luo, Bo-Lin Lai, Pedro Esperanca, Guang-Zhong Yang, Zhenguo Li
Abstract: Deep Convolutional Neural Networks (DCNNs) are hard and time-consuming to train, and normalization is one of the effective solutions. Among previous normalization methods, Batch Normalization (BN) performs well at medium and large batch sizes and generalizes well across multiple vision tasks, but its performance degrades significantly at small batch sizes. In this paper, we find that BN also saturates at extremely large batch sizes, i.e., 128 images per worker (GPU), and propose that the degradation/saturation of BN at small/extremely large batch sizes is caused by noisy/confused statistic calculation. Hence, without adding new trainable parameters, using multi-layer or multi-iteration information, or introducing extra computation, Batch Group Normalization (BGN) is proposed to solve the noisy/confused statistic calculation of BN at small/extremely large batch sizes, using the channel, height, and width dimensions to compensate. The grouping technique of Group Normalization (GN) is used, with a hyper-parameter G controlling the number of feature instances used for statistic calculation, so that the statistics are neither noisy nor confused across batch sizes. We empirically demonstrate that BGN consistently outperforms BN, Instance Normalization (IN), Layer Normalization (LN), GN, and Positional Normalization (PN) across a wide spectrum of vision tasks, including image classification, Neural Architecture Search (NAS), adversarial learning, Few-Shot Learning (FSL), and Unsupervised Domain Adaptation (UDA), indicating its good performance, robustness to batch size, and wide generalizability. For example, when training ResNet-50 on ImageNet with a batch size of 2, BN achieves a Top-1 accuracy of 66.512% while BGN achieves 76.096%, a notable improvement.

Submitted 8 December, 2020; v1 submitted 4 December, 2020; originally announced December 2020.
Comments: 8 pages

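A plausible reading of BGN's statistic calculation: like Group Norm, but the mean and variance for each channel group are pooled over the batch dimension as well, so G tunes how many feature instances feed each estimate. The sketch below is an assumed reconstruction from the abstract, not the authors' code, and omits running statistics for inference.

# Assumed reconstruction of Batch Group Normalization: Group-Norm-style
# channel groups, with statistics pooled over batch + group features.
# Training-time only; running stats for inference are omitted.
import torch
import torch.nn as nn

class BatchGroupNorm(nn.Module):
    def __init__(self, num_channels, groups=8, eps=1e-5):
        super().__init__()
        assert num_channels % groups == 0
        self.groups, self.eps = groups, eps
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))

    def forward(self, x):                        # x: (N, C, H, W)
        n, c, h, w = x.shape
        g = x.reshape(n, self.groups, -1)        # (N, G, C//G * H * W)
        mean = g.mean(dim=(0, 2), keepdim=True)  # pool over batch AND group
        var = g.var(dim=(0, 2), keepdim=True, unbiased=False)
        g = (g - mean) / torch.sqrt(var + self.eps)
        x = g.reshape(n, c, h, w)
        return x * self.weight.view(1, c, 1, 1) + self.bias.view(1, c, 1, 1)

With G = C this behaves like BN per channel; smaller G pools more features per estimate, which is the knob the abstract credits for stability at both extremes of batch size.
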
arXiv:2008.07012 (https://arxiv.org/abs/2008.07012) [cs.CV]
DyStaB: Unsupervised Object Segmentation via Dynamic-Static Bootstrapping
Authors: Yanchao Yang, Brian Lai, Stefano Soatto
Abstract: We describe an unsupervised method to detect and segment portions of images of live scenes that, at some point in time, are seen moving as a coherent whole, which we refer to as objects. Our method first partitions the motion field by minimizing the mutual information between segments. It then uses the segments to learn object models that can be used for detection in a static image. Static and dynamic models are represented by deep neural networks trained jointly in a bootstrapping strategy, which enables extrapolation to previously unseen objects. While the training process requires motion, the resulting object segmentation network can be used on either static images or videos at inference time. As the volume of seen videos grows, more and more objects are seen moving, priming their detection, which then serves as a regularizer for new objects, turning our method into unsupervised continual learning to segment objects. Our models are compared to the state of the art in both video object segmentation and salient object detection. On the six benchmark datasets tested, our models compare favorably even to those using pixel-level supervision, despite requiring no manual annotation.

Submitted 3 April, 2021; v1 submitted 16 August, 2020; originally announced August 2020.
Comments: camera-ready version for CVPR 2021

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.04265v1-abstract-full').style.display = 'none'; document.getElementById('2012.04265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.02782">arXiv:2012.02782</a> <span> [<a href="https://arxiv.org/pdf/2012.02782">pdf</a>, <a href="https://arxiv.org/format/2012.02782">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Batch Group Normalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiao-Yun Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiacheng Sun</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+N">Nanyang Ye</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xu Lan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Q">Qijun Luo</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bo-Lin Lai</a>, <a href="/search/cs?searchtype=author&query=Esperanca%2C+P">Pedro Esperanca</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang-Zhong Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.02782v2-abstract-short" style="display: inline;"> Deep Convolutional Neural Networks (DCNNs) are hard and time-consuming to train. Normalization is one of the effective solutions. Among previous normalization methods, Batch Normalization (BN) performs well at medium and large batch sizes and is with good generalizability to multiple vision tasks, while its performance degrades significantly at small batch sizes. In this paper, we find that BN sat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.02782v2-abstract-full').style.display = 'inline'; document.getElementById('2012.02782v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.02782v2-abstract-full" style="display: none;"> Deep Convolutional Neural Networks (DCNNs) are hard and time-consuming to train. Normalization is one of the effective solutions. Among previous normalization methods, Batch Normalization (BN) performs well at medium and large batch sizes and is with good generalizability to multiple vision tasks, while its performance degrades significantly at small batch sizes. 
arXiv:1910.04814 (https://arxiv.org/abs/1910.04814) [eess.IV, cs.CV, cs.LG]
ErrorNet: Learning error representations from limited data to improve vascular segmentation
Authors: Nima Tajbakhsh, Brian Lai, Shilpa Ananth, Xiaowei Ding
Abstract: Deep convolutional neural networks have proved effective in segmenting lesions and anatomies in various medical imaging modalities. However, in the presence of small sample sizes and domain shift, these models often produce masks with non-intuitive segmentation mistakes. In this paper, we propose a segmentation framework called ErrorNet, which learns to correct these segmentation mistakes through the repeated process of injecting systematic segmentation errors into the segmentation result, based on a learned shape prior, and then attempting to predict the injected error. During inference, ErrorNet corrects the segmentation mistakes by adding the predicted error map to the initial segmentation result. ErrorNet has advantages over alternatives based on domain adaptation or CRF-based post-processing because it requires neither domain-specific parameter tuning nor any data from the target domains. We have evaluated ErrorNet using five public datasets for the task of retinal vessel segmentation. The selected datasets differ in size and patient population, allowing us to evaluate the effectiveness of ErrorNet in handling small sample sizes and domain shift. Our experiments demonstrate that ErrorNet outperforms a base segmentation model, a CRF-based post-processing scheme, and a domain adaptation method, with a greater performance gain in the presence of the aforementioned dataset limitations.

Submitted 1 February, 2020; v1 submitted 10 October, 2019; originally announced October 2019.
Comments: Accepted at ISBI 2019. The supplementary material is only available in the arXiv version of the paper

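The inference-time correction the abstract describes is literally additive; a schematic of that step, with names and value ranges assumed for illustration:

# Schematic of ErrorNet's inference step: the corrected mask is the
# initial segmentation plus the predicted error map, clipped back to
# a valid probability range. Names and ranges are assumptions.
import numpy as np

def correct_segmentation(initial_prob, predicted_error):
    """initial_prob, predicted_error: (H, W) float arrays."""
    return np.clip(initial_prob + predicted_error, 0.0, 1.0)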
href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>