CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 57 results for author: <span class="mathjax">Arbel谩ez, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Arbel谩ez, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Arbel%C3%A1ez%2C+P&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Arbel谩ez, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09593">arXiv:2411.09593</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09593">pdf</a>, <a href="https://arxiv.org/format/2411.09593">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SMILE-UHURA Challenge -- Small Vessel Segmentation at Mesoscopic Scale from Ultra-High Resolution 7T Magnetic Resonance Angiograms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chatterjee%2C+S">Soumick Chatterjee</a>, <a href="/search/cs?searchtype=author&amp;query=Mattern%2C+H">Hendrik Mattern</a>, <a href="/search/cs?searchtype=author&amp;query=D%C3%B6rner%2C+M">Marc D枚rner</a>, <a href="/search/cs?searchtype=author&amp;query=Sciarra%2C+A">Alessandro Sciarra</a>, <a href="/search/cs?searchtype=author&amp;query=Dubost%2C+F">Florian Dubost</a>, <a href="/search/cs?searchtype=author&amp;query=Schnurre%2C+H">Hannes Schnurre</a>, <a href="/search/cs?searchtype=author&amp;query=Khatun%2C+R">Rupali Khatun</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chun-Chih Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+T">Tsung-Lin Hsieh</a>, <a href="/search/cs?searchtype=author&amp;query=Tsai%2C+Y">Yi-Shan Tsai</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yi-Zeng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yung-Ching Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Juinn-Dar Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Marshall Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Siyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ribeiro%2C+F+L">Fernanda L. Ribeiro</a>, <a href="/search/cs?searchtype=author&amp;query=Bollmann%2C+S">Saskia Bollmann</a>, <a href="/search/cs?searchtype=author&amp;query=Chintalapati%2C+K+V">Karthikesh Varma Chintalapati</a>, <a href="/search/cs?searchtype=author&amp;query=Radhakrishna%2C+C+M">Chethan Mysuru Radhakrishna</a>, <a href="/search/cs?searchtype=author&amp;query=Kumara%2C+S+C+H+R">Sri Chandana Hudukula Ram Kumara</a>, <a href="/search/cs?searchtype=author&amp;query=Sutrave%2C+R">Raviteja Sutrave</a>, <a href="/search/cs?searchtype=author&amp;query=Qayyum%2C+A">Abdul Qayyum</a>, <a href="/search/cs?searchtype=author&amp;query=Mazher%2C+M">Moona Mazher</a>, <a href="/search/cs?searchtype=author&amp;query=Razzak%2C+I">Imran Razzak</a>, <a href="/search/cs?searchtype=author&amp;query=Rodero%2C+C">Cristobal Rodero</a> , et al. (23 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09593v1-abstract-short" style="display: inline;"> The human brain receives nutrients and oxygen through an intricate network of blood vessels. Pathology affecting small vessels, at the mesoscopic scale, represents a critical vulnerability within the cerebral blood supply and can lead to severe conditions, such as Cerebral Small Vessel Diseases. The advent of 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution images, maki&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09593v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09593v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09593v1-abstract-full" style="display: none;"> The human brain receives nutrients and oxygen through an intricate network of blood vessels. Pathology affecting small vessels, at the mesoscopic scale, represents a critical vulnerability within the cerebral blood supply and can lead to severe conditions, such as Cerebral Small Vessel Diseases. The advent of 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution images, making it possible to visualise such vessels in the brain. However, the lack of publicly available annotated datasets has impeded the development of robust, machine learning-driven segmentation algorithms. To address this, the SMILE-UHURA challenge was organised. This challenge, held in conjunction with the ISBI 2023, in Cartagena de Indias, Colombia, aimed to provide a platform for researchers working on related topics. The SMILE-UHURA challenge addresses the gap in publicly available annotated datasets by providing an annotated dataset of Time-of-Flight angiography acquired with 7T MRI. This dataset was created through a combination of automated pre-segmentation and extensive manual refinement. In this manuscript, sixteen submitted methods and two baseline methods are compared both quantitatively and qualitatively on two different datasets: held-out test MRAs from the same dataset as the training data (with labels kept secret) and a separate 7T ToF MRA dataset where both input volumes and labels are kept secret. The results demonstrate that most of the submitted deep learning methods, trained on the provided training dataset, achieved reliable segmentation performance. Dice scores reached up to 0.838 $\pm$ 0.066 and 0.716 $\pm$ 0.125 on the respective datasets, with an average performance of up to 0.804 $\pm$ 0.15. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09593v1-abstract-full').style.display = 'none'; document.getElementById('2411.09593v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01184">arXiv:2409.01184</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.01184">pdf</a>, <a href="https://arxiv.org/format/2409.01184">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PitVis-2023 Challenge: Workflow Recognition in videos of Endoscopic Pituitary Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Das%2C+A">Adrito Das</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+D+Z">Danyal Z. Khan</a>, <a href="/search/cs?searchtype=author&amp;query=Psychogyios%2C+D">Dimitrios Psychogyios</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yitong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hanrahan%2C+J+G">John G. Hanrahan</a>, <a href="/search/cs?searchtype=author&amp;query=Vasconcelos%2C+F">Francisco Vasconcelos</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+Y">You Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jinlin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+X">Xiaoyang Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+G">Guoyan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Qayyum%2C+A">Abdul Qayyum</a>, <a href="/search/cs?searchtype=author&amp;query=Mazher%2C+M">Moona Mazher</a>, <a href="/search/cs?searchtype=author&amp;query=Razzak%2C+I">Imran Razzak</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junjun He</a>, <a href="/search/cs?searchtype=author&amp;query=P%C5%82otka%2C+S">Szymon P艂otka</a>, <a href="/search/cs?searchtype=author&amp;query=Kaleta%2C+J">Joanna Kaleta</a>, <a href="/search/cs?searchtype=author&amp;query=Yamlahi%2C+A">Amine Yamlahi</a>, <a href="/search/cs?searchtype=author&amp;query=Jund%2C+A">Antoine Jund</a>, <a href="/search/cs?searchtype=author&amp;query=Godau%2C+P">Patrick Godau</a>, <a href="/search/cs?searchtype=author&amp;query=Kondo%2C+S">Satoshi Kondo</a>, <a href="/search/cs?searchtype=author&amp;query=Kasai%2C+S">Satoshi Kasai</a>, <a href="/search/cs?searchtype=author&amp;query=Hirasawa%2C+K">Kousuke Hirasawa</a> , et al. (7 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01184v1-abstract-short" style="display: inline;"> The field of computer vision applied to videos of minimally invasive surgery is ever-growing. Workflow recognition pertains to the automated recognition of various aspects of a surgery: including which surgical steps are performed; and which surgical instruments are used. This information can later be used to assist clinicians when learning the surgery; during live surgery; and when writing operat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01184v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01184v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01184v1-abstract-full" style="display: none;"> The field of computer vision applied to videos of minimally invasive surgery is ever-growing. Workflow recognition pertains to the automated recognition of various aspects of a surgery: including which surgical steps are performed; and which surgical instruments are used. This information can later be used to assist clinicians when learning the surgery; during live surgery; and when writing operation notes. The Pituitary Vision (PitVis) 2023 Challenge tasks the community to step and instrument recognition in videos of endoscopic pituitary surgery. This is a unique task when compared to other minimally invasive surgeries due to the smaller working space, which limits and distorts vision; and higher frequency of instrument and step switching, which requires more precise model predictions. Participants were provided with 25-videos, with results presented at the MICCAI-2023 conference as part of the Endoscopic Vision 2023 Challenge in Vancouver, Canada, on 08-Oct-2023. There were 18-submissions from 9-teams across 6-countries, using a variety of deep learning models. A commonality between the top performing models was incorporating spatio-temporal and multi-task methods, with greater than 50% and 10% macro-F1-score improvement over purely spacial single-task models in step and instrument recognition respectively. The PitVis-2023 Challenge therefore demonstrates state-of-the-art computer vision models in minimally invasive surgery are transferable to a new dataset, with surgery specific techniques used to enhance performance, progressing the field further. Benchmark results are provided in the paper, and the dataset is publicly available at: https://doi.org/10.5522/04/26531686. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01184v1-abstract-full').style.display = 'none'; document.getElementById('2409.01184v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13135">arXiv:2408.13135</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13135">pdf</a>, <a href="https://arxiv.org/format/2408.13135">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning at the Intersection: Certified Robustness as a Tool for 3D Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=S%2C+G+P">Gabriel P茅rez S</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&amp;query=Zarzar%2C+J">Jes煤s Zarzar</a>, <a href="/search/cs?searchtype=author&amp;query=Rojas%2C+S">Sara Rojas</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13135v1-abstract-short" style="display: inline;"> This paper presents preliminary work on a novel connection between certified robustness in machine learning and the modeling of 3D objects. We highlight an intriguing link between the Maximal Certified Radius (MCR) of a classifier representing a space&#39;s occupancy and the space&#39;s Signed Distance Function (SDF). Leveraging this relationship, we propose to use the certification method of randomized s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13135v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13135v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13135v1-abstract-full" style="display: none;"> This paper presents preliminary work on a novel connection between certified robustness in machine learning and the modeling of 3D objects. We highlight an intriguing link between the Maximal Certified Radius (MCR) of a classifier representing a space&#39;s occupancy and the space&#39;s Signed Distance Function (SDF). Leveraging this relationship, we propose to use the certification method of randomized smoothing (RS) to compute SDFs. Since RS&#39; high computational cost prevents its practical usage as a way to compute SDFs, we propose an algorithm to efficiently run RS in low-dimensional applications, such as 3D space, by expressing RS&#39; fundamental operations as Gaussian smoothing on pre-computed voxel grids. Our approach offers an innovative and practical tool to compute SDFs, validated through proof-of-concept experiments in novel view synthesis. This paper bridges two previously disparate areas of machine learning, opening new avenues for further exploration and potential cross-domain advancements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13135v1-abstract-full').style.display = 'none'; document.getElementById('2408.13135v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is an accepted extended abstract to the LatinX workshop at ICCV 2023. This was uploaded a year late</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17361">arXiv:2407.17361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.17361">pdf</a>, <a href="https://arxiv.org/format/2407.17361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MuST: Multi-Scale Transformers for Surgical Phase Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+A">Alejandra P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Rodr%C3%ADguez%2C+S">Santiago Rodr铆guez</a>, <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">Nicol谩s Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=Aparicio%2C+N">Nicol谩s Aparicio</a>, <a href="/search/cs?searchtype=author&amp;query=Dessevres%2C+E">Eug茅nie Dessevres</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17361v1-abstract-short" style="display: inline;"> Phase recognition in surgical videos is crucial for enhancing computer-aided surgical systems as it enables automated understanding of sequential procedural stages. Existing methods often rely on fixed temporal windows for video analysis to identify dynamic surgical phases. Thus, they struggle to simultaneously capture short-, mid-, and long-term information necessary to fully understand complex s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17361v1-abstract-full').style.display = 'inline'; document.getElementById('2407.17361v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17361v1-abstract-full" style="display: none;"> Phase recognition in surgical videos is crucial for enhancing computer-aided surgical systems as it enables automated understanding of sequential procedural stages. Existing methods often rely on fixed temporal windows for video analysis to identify dynamic surgical phases. Thus, they struggle to simultaneously capture short-, mid-, and long-term information necessary to fully understand complex surgical procedures. To address these issues, we propose Multi-Scale Transformers for Surgical Phase Recognition (MuST), a novel Transformer-based approach that combines a Multi-Term Frame encoder with a Temporal Consistency Module to capture information across multiple temporal scales of a surgical video. Our Multi-Term Frame Encoder computes interdependencies across a hierarchy of temporal scales by sampling sequences at increasing strides around the frame of interest. Furthermore, we employ a long-term Transformer encoder over the frame embeddings to further enhance long-term reasoning. MuST achieves higher performance than previous state-of-the-art methods on three different public benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17361v1-abstract-full').style.display = 'none'; document.getElementById('2407.17361v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13027">arXiv:2407.13027</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13027">pdf</a>, <a href="https://arxiv.org/format/2407.13027">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SpaRED benchmark: Enhancing Gene Expression Prediction from Histology Images with Spatial Transcriptomics Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mejia%2C+G">Gabriel Mejia</a>, <a href="/search/cs?searchtype=author&amp;query=Ruiz%2C+D">Daniela Ruiz</a>, <a href="/search/cs?searchtype=author&amp;query=C%C3%A1rdenas%2C+P">Paula C谩rdenas</a>, <a href="/search/cs?searchtype=author&amp;query=Manrique%2C+L">Leonardo Manrique</a>, <a href="/search/cs?searchtype=author&amp;query=Vega%2C+D">Daniela Vega</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13027v2-abstract-short" style="display: inline;"> Spatial Transcriptomics is a novel technology that aligns histology images with spatially resolved gene expression profiles. Although groundbreaking, it struggles with gene capture yielding high corruption in acquired data. Given potential applications, recent efforts have focused on predicting transcriptomic profiles solely from histology images. However, differences in databases, preprocessing t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13027v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13027v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13027v2-abstract-full" style="display: none;"> Spatial Transcriptomics is a novel technology that aligns histology images with spatially resolved gene expression profiles. Although groundbreaking, it struggles with gene capture yielding high corruption in acquired data. Given potential applications, recent efforts have focused on predicting transcriptomic profiles solely from histology images. However, differences in databases, preprocessing techniques, and training hyperparameters hinder a fair comparison between methods. To address these challenges, we present a systematically curated and processed database collected from 26 public sources, representing an 8.6-fold increase compared to previous works. Additionally, we propose a state-of-the-art transformer based completion technique for inferring missing gene expression, which significantly boosts the performance of transcriptomic profile predictions across all datasets. Altogether, our contributions constitute the most comprehensive benchmark of gene expression prediction from histology images to date and a stepping stone for future research on spatial transcriptomics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13027v2-abstract-full').style.display = 'none'; document.getElementById('2407.13027v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03359">arXiv:2406.03359</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03359">pdf</a>, <a href="https://arxiv.org/format/2406.03359">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-16980-9_13">10.1007/978-3-031-16980-9_13 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SuperFormer: Volumetric Transformer Architectures for MRI Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Forigua%2C+C">Cristhian Forigua</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Maria Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03359v1-abstract-short" style="display: inline;"> This paper presents a novel framework for processing volumetric medical information using Visual Transformers (ViTs). First, We extend the state-of-the-art Swin Transformer model to the 3D medical domain. Second, we propose a new approach for processing volumetric information and encoding position in ViTs for 3D applications. We instantiate the proposed framework and present SuperFormer, a volumet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03359v1-abstract-full').style.display = 'inline'; document.getElementById('2406.03359v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03359v1-abstract-full" style="display: none;"> This paper presents a novel framework for processing volumetric medical information using Visual Transformers (ViTs). First, We extend the state-of-the-art Swin Transformer model to the 3D medical domain. Second, we propose a new approach for processing volumetric information and encoding position in ViTs for 3D applications. We instantiate the proposed framework and present SuperFormer, a volumetric transformer-based approach for Magnetic Resonance Imaging (MRI) Super-Resolution. Our method leverages the 3D information of the MRI domain and uses a local self-attention mechanism with a 3D relative positional encoding to recover anatomical details. In addition, our approach takes advantage of multi-domain information from volume and feature domains and fuses them to reconstruct the High-Resolution MRI. We perform an extensive validation on the Human Connectome Project dataset and demonstrate the superiority of volumetric transformers over 3D CNN-based methods. Our code and pretrained models are available at https://github.com/BCV-Uniandes/SuperFormer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03359v1-abstract-full').style.display = 'none'; document.getElementById('2406.03359v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 7th International Workshop, SASHIMI 2022, Held in Conjunction with MICCAI 2022, Singapore, September 18, 2022, Proceedings </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.12930">arXiv:2405.12930</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.12930">pdf</a>, <a href="https://arxiv.org/format/2405.12930">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pytorch-Wildlife: A Collaborative Deep Learning Framework for Conservation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+A">Andres Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Z">Zhongqi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Vargas%2C+L">Luisa Vargas</a>, <a href="/search/cs?searchtype=author&amp;query=Dodhia%2C+R">Rahul Dodhia</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Ferres%2C+J+M+L">Juan M. Lavista Ferres</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.12930v3-abstract-short" style="display: inline;"> The alarming decline in global biodiversity, driven by various factors, underscores the urgent need for large-scale wildlife monitoring. In response, scientists have turned to automated deep learning methods for data processing in wildlife monitoring. However, applying these advanced methods in real-world scenarios is challenging due to their complexity and the need for specialized knowledge, prim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12930v3-abstract-full').style.display = 'inline'; document.getElementById('2405.12930v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.12930v3-abstract-full" style="display: none;"> The alarming decline in global biodiversity, driven by various factors, underscores the urgent need for large-scale wildlife monitoring. In response, scientists have turned to automated deep learning methods for data processing in wildlife monitoring. However, applying these advanced methods in real-world scenarios is challenging due to their complexity and the need for specialized knowledge, primarily because of technical challenges and interdisciplinary barriers. To address these challenges, we introduce Pytorch-Wildlife, an open-source deep learning platform built on PyTorch. It is designed for creating, modifying, and sharing powerful AI models. This platform emphasizes usability and accessibility, making it accessible to individuals with limited or no technical background. It also offers a modular codebase to simplify feature expansion and further development. Pytorch-Wildlife offers an intuitive, user-friendly interface, accessible through local installation or Hugging Face, for animal detection and classification in images and videos. As two real-world applications, Pytorch-Wildlife has been utilized to train animal classification models for species recognition in the Amazon Rainforest and for invasive opossum recognition in the Galapagos Islands. The Opossum model achieves 98% accuracy, and the Amazon model has 92% recognition accuracy for 36 animals in 90% of the data. As Pytorch-Wildlife evolves, we aim to integrate more conservation tasks, addressing various environmental challenges. Pytorch-Wildlife is available at https://github.com/microsoft/CameraTraps. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12930v3-abstract-full').style.display = 'none'; document.getElementById('2405.12930v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pytorch-Wildlife is available at https://github.com/microsoft/CameraTraps</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.11174">arXiv:2401.11174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.11174">pdf</a>, <a href="https://arxiv.org/format/2401.11174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pixel-Wise Recognition for Holistic Surgical Scene Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">Nicol谩s Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=Rodr%C3%ADguez%2C+S">Santiago Rodr铆guez</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+A">Alejandra P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez%2C+I">Isabela Hern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Aparicio%2C+N">Nicol谩s Aparicio</a>, <a href="/search/cs?searchtype=author&amp;query=Dessevres%2C+E">Eug茅nie Dessevres</a>, <a href="/search/cs?searchtype=author&amp;query=Pe%C3%B1a%2C+S">Sebasti谩n Pe帽a</a>, <a href="/search/cs?searchtype=author&amp;query=Santander%2C+J">Jessica Santander</a>, <a href="/search/cs?searchtype=author&amp;query=Caicedo%2C+J+I">Juan Ignacio Caicedo</a>, <a href="/search/cs?searchtype=author&amp;query=Fern%C3%A1ndez%2C+N">Nicol谩s Fern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.11174v2-abstract-short" style="display: inline;"> This paper presents the Holistic and Multi-Granular Surgical Scene Understanding of Prostatectomies (GraSP) dataset, a curated benchmark that models surgical scene understanding as a hierarchy of complementary tasks with varying levels of granularity. Our approach enables a multi-level comprehension of surgical activities, encompassing long-term tasks such as surgical phases and steps recognition&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.11174v2-abstract-full').style.display = 'inline'; document.getElementById('2401.11174v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.11174v2-abstract-full" style="display: none;"> This paper presents the Holistic and Multi-Granular Surgical Scene Understanding of Prostatectomies (GraSP) dataset, a curated benchmark that models surgical scene understanding as a hierarchy of complementary tasks with varying levels of granularity. Our approach enables a multi-level comprehension of surgical activities, encompassing long-term tasks such as surgical phases and steps recognition and short-term tasks including surgical instrument segmentation and atomic visual actions detection. To exploit our proposed benchmark, we introduce the Transformers for Actions, Phases, Steps, and Instrument Segmentation (TAPIS) model, a general architecture that combines a global video feature extractor with localized region proposals from an instrument segmentation model to tackle the multi-granularity of our benchmark. Through extensive experimentation, we demonstrate the impact of including segmentation annotations in short-term recognition tasks, highlight the varying granularity requirements of each task, and establish TAPIS&#39;s superiority over previously proposed baselines and conventional CNN-based models. Additionally, we validate the robustness of our method across multiple public benchmarks, confirming the reliability and applicability of our dataset. This work represents a significant step forward in Endoscopic Vision, offering a novel and comprehensive framework for future research towards a holistic understanding of surgical procedures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.11174v2-abstract-full').style.display = 'none'; document.getElementById('2401.11174v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint submitted to Medical Image Analysis. Official extension of previous MICCAI 2022 (https://link.springer.com/chapter/10.1007/978-3-031-16449-1_42) and ISBI 2023 (https://ieeexplore.ieee.org/document/10230819) orals. Data and codes are available at https://github.com/BCV-Uniandes/GraSP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00496">arXiv:2401.00496</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.00496">pdf</a>, <a href="https://arxiv.org/format/2401.00496">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SAR-RARP50: Segmentation of surgical instrumentation and Action Recognition on Robot-Assisted Radical Prostatectomy Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Psychogyios%2C+D">Dimitrios Psychogyios</a>, <a href="/search/cs?searchtype=author&amp;query=Colleoni%2C+E">Emanuele Colleoni</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Amsterdam%2C+B">Beatrice Van Amsterdam</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chih-Yang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shu-Yu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuchong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+F">Fucang Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+B">Baosheng Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guotai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Boels%2C+M">Maxence Boels</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a>, <a href="/search/cs?searchtype=author&amp;query=Dasgupta%2C+P">Prokar Dasgupta</a>, <a href="/search/cs?searchtype=author&amp;query=Granados%2C+A">Alejandro Granados</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yanan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+H">Hongliang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yamada%2C+A">Atsushi Yamada</a>, <a href="/search/cs?searchtype=author&amp;query=Harai%2C+Y">Yuriko Harai</a>, <a href="/search/cs?searchtype=author&amp;query=Ishikawa%2C+Y">Yuto Ishikawa</a>, <a href="/search/cs?searchtype=author&amp;query=Hayashi%2C+K">Kazuyuki Hayashi</a> , et al. (25 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00496v2-abstract-short" style="display: inline;"> Surgical tool segmentation and action recognition are fundamental building blocks in many computer-assisted intervention applications, ranging from surgical skills assessment to decision support systems. Nowadays, learning-based action recognition and segmentation approaches outperform classical methods, relying, however, on large, annotated datasets. Furthermore, action recognition and tool segme&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00496v2-abstract-full').style.display = 'inline'; document.getElementById('2401.00496v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00496v2-abstract-full" style="display: none;"> Surgical tool segmentation and action recognition are fundamental building blocks in many computer-assisted intervention applications, ranging from surgical skills assessment to decision support systems. Nowadays, learning-based action recognition and segmentation approaches outperform classical methods, relying, however, on large, annotated datasets. Furthermore, action recognition and tool segmentation algorithms are often trained and make predictions in isolation from each other, without exploiting potential cross-task relationships. With the EndoVis 2022 SAR-RARP50 challenge, we release the first multimodal, publicly available, in-vivo, dataset for surgical action recognition and semantic instrumentation segmentation, containing 50 suturing video segments of Robotic Assisted Radical Prostatectomy (RARP). The aim of the challenge is twofold. First, to enable researchers to leverage the scale of the provided dataset and develop robust and highly accurate single-task action recognition and tool segmentation approaches in the surgical domain. Second, to further explore the potential of multitask-based learning approaches and determine their comparative advantage against their single-task counterparts. A total of 12 teams participated in the challenge, contributing 7 action recognition methods, 9 instrument segmentation techniques, and 4 multitask approaches that integrated both action recognition and instrument segmentation. The complete SAR-RARP50 dataset is available at: https://rdr.ucl.ac.uk/projects/SARRARP50_Segmentation_of_surgical_instrumentation_and_Action_Recognition_on_Robot-Assisted_Radical_Prostatectomy_Challenge/191091 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00496v2-abstract-full').style.display = 'none'; document.getElementById('2401.00496v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.12487">arXiv:2312.12487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.12487">pdf</a>, <a href="https://arxiv.org/format/2312.12487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Guidance: Training-free Acceleration of Conditional Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+A">Angela Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Kohler%2C+J">Jonas Kohler</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+P">Juan Pablo P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Pumarola%2C+A">Albert Pumarola</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.12487v1-abstract-short" style="display: inline;"> This paper presents a comprehensive study on the role of Classifier-Free Guidance (CFG) in text-conditioned diffusion models from the perspective of inference efficiency. In particular, we relax the default choice of applying CFG in all diffusion steps and instead search for efficient guidance policies. We formulate the discovery of such policies in the differentiable Neural Architecture Search fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12487v1-abstract-full').style.display = 'inline'; document.getElementById('2312.12487v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.12487v1-abstract-full" style="display: none;"> This paper presents a comprehensive study on the role of Classifier-Free Guidance (CFG) in text-conditioned diffusion models from the perspective of inference efficiency. In particular, we relax the default choice of applying CFG in all diffusion steps and instead search for efficient guidance policies. We formulate the discovery of such policies in the differentiable Neural Architecture Search framework. Our findings suggest that the denoising steps proposed by CFG become increasingly aligned with simple conditional steps, which renders the extra neural network evaluation of CFG redundant, especially in the second half of the denoising process. Building upon this insight, we propose &#34;Adaptive Guidance&#34; (AG), an efficient variant of CFG, that adaptively omits network evaluations when the denoising process displays convergence. Our experiments demonstrate that AG preserves CFG&#39;s image quality while reducing computation by 25%. Thus, AG constitutes a plug-and-play alternative to Guidance Distillation, achieving 50% of the speed-ups of the latter while being training-free and retaining the capacity to handle negative prompts. Finally, we uncover further redundancies of CFG in the first half of the diffusion process, showing that entire neural function evaluations can be replaced by simple affine transformations of past score estimates. This method, termed LinearAG, offers even cheaper inference at the cost of deviating from the baseline model. Our findings provide insights into the efficiency of the conditional denoising process that contribute to more practical and swift deployment of text-conditioned diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12487v1-abstract-full').style.display = 'none'; document.getElementById('2312.12487v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18259">arXiv:2311.18259</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.18259">pdf</a>, <a href="https://arxiv.org/format/2311.18259">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ego-Exo4D: Understanding Skilled Human Activity from First- and Third-Person Perspectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Grauman%2C+K">Kristen Grauman</a>, <a href="/search/cs?searchtype=author&amp;query=Westbury%2C+A">Andrew Westbury</a>, <a href="/search/cs?searchtype=author&amp;query=Torresani%2C+L">Lorenzo Torresani</a>, <a href="/search/cs?searchtype=author&amp;query=Kitani%2C+K">Kris Kitani</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+J">Jitendra Malik</a>, <a href="/search/cs?searchtype=author&amp;query=Afouras%2C+T">Triantafyllos Afouras</a>, <a href="/search/cs?searchtype=author&amp;query=Ashutosh%2C+K">Kumar Ashutosh</a>, <a href="/search/cs?searchtype=author&amp;query=Baiyya%2C+V">Vijay Baiyya</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+S">Siddhant Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Boote%2C+B">Bikram Boote</a>, <a href="/search/cs?searchtype=author&amp;query=Byrne%2C+E">Eugene Byrne</a>, <a href="/search/cs?searchtype=author&amp;query=Chavis%2C+Z">Zach Chavis</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Joya Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+F">Feng Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+F">Fu-Jen Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Crane%2C+S">Sean Crane</a>, <a href="/search/cs?searchtype=author&amp;query=Dasgupta%2C+A">Avijit Dasgupta</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jing Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Maria Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Forigua%2C+C">Cristhian Forigua</a>, <a href="/search/cs?searchtype=author&amp;query=Gebreselasie%2C+A">Abrham Gebreselasie</a>, <a href="/search/cs?searchtype=author&amp;query=Haresh%2C+S">Sanjay Haresh</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Islam%2C+M+M">Md Mohaiminul Islam</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+S">Suyog Jain</a> , et al. (76 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18259v4-abstract-short" style="display: inline;"> We present Ego-Exo4D, a diverse, large-scale multimodal multiview video dataset and benchmark challenge. Ego-Exo4D centers around simultaneously-captured egocentric and exocentric video of skilled human activities (e.g., sports, music, dance, bike repair). 740 participants from 13 cities worldwide performed these activities in 123 different natural scene contexts, yielding long-form captures from&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18259v4-abstract-full').style.display = 'inline'; document.getElementById('2311.18259v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18259v4-abstract-full" style="display: none;"> We present Ego-Exo4D, a diverse, large-scale multimodal multiview video dataset and benchmark challenge. Ego-Exo4D centers around simultaneously-captured egocentric and exocentric video of skilled human activities (e.g., sports, music, dance, bike repair). 740 participants from 13 cities worldwide performed these activities in 123 different natural scene contexts, yielding long-form captures from 1 to 42 minutes each and 1,286 hours of video combined. The multimodal nature of the dataset is unprecedented: the video is accompanied by multichannel audio, eye gaze, 3D point clouds, camera poses, IMU, and multiple paired language descriptions -- including a novel &#34;expert commentary&#34; done by coaches and teachers and tailored to the skilled-activity domain. To push the frontier of first-person video understanding of skilled human activity, we also present a suite of benchmark tasks and their annotations, including fine-grained activity understanding, proficiency estimation, cross-view translation, and 3D hand/body pose. All resources are open sourced to fuel new research in the community. Project page: http://ego-exo4d-data.org/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18259v4-abstract-full').style.display = 'none'; document.getElementById('2311.18259v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Expanded manuscript (compared to arxiv v1 from Nov 2023 and CVPR 2024 paper from June 2024) for more comprehensive dataset and benchmark presentation, plus new results on v2 data release</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01064">arXiv:2311.01064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.01064">pdf</a>, <a href="https://arxiv.org/format/2311.01064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Foundation Models for Zero-shot Animal Species Recognition in Camera Trap Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fabian%2C+Z">Zalan Fabian</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Z">Zhongqi Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuanhan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez%2C+A">Andr茅s Hern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Montes-Rojas%2C+A">Andr茅s Montes-Rojas</a>, <a href="/search/cs?searchtype=author&amp;query=Escucha%2C+R">Rafael Escucha</a>, <a href="/search/cs?searchtype=author&amp;query=Siabatto%2C+L">Laura Siabatto</a>, <a href="/search/cs?searchtype=author&amp;query=Link%2C+A">Andr茅s Link</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Dodhia%2C+R">Rahul Dodhia</a>, <a href="/search/cs?searchtype=author&amp;query=Ferres%2C+J+L">Juan Lavista Ferres</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.01064v1-abstract-short" style="display: inline;"> Due to deteriorating environmental conditions and increasing human activity, conservation efforts directed towards wildlife is crucial. Motion-activated camera traps constitute an efficient tool for tracking and monitoring wildlife populations across the globe. Supervised learning techniques have been successfully deployed to analyze such imagery, however training such techniques requires annotati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01064v1-abstract-full').style.display = 'inline'; document.getElementById('2311.01064v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.01064v1-abstract-full" style="display: none;"> Due to deteriorating environmental conditions and increasing human activity, conservation efforts directed towards wildlife is crucial. Motion-activated camera traps constitute an efficient tool for tracking and monitoring wildlife populations across the globe. Supervised learning techniques have been successfully deployed to analyze such imagery, however training such techniques requires annotations from experts. Reducing the reliance on costly labelled data therefore has immense potential in developing large-scale wildlife tracking solutions with markedly less human labor. In this work we propose WildMatch, a novel zero-shot species classification framework that leverages multimodal foundation models. In particular, we instruction tune vision-language models to generate detailed visual descriptions of camera trap images using similar terminology to experts. Then, we match the generated caption to an external knowledge base of descriptions in order to determine the species in a zero-shot manner. We investigate techniques to build instruction tuning datasets for detailed animal description generation and propose a novel knowledge augmentation technique to enhance caption quality. We demonstrate the performance of WildMatch on a new camera trap dataset collected in the Magdalena Medio region of Colombia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01064v1-abstract-full').style.display = 'none'; document.getElementById('2311.01064v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.01036">arXiv:2309.01036</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.01036">pdf</a>, <a href="https://arxiv.org/format/2309.01036">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SEPAL: Spatial Gene Expression Prediction from Local Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mejia%2C+G">Gabriel Mejia</a>, <a href="/search/cs?searchtype=author&amp;query=C%C3%A1rdenas%2C+P">Paula C谩rdenas</a>, <a href="/search/cs?searchtype=author&amp;query=Ruiz%2C+D">Daniela Ruiz</a>, <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+A">Angela Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01036v3-abstract-short" style="display: inline;"> Spatial transcriptomics is an emerging technology that aligns histopathology images with spatially resolved gene expression profiling. It holds the potential for understanding many diseases but faces significant bottlenecks such as specialized equipment and domain expertise. In this work, we present SEPAL, a new model for predicting genetic profiles from visual tissue appearance. Our method exploi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01036v3-abstract-full').style.display = 'inline'; document.getElementById('2309.01036v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01036v3-abstract-full" style="display: none;"> Spatial transcriptomics is an emerging technology that aligns histopathology images with spatially resolved gene expression profiling. It holds the potential for understanding many diseases but faces significant bottlenecks such as specialized equipment and domain expertise. In this work, we present SEPAL, a new model for predicting genetic profiles from visual tissue appearance. Our method exploits the biological biases of the problem by directly supervising relative differences with respect to mean expression, and leverages local visual context at every coordinate to make predictions using a graph neural network. This approach closes the gap between complete locality and complete globality in current methods. In addition, we propose a novel benchmark that aims to better define the task by following current best practices in transcriptomics and restricting the prediction variables to only those with clear spatial patterns. Our extensive evaluation in two different human breast cancer datasets indicates that SEPAL outperforms previous state-of-the-art methods and other mechanisms of including spatial context. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01036v3-abstract-full').style.display = 'none'; document.getElementById('2309.01036v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.13183">arXiv:2308.13183</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.13183">pdf</a>, <a href="https://arxiv.org/format/2308.13183">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICCVW60793.2023.00347">10.1109/ICCVW60793.2023.00347 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> STRIDE: Street View-based Environmental Feature Detection and Pedestrian Collision Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">Cristina Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">Nicol谩s Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=Escall%C3%B3n%2C+F">Felipe Escall贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Baldovino-Chiquillo%2C+L">Laura Baldovino-Chiquillo</a>, <a href="/search/cs?searchtype=author&amp;query=Wilches-Mogoll%C3%B3n%2C+M">Maria Wilches-Mogoll贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Pasos%2C+D">Donny Pasos</a>, <a href="/search/cs?searchtype=author&amp;query=Ram%C3%ADrez%2C+N">Nicole Ram铆rez</a>, <a href="/search/cs?searchtype=author&amp;query=Pinz%C3%B3n%2C+J">Jose Pinz贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Sarmiento%2C+O">Olga Sarmiento</a>, <a href="/search/cs?searchtype=author&amp;query=Quistberg%2C+D+A">D Alex Quistberg</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.13183v1-abstract-short" style="display: inline;"> This paper introduces a novel benchmark to study the impact and relationship of built environment elements on pedestrian collision prediction, intending to enhance environmental awareness in autonomous driving systems to prevent pedestrian injuries actively. We introduce a built environment detection task in large-scale panoramic images and a detection-based pedestrian collision frequency predicti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13183v1-abstract-full').style.display = 'inline'; document.getElementById('2308.13183v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.13183v1-abstract-full" style="display: none;"> This paper introduces a novel benchmark to study the impact and relationship of built environment elements on pedestrian collision prediction, intending to enhance environmental awareness in autonomous driving systems to prevent pedestrian injuries actively. We introduce a built environment detection task in large-scale panoramic images and a detection-based pedestrian collision frequency prediction task. We propose a baseline method that incorporates a collision prediction module into a state-of-the-art detection model to tackle both tasks simultaneously. Our experiments demonstrate a significant correlation between object detection of built environment elements and pedestrian collision frequency prediction. Our results are a stepping stone towards understanding the interdependencies between built environment conditions and pedestrian safety. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13183v1-abstract-full').style.display = 'none'; document.getElementById('2308.13183v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2023 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03880">arXiv:2308.03880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.03880">pdf</a>, <a href="https://arxiv.org/format/2308.03880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Guarding the Guardians: Automated Analysis of Online Child Sexual Abuse </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Puentes%2C+J">Juanita Puentes</a>, <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+A">Angela Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Osejo%2C+W">Wilmar Osejo</a>, <a href="/search/cs?searchtype=author&amp;query=Calder%C3%B3n%2C+Y">Yuly Calder贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Quintero%2C+V">Viviana Quintero</a>, <a href="/search/cs?searchtype=author&amp;query=Saldarriaga%2C+L">Lina Saldarriaga</a>, <a href="/search/cs?searchtype=author&amp;query=Agudelo%2C+D">Diana Agudelo</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03880v2-abstract-short" style="display: inline;"> Online violence against children has increased globally recently, demanding urgent attention. Competent authorities manually analyze abuse complaints to comprehend crime dynamics and identify patterns. However, the manual analysis of these complaints presents a challenge because it exposes analysts to harmful content during the review process. Given these challenges, we present a novel solution, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03880v2-abstract-full').style.display = 'inline'; document.getElementById('2308.03880v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03880v2-abstract-full" style="display: none;"> Online violence against children has increased globally recently, demanding urgent attention. Competent authorities manually analyze abuse complaints to comprehend crime dynamics and identify patterns. However, the manual analysis of these complaints presents a challenge because it exposes analysts to harmful content during the review process. Given these challenges, we present a novel solution, an automated tool designed to analyze children&#39;s sexual abuse reports comprehensively. By automating the analysis process, our tool significantly reduces the risk of exposure to harmful content by categorizing the reports on three dimensions: Subject, Degree of Criminality, and Damage. Furthermore, leveraging our multidisciplinary team&#39;s expertise, we introduce a novel approach to annotate the collected data, enabling a more in-depth analysis of the reports. This approach improves the comprehension of fundamental patterns and trends, enabling law enforcement agencies and policymakers to create focused strategies in the fight against children&#39;s violence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03880v2-abstract-full').style.display = 'none'; document.getElementById('2308.03880v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Artificial Intelligence (AI) and Humanitarian Assistance and Disaster Recovery (HADR) workshop, ICCV 2023 in Paris, France</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16606">arXiv:2306.16606</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.16606">pdf</a>, <a href="https://arxiv.org/format/2306.16606">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EgoCOL: Egocentric Camera pose estimation for Open-world 3D object Localization @Ego4D challenge 2023 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Forigua%2C+C">Cristhian Forigua</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Maria Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Maninis%2C+K">Kevis-Kokitsi Maninis</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16606v1-abstract-short" style="display: inline;"> We present EgoCOL, an egocentric camera pose estimation method for open-world 3D object localization. Our method leverages sparse camera pose reconstructions in a two-fold manner, video and scan independently, to estimate the camera pose of egocentric frames in 3D renders with high recall and precision. We extensively evaluate our method on the Visual Query (VQ) 3D object localization Ego4D benchm&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16606v1-abstract-full').style.display = 'inline'; document.getElementById('2306.16606v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16606v1-abstract-full" style="display: none;"> We present EgoCOL, an egocentric camera pose estimation method for open-world 3D object localization. Our method leverages sparse camera pose reconstructions in a two-fold manner, video and scan independently, to estimate the camera pose of egocentric frames in 3D renders with high recall and precision. We extensively evaluate our method on the Visual Query (VQ) 3D object localization Ego4D benchmark. EgoCOL can estimate 62% and 59% more camera poses than the Ego4D baseline in the Ego4D Visual Queries 3D Localization challenge at CVPR 2023 in the val and test sets, respectively. Our code is publicly available at https://github.com/BCV-Uniandes/EgoCOL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16606v1-abstract-full').style.display = 'none'; document.getElementById('2306.16606v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.11118">arXiv:2304.11118</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.11118">pdf</a>, <a href="https://arxiv.org/format/2304.11118">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BoDiffusion: Diffusing Sparse Observations for Full-Body Human Motion Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+A">Angela Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Maria Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Jeanneret%2C+G">Guillaume Jeanneret</a>, <a href="/search/cs?searchtype=author&amp;query=Pumarola%2C+A">Albert Pumarola</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Sanakoyeu%2C+A">Artsiom Sanakoyeu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.11118v1-abstract-short" style="display: inline;"> Mixed reality applications require tracking the user&#39;s full-body motion to enable an immersive experience. However, typical head-mounted devices can only track head and hand movements, leading to a limited reconstruction of full-body motion due to variability in lower body configurations. We propose BoDiffusion -- a generative diffusion model for motion synthesis to tackle this under-constrained r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.11118v1-abstract-full').style.display = 'inline'; document.getElementById('2304.11118v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.11118v1-abstract-full" style="display: none;"> Mixed reality applications require tracking the user&#39;s full-body motion to enable an immersive experience. However, typical head-mounted devices can only track head and hand movements, leading to a limited reconstruction of full-body motion due to variability in lower body configurations. We propose BoDiffusion -- a generative diffusion model for motion synthesis to tackle this under-constrained reconstruction problem. We present a time and space conditioning scheme that allows BoDiffusion to leverage sparse tracking inputs while generating smooth and realistic full-body motion sequences. To the best of our knowledge, this is the first approach that uses the reverse diffusion process to model full-body tracking as a conditional sequence generation task. We conduct experiments on the large-scale motion-capture dataset AMASS and show that our approach outperforms the state-of-the-art approaches by a significant margin in terms of full-body motion realism and joint reconstruction error. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.11118v1-abstract-full').style.display = 'none'; document.getElementById('2304.11118v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.07744">arXiv:2304.07744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.07744">pdf</a>, <a href="https://arxiv.org/format/2304.07744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> JoB-VS: Joint Brain-Vessel Segmentation in TOF-MRA Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Valderrama%2C+N">Natalia Valderrama</a>, <a href="/search/cs?searchtype=author&amp;query=Pitsiorlas%2C+I">Ioannis Pitsiorlas</a>, <a href="/search/cs?searchtype=author&amp;query=Vargas%2C+L">Luisa Vargas</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Zuluaga%2C+M+A">Maria A. Zuluaga</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.07744v1-abstract-short" style="display: inline;"> We propose the first joint-task learning framework for brain and vessel segmentation (JoB-VS) from Time-of-Flight Magnetic Resonance images. Unlike state-of-the-art vessel segmentation methods, our approach avoids the pre-processing step of implementing a model to extract the brain from the volumetric input data. Skipping this additional step makes our method an end-to-end vessel segmentation fram&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.07744v1-abstract-full').style.display = 'inline'; document.getElementById('2304.07744v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.07744v1-abstract-full" style="display: none;"> We propose the first joint-task learning framework for brain and vessel segmentation (JoB-VS) from Time-of-Flight Magnetic Resonance images. Unlike state-of-the-art vessel segmentation methods, our approach avoids the pre-processing step of implementing a model to extract the brain from the volumetric input data. Skipping this additional step makes our method an end-to-end vessel segmentation framework. JoB-VS uses a lattice architecture that favors the segmentation of structures of different scales (e.g., the brain and vessels). Its segmentation head allows the simultaneous prediction of the brain and vessel mask. Moreover, we generate data augmentation with adversarial examples, which our results demonstrate to enhance the performance. JoB-VS achieves 70.03% mean AP and 69.09% F1-score in the OASIS-3 dataset and is capable of generalizing the segmentation in the IXI dataset. These results show the adequacy of JoB-VS for the challenging task of vessel segmentation in complete TOF-MRA images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.07744v1-abstract-full').style.display = 'none'; document.getElementById('2304.07744v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.09514">arXiv:2303.09514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.09514">pdf</a>, <a href="https://arxiv.org/format/2303.09514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ISBI53787.2023.10230819">10.1109/ISBI53787.2023.10230819 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MATIS: Masked-Attention Transformers for Surgical Instrument Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">Nicol谩s Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez-Rond%C3%B3n%2C+A">Alejandra P茅rez-Rond贸n</a>, <a href="/search/cs?searchtype=author&amp;query=Rodr%C3%ADguez%2C+S">Santiago Rodr铆guez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.09514v4-abstract-short" style="display: inline;"> We propose Masked-Attention Transformers for Surgical Instrument Segmentation (MATIS), a two-stage, fully transformer-based method that leverages modern pixel-wise attention mechanisms for instrument segmentation. MATIS exploits the instance-level nature of the task by employing a masked attention module that generates and classifies a set of fine instrument region proposals. Our method incorporat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09514v4-abstract-full').style.display = 'inline'; document.getElementById('2303.09514v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.09514v4-abstract-full" style="display: none;"> We propose Masked-Attention Transformers for Surgical Instrument Segmentation (MATIS), a two-stage, fully transformer-based method that leverages modern pixel-wise attention mechanisms for instrument segmentation. MATIS exploits the instance-level nature of the task by employing a masked attention module that generates and classifies a set of fine instrument region proposals. Our method incorporates long-term video-level information through video transformers to improve temporal consistency and enhance mask classification. We validate our approach in the two standard public benchmarks, Endovis 2017 and Endovis 2018. Our experiments demonstrate that MATIS&#39; per-frame baseline outperforms previous state-of-the-art methods and that including our temporal consistency module boosts our model&#39;s performance further. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09514v4-abstract-full').style.display = 'none'; document.getElementById('2303.09514v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISBI 2023 (Oral). Winning method of the 2022 SAR-RARP50 Challenge (arXiv:2401.00496). Official extension published at arXiv:2401.11174 . Code available at https://github.com/BCV-Uniandes/MATIS</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2023 IEEE 20th International Symposium on Biomedical Imaging (ISBI), 10230819 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.04582">arXiv:2212.04582</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.04582">pdf</a>, <a href="https://arxiv.org/format/2212.04582">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-16449-1_42">10.1007/978-3-031-16449-1_42 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Towards Holistic Surgical Scene Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Valderrama%2C+N">Natalia Valderrama</a>, <a href="/search/cs?searchtype=author&amp;query=Puentes%2C+P+R">Paola Ruiz Puentes</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez%2C+I">Isabela Hern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">Nicol谩s Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=Verlyk%2C+M">Mathilde Verlyk</a>, <a href="/search/cs?searchtype=author&amp;query=Santander%2C+J">Jessica Santander</a>, <a href="/search/cs?searchtype=author&amp;query=Caicedo%2C+J">Juan Caicedo</a>, <a href="/search/cs?searchtype=author&amp;query=Fern%C3%A1ndez%2C+N">Nicol谩s Fern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.04582v4-abstract-short" style="display: inline;"> Most benchmarks for studying surgical interventions focus on a specific challenge instead of leveraging the intrinsic complementarity among different tasks. In this work, we present a new experimental framework towards holistic surgical scene understanding. First, we introduce the Phase, Step, Instrument, and Atomic Visual Action recognition (PSI-AVA) Dataset. PSI-AVA includes annotations for both&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04582v4-abstract-full').style.display = 'inline'; document.getElementById('2212.04582v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.04582v4-abstract-full" style="display: none;"> Most benchmarks for studying surgical interventions focus on a specific challenge instead of leveraging the intrinsic complementarity among different tasks. In this work, we present a new experimental framework towards holistic surgical scene understanding. First, we introduce the Phase, Step, Instrument, and Atomic Visual Action recognition (PSI-AVA) Dataset. PSI-AVA includes annotations for both long-term (Phase and Step recognition) and short-term reasoning (Instrument detection and novel Atomic Action recognition) in robot-assisted radical prostatectomy videos. Second, we present Transformers for Action, Phase, Instrument, and steps Recognition (TAPIR) as a strong baseline for surgical scene understanding. TAPIR leverages our dataset&#39;s multi-level annotations as it benefits from the learned representation on the instrument detection task to improve its classification capacity. Our experimental results in both PSI-AVA and other publicly available databases demonstrate the adequacy of our framework to spur future research on holistic surgical scene understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04582v4-abstract-full').style.display = 'none'; document.getElementById('2212.04582v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2022 Oral. Official extension published at arXiv:2401.11174 . Data and codes available at https://github.com/BCV-Uniandes/TAPIR</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Medical Image Computing and Computer Assisted Intervention 2022, </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.11329">arXiv:2207.11329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.11329">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Video Swin Transformers for Egocentric Video Understanding @ Ego4D Challenges 2022 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Maria Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Daza%2C+L">Laura Daza</a>, <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">Cristina Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.11329v1-abstract-short" style="display: inline;"> We implemented Video Swin Transformer as a base architecture for the tasks of Point-of-No-Return temporal localization and Object State Change Classification. Our method achieved competitive performance on both challenges. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.11329v1-abstract-full" style="display: none;"> We implemented Video Swin Transformer as a base architecture for the tasks of Point-of-No-Return temporal localization and Object State Change Classification. Our method achieved competitive performance on both challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.11329v1-abstract-full').style.display = 'none'; document.getElementById('2207.11329v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.04978">arXiv:2202.04978</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.04978">pdf</a>, <a href="https://arxiv.org/format/2202.04978">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Assessing and Characterizing the Semantic Robustness of Face Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.04978v1-abstract-short" style="display: inline;"> Deep Neural Networks (DNNs) lack robustness against imperceptible perturbations to their input. Face Recognition Models (FRMs) based on DNNs inherit this vulnerability. We propose a methodology for assessing and characterizing the robustness of FRMs against semantic perturbations to their input. Our methodology causes FRMs to malfunction by designing adversarial attacks that search for identity-pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.04978v1-abstract-full').style.display = 'inline'; document.getElementById('2202.04978v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.04978v1-abstract-full" style="display: none;"> Deep Neural Networks (DNNs) lack robustness against imperceptible perturbations to their input. Face Recognition Models (FRMs) based on DNNs inherit this vulnerability. We propose a methodology for assessing and characterizing the robustness of FRMs against semantic perturbations to their input. Our methodology causes FRMs to malfunction by designing adversarial attacks that search for identity-preserving modifications to faces. In particular, given a face, our attacks find identity-preserving variants of the face such that an FRM fails to recognize the images belonging to the same identity. We model these identity-preserving semantic modifications via direction- and magnitude-constrained perturbations in the latent space of StyleGAN. We further propose to characterize the semantic robustness of an FRM by statistically describing the perturbations that induce the FRM to malfunction. Finally, we combine our methodology with a certification technique, thus providing (i) theoretical guarantees on the performance of an FRM, and (ii) a formal description of how an FRM may model the notion of face identity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.04978v1-abstract-full').style.display = 'none'; document.getElementById('2202.04978v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 18 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.10074">arXiv:2112.10074</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.10074">pdf</a>, <a href="https://arxiv.org/format/2112.10074">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.59275/j.melba.2022-354b">10.59275/j.melba.2022-354b <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> QU-BraTS: MICCAI BraTS 2020 Challenge on Quantifying Uncertainty in Brain Tumor Segmentation - Analysis of Ranking Scores and Benchmarking Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+R">Raghav Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Filos%2C+A">Angelos Filos</a>, <a href="/search/cs?searchtype=author&amp;query=Baid%2C+U">Ujjwal Baid</a>, <a href="/search/cs?searchtype=author&amp;query=Sako%2C+C">Chiharu Sako</a>, <a href="/search/cs?searchtype=author&amp;query=McKinley%2C+R">Richard McKinley</a>, <a href="/search/cs?searchtype=author&amp;query=Rebsamen%2C+M">Michael Rebsamen</a>, <a href="/search/cs?searchtype=author&amp;query=Datwyler%2C+K">Katrin Datwyler</a>, <a href="/search/cs?searchtype=author&amp;query=Meier%2C+R">Raphael Meier</a>, <a href="/search/cs?searchtype=author&amp;query=Radojewski%2C+P">Piotr Radojewski</a>, <a href="/search/cs?searchtype=author&amp;query=Murugesan%2C+G+K">Gowtham Krishnan Murugesan</a>, <a href="/search/cs?searchtype=author&amp;query=Nalawade%2C+S">Sahil Nalawade</a>, <a href="/search/cs?searchtype=author&amp;query=Ganesh%2C+C">Chandan Ganesh</a>, <a href="/search/cs?searchtype=author&amp;query=Wagner%2C+B">Ben Wagner</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F+F">Fang F. Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+B">Baowei Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Madhuranthakam%2C+A+J">Ananth J. Madhuranthakam</a>, <a href="/search/cs?searchtype=author&amp;query=Maldjian%2C+J+A">Joseph A. Maldjian</a>, <a href="/search/cs?searchtype=author&amp;query=Daza%2C+L">Laura Daza</a>, <a href="/search/cs?searchtype=author&amp;query=Gomez%2C+C">Catalina Gomez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+C">Chengliang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Reynaud%2C+H">Hadrien Reynaud</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+Y">Yuan-han Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Angelini%2C+E">Elsa Angelini</a> , et al. (67 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.10074v2-abstract-short" style="display: inline;"> Deep learning (DL) models have provided state-of-the-art performance in various medical imaging benchmarking challenges, including the Brain Tumor Segmentation (BraTS) challenges. However, the task of focal pathology multi-compartment segmentation (e.g., tumor and lesion sub-regions) is particularly challenging, and potential errors hinder translating DL models into clinical workflows. Quantifying&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10074v2-abstract-full').style.display = 'inline'; document.getElementById('2112.10074v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.10074v2-abstract-full" style="display: none;"> Deep learning (DL) models have provided state-of-the-art performance in various medical imaging benchmarking challenges, including the Brain Tumor Segmentation (BraTS) challenges. However, the task of focal pathology multi-compartment segmentation (e.g., tumor and lesion sub-regions) is particularly challenging, and potential errors hinder translating DL models into clinical workflows. Quantifying the reliability of DL model predictions in the form of uncertainties could enable clinical review of the most uncertain regions, thereby building trust and paving the way toward clinical translation. Several uncertainty estimation methods have recently been introduced for DL medical image segmentation tasks. Developing scores to evaluate and compare the performance of uncertainty measures will assist the end-user in making more informed decisions. In this study, we explore and evaluate a score developed during the BraTS 2019 and BraTS 2020 task on uncertainty quantification (QU-BraTS) and designed to assess and rank uncertainty estimates for brain tumor multi-compartment segmentation. This score (1) rewards uncertainty estimates that produce high confidence in correct assertions and those that assign low confidence levels at incorrect assertions, and (2) penalizes uncertainty measures that lead to a higher percentage of under-confident correct assertions. We further benchmark the segmentation uncertainties generated by 14 independent participating teams of QU-BraTS 2020, all of which also participated in the main BraTS segmentation task. Overall, our findings confirm the importance and complementary value that uncertainty estimates provide to segmentation algorithms, highlighting the need for uncertainty quantification in medical image analyses. Finally, in favor of transparency and reproducibility, our evaluation code is made publicly available at: https://github.com/RagMeh11/QU-BraTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10074v2-abstract-full').style.display = 'none'; document.getElementById('2112.10074v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the Journal of Machine Learning for Biomedical Imaging (MELBA): https://www.melba-journal.org/papers/2022:026.html</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Machine.Learning.for.Biomedical.Imaging. 1 (2022) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.07058">arXiv:2110.07058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.07058">pdf</a>, <a href="https://arxiv.org/format/2110.07058">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ego4D: Around the World in 3,000 Hours of Egocentric Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Grauman%2C+K">Kristen Grauman</a>, <a href="/search/cs?searchtype=author&amp;query=Westbury%2C+A">Andrew Westbury</a>, <a href="/search/cs?searchtype=author&amp;query=Byrne%2C+E">Eugene Byrne</a>, <a href="/search/cs?searchtype=author&amp;query=Chavis%2C+Z">Zachary Chavis</a>, <a href="/search/cs?searchtype=author&amp;query=Furnari%2C+A">Antonino Furnari</a>, <a href="/search/cs?searchtype=author&amp;query=Girdhar%2C+R">Rohit Girdhar</a>, <a href="/search/cs?searchtype=author&amp;query=Hamburger%2C+J">Jackson Hamburger</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Miao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xingyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Martin%2C+M">Miguel Martin</a>, <a href="/search/cs?searchtype=author&amp;query=Nagarajan%2C+T">Tushar Nagarajan</a>, <a href="/search/cs?searchtype=author&amp;query=Radosavovic%2C+I">Ilija Radosavovic</a>, <a href="/search/cs?searchtype=author&amp;query=Ramakrishnan%2C+S+K">Santhosh Kumar Ramakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Ryan%2C+F">Fiona Ryan</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+J">Jayant Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Wray%2C+M">Michael Wray</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengmeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+E+Z">Eric Zhongcong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+S">Siddhant Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Batra%2C+D">Dhruv Batra</a>, <a href="/search/cs?searchtype=author&amp;query=Cartillier%2C+V">Vincent Cartillier</a>, <a href="/search/cs?searchtype=author&amp;query=Crane%2C+S">Sean Crane</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+T">Tien Do</a> , et al. (60 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.07058v3-abstract-short" style="display: inline;"> We introduce Ego4D, a massive-scale egocentric video dataset and benchmark suite. It offers 3,670 hours of daily-life activity video spanning hundreds of scenarios (household, outdoor, workplace, leisure, etc.) captured by 931 unique camera wearers from 74 worldwide locations and 9 different countries. The approach to collection is designed to uphold rigorous privacy and ethics standards with cons&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07058v3-abstract-full').style.display = 'inline'; document.getElementById('2110.07058v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.07058v3-abstract-full" style="display: none;"> We introduce Ego4D, a massive-scale egocentric video dataset and benchmark suite. It offers 3,670 hours of daily-life activity video spanning hundreds of scenarios (household, outdoor, workplace, leisure, etc.) captured by 931 unique camera wearers from 74 worldwide locations and 9 different countries. The approach to collection is designed to uphold rigorous privacy and ethics standards with consenting participants and robust de-identification procedures where relevant. Ego4D dramatically expands the volume of diverse egocentric video footage publicly available to the research community. Portions of the video are accompanied by audio, 3D meshes of the environment, eye gaze, stereo, and/or synchronized videos from multiple egocentric cameras at the same event. Furthermore, we present a host of new benchmark challenges centered around understanding the first-person visual experience in the past (querying an episodic memory), present (analyzing hand-object manipulation, audio-visual conversation, and social interactions), and future (forecasting activities). By publicly sharing this massive annotated dataset and benchmark suite, we aim to push the frontier of first-person perception. Project page: https://ego4d-data.org/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07058v3-abstract-full').style.display = 'none'; document.getElementById('2110.07058v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022. This version updates the baseline result numbers for the Hands and Objects benchmark (appendix)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.04988">arXiv:2109.04988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.04988">pdf</a>, <a href="https://arxiv.org/format/2109.04988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Panoptic Narrative Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">C. Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Ayobi%2C+N">N. Ayobi</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez%2C+I">I. Hern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez%2C+J">J. Hern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">J. Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">P. Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.04988v1-abstract-short" style="display: inline;"> This paper proposes Panoptic Narrative Grounding, a spatially fine and general formulation of the natural language visual grounding problem. We establish an experimental framework for the study of this new task, including new ground truth and metrics, and we propose a strong baseline method to serve as stepping stone for future work. We exploit the intrinsic semantic richness in an image by includ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04988v1-abstract-full').style.display = 'inline'; document.getElementById('2109.04988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.04988v1-abstract-full" style="display: none;"> This paper proposes Panoptic Narrative Grounding, a spatially fine and general formulation of the natural language visual grounding problem. We establish an experimental framework for the study of this new task, including new ground truth and metrics, and we propose a strong baseline method to serve as stepping stone for future work. We exploit the intrinsic semantic richness in an image by including panoptic categories, and we approach visual grounding at a fine-grained level by using segmentations. In terms of ground truth, we propose an algorithm to automatically transfer Localized Narratives annotations to specific regions in the panoptic segmentations of the MS COCO dataset. To guarantee the quality of our annotations, we take advantage of the semantic structure contained in WordNet to exclusively incorporate noun phrases that are grounded to a meaningfully related panoptic segmentation region. The proposed baseline achieves a performance of 55.4 absolute Average Recall points. This result is a suitable foundation to push the envelope further in the development of methods for Panoptic Narrative Grounding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04988v1-abstract-full').style.display = 'none'; document.getElementById('2109.04988v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, to appear at ICCV 2021 (Oral presentation)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.11785">arXiv:2108.11785</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.11785">pdf</a>, <a href="https://arxiv.org/format/2108.11785">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Hierarchical Assessment of Adversarial Severity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jeanneret%2C+G">Guillaume Jeanneret</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+J+C">Juan C Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.11785v1-abstract-short" style="display: inline;"> Adversarial Robustness is a growing field that evidences the brittleness of neural networks. Although the literature on adversarial robustness is vast, a dimension is missing in these studies: assessing how severe the mistakes are. We call this notion &#34;Adversarial Severity&#34; since it quantifies the downstream impact of adversarial corruptions by computing the semantic error between the misclassific&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.11785v1-abstract-full').style.display = 'inline'; document.getElementById('2108.11785v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.11785v1-abstract-full" style="display: none;"> Adversarial Robustness is a growing field that evidences the brittleness of neural networks. Although the literature on adversarial robustness is vast, a dimension is missing in these studies: assessing how severe the mistakes are. We call this notion &#34;Adversarial Severity&#34; since it quantifies the downstream impact of adversarial corruptions by computing the semantic error between the misclassification and the proper label. We propose to study the effects of adversarial noise by measuring the Robustness and Severity into a large-scale dataset: iNaturalist-H. Our contributions are: (i) we introduce novel Hierarchical Attacks that harness the rich structured space of labels to create adversarial examples. (ii) These attacks allow us to benchmark the Adversarial Robustness and Severity of classification models. (iii) We enhance the traditional adversarial training with a simple yet effective Hierarchical Curriculum Training to learn these nodes gradually within the hierarchical tree. We perform extensive experiments showing that hierarchical defenses allow deep models to boost the adversarial Robustness by 1.85% and reduce the severity of all attacks by 0.17, on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.11785v1-abstract-full').style.display = 'none'; document.getElementById('2108.11785v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear on the ICCV2021 Workshop on Adversarial Robustness in the Real World</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.11505">arXiv:2108.11505</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.11505">pdf</a>, <a href="https://arxiv.org/format/2108.11505">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalized Real-World Super-Resolution through Adversarial Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+A">Angela Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Mar铆a Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Romero%2C+A">Andr茅s Romero</a>, <a href="/search/cs?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.11505v1-abstract-short" style="display: inline;"> Real-world Super-Resolution (SR) has been traditionally tackled by first learning a specific degradation model that resembles the noise and corruption artifacts in low-resolution imagery. Thus, current methods lack generalization and lose their accuracy when tested on unseen types of corruption. In contrast to the traditional proposal, we present Robust Super-Resolution (RSR), a method that levera&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.11505v1-abstract-full').style.display = 'inline'; document.getElementById('2108.11505v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.11505v1-abstract-full" style="display: none;"> Real-world Super-Resolution (SR) has been traditionally tackled by first learning a specific degradation model that resembles the noise and corruption artifacts in low-resolution imagery. Thus, current methods lack generalization and lose their accuracy when tested on unseen types of corruption. In contrast to the traditional proposal, we present Robust Super-Resolution (RSR), a method that leverages the generalization capability of adversarial attacks to tackle real-world SR. Our novel framework poses a paradigm shift in the development of real-world SR methods. Instead of learning a dataset-specific degradation, we employ adversarial attacks to create difficult examples that target the model&#39;s weaknesses. Afterward, we use these adversarial examples during training to improve our model&#39;s capacity to process noisy inputs. We perform extensive experimentation on synthetic and real-world images and empirically demonstrate that our RSR method generalizes well across datasets without re-training for specific noise priors. By using a single robust model, we outperform state-of-the-art specialized methods on real-world benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.11505v1-abstract-full').style.display = 'none'; document.getElementById('2108.11505v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV Workshops, 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.14110">arXiv:2107.14110</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.14110">pdf</a>, <a href="https://arxiv.org/format/2107.14110">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Adversarial Robustness via Test-time Transformation Ensembling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&amp;query=Jeanneret%2C+G">Guillaume Jeanneret</a>, <a href="/search/cs?searchtype=author&amp;query=Rueda%2C+L">Laura Rueda</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.14110v1-abstract-short" style="display: inline;"> Deep learning models are prone to being fooled by imperceptible perturbations known as adversarial attacks. In this work, we study how equipping models with Test-time Transformation Ensembling (TTE) can work as a reliable defense against such attacks. While transforming the input data, both at train and test times, is known to enhance model performance, its effects on adversarial robustness have n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14110v1-abstract-full').style.display = 'inline'; document.getElementById('2107.14110v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.14110v1-abstract-full" style="display: none;"> Deep learning models are prone to being fooled by imperceptible perturbations known as adversarial attacks. In this work, we study how equipping models with Test-time Transformation Ensembling (TTE) can work as a reliable defense against such attacks. While transforming the input data, both at train and test times, is known to enhance model performance, its effects on adversarial robustness have not been studied. Here, we present a comprehensive empirical study of the impact of TTE, in the form of widely-used image transforms, on adversarial robustness. We show that TTE consistently improves model robustness against a variety of powerful attacks without any need for re-training, and that this improvement comes at virtually no trade-off with accuracy on clean samples. Finally, we show that the benefits of TTE transfer even to the certified robustness domain, in which TTE provides sizable and consistent improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14110v1-abstract-full').style.display = 'none'; document.getElementById('2107.14110v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.04263">arXiv:2107.04263</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.04263">pdf</a>, <a href="https://arxiv.org/ps/2107.04263">ps</a>, <a href="https://arxiv.org/format/2107.04263">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust General Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Daza%2C+L">Laura Daza</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.04263v1-abstract-short" style="display: inline;"> The reliability of Deep Learning systems depends on their accuracy but also on their robustness against adversarial perturbations to the input data. Several attacks and defenses have been proposed to improve the performance of Deep Neural Networks under the presence of adversarial noise in the natural image domain. However, robustness in computer-aided diagnosis for volumetric data has only been e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04263v1-abstract-full').style.display = 'inline'; document.getElementById('2107.04263v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.04263v1-abstract-full" style="display: none;"> The reliability of Deep Learning systems depends on their accuracy but also on their robustness against adversarial perturbations to the input data. Several attacks and defenses have been proposed to improve the performance of Deep Neural Networks under the presence of adversarial noise in the natural image domain. However, robustness in computer-aided diagnosis for volumetric data has only been explored for specific tasks and with limited attacks. We propose a new framework to assess the robustness of general medical image segmentation systems. Our contributions are two-fold: (i) we propose a new benchmark to evaluate robustness in the context of the Medical Segmentation Decathlon (MSD) by extending the recent AutoAttack natural image classification framework to the domain of volumetric data segmentation, and (ii) we present a novel lattice architecture for RObust Generic medical image segmentation (ROG). Our results show that ROG is capable of generalizing across different tasks of the MSD and largely surpasses the state-of-the-art under sophisticated adversarial attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04263v1-abstract-full').style.display = 'none'; document.getElementById('2107.04263v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.05735">arXiv:2106.05735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.05735">pdf</a>, <a href="https://arxiv.org/format/2106.05735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s41467-022-30695-9">10.1038/s41467-022-30695-9 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The Medical Segmentation Decathlon </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Antonelli%2C+M">Michela Antonelli</a>, <a href="/search/cs?searchtype=author&amp;query=Reinke%2C+A">Annika Reinke</a>, <a href="/search/cs?searchtype=author&amp;query=Bakas%2C+S">Spyridon Bakas</a>, <a href="/search/cs?searchtype=author&amp;query=Farahani%2C+K">Keyvan Farahani</a>, <a href="/search/cs?searchtype=author&amp;query=AnnetteKopp-Schneider"> AnnetteKopp-Schneider</a>, <a href="/search/cs?searchtype=author&amp;query=Landman%2C+B+A">Bennett A. Landman</a>, <a href="/search/cs?searchtype=author&amp;query=Litjens%2C+G">Geert Litjens</a>, <a href="/search/cs?searchtype=author&amp;query=Menze%2C+B">Bjoern Menze</a>, <a href="/search/cs?searchtype=author&amp;query=Ronneberger%2C+O">Olaf Ronneberger</a>, <a href="/search/cs?searchtype=author&amp;query=Summers%2C+R+M">Ronald M. Summers</a>, <a href="/search/cs?searchtype=author&amp;query=van+Ginneken%2C+B">Bram van Ginneken</a>, <a href="/search/cs?searchtype=author&amp;query=Bilello%2C+M">Michel Bilello</a>, <a href="/search/cs?searchtype=author&amp;query=Bilic%2C+P">Patrick Bilic</a>, <a href="/search/cs?searchtype=author&amp;query=Christ%2C+P+F">Patrick F. Christ</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+R+K+G">Richard K. G. Do</a>, <a href="/search/cs?searchtype=author&amp;query=Gollub%2C+M+J">Marc J. Gollub</a>, <a href="/search/cs?searchtype=author&amp;query=Heckers%2C+S+H">Stephan H. Heckers</a>, <a href="/search/cs?searchtype=author&amp;query=Huisman%2C+H">Henkjan Huisman</a>, <a href="/search/cs?searchtype=author&amp;query=Jarnagin%2C+W+R">William R. Jarnagin</a>, <a href="/search/cs?searchtype=author&amp;query=McHugo%2C+M+K">Maureen K. McHugo</a>, <a href="/search/cs?searchtype=author&amp;query=Napel%2C+S">Sandy Napel</a>, <a href="/search/cs?searchtype=author&amp;query=Pernicka%2C+J+S+G">Jennifer S. Goli Pernicka</a>, <a href="/search/cs?searchtype=author&amp;query=Rhode%2C+K">Kawal Rhode</a>, <a href="/search/cs?searchtype=author&amp;query=Tobon-Gomez%2C+C">Catalina Tobon-Gomez</a>, <a href="/search/cs?searchtype=author&amp;query=Vorontsov%2C+E">Eugene Vorontsov</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.05735v1-abstract-short" style="display: inline;"> International challenges have become the de facto standard for comparative assessment of image analysis algorithms given a specific task. Segmentation is so far the most widely investigated medical image processing task, but the various segmentation challenges have typically been organized in isolation, such that algorithm development was driven by the need to tackle a single specific clinical pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.05735v1-abstract-full').style.display = 'inline'; document.getElementById('2106.05735v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.05735v1-abstract-full" style="display: none;"> International challenges have become the de facto standard for comparative assessment of image analysis algorithms given a specific task. Segmentation is so far the most widely investigated medical image processing task, but the various segmentation challenges have typically been organized in isolation, such that algorithm development was driven by the need to tackle a single specific clinical problem. We hypothesized that a method capable of performing well on multiple tasks will generalize well to a previously unseen task and potentially outperform a custom-designed solution. To investigate the hypothesis, we organized the Medical Segmentation Decathlon (MSD) - a biomedical image analysis challenge, in which algorithms compete in a multitude of both tasks and modalities. The underlying data set was designed to explore the axis of difficulties typically encountered when dealing with medical images, such as small data sets, unbalanced labels, multi-site data and small objects. The MSD challenge confirmed that algorithms with a consistent good performance on a set of tasks preserved their good average performance on a different set of previously unseen tasks. Moreover, by monitoring the MSD winner for two years, we found that this algorithm continued generalizing well to a wide range of other clinical problems, further confirming our hypothesis. Three main conclusions can be drawn from this study: (1) state-of-the-art image segmentation algorithms are mature, accurate, and generalize well when retrained on unseen tasks; (2) consistent algorithmic performance across multiple tasks is a strong surrogate of algorithmic generalizability; (3) the training of accurate AI segmentation models is now commoditized to non AI experts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.05735v1-abstract-full').style.display = 'none'; document.getElementById('2106.05735v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.01667">arXiv:2106.01667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.01667">pdf</a>, <a href="https://arxiv.org/format/2106.01667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> APES: Audiovisual Person Search in Untrimmed Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alcazar%2C+J+L">Juan Leon Alcazar</a>, <a href="/search/cs?searchtype=author&amp;query=Mai%2C+L">Long Mai</a>, <a href="/search/cs?searchtype=author&amp;query=Perazzi%2C+F">Federico Perazzi</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Joon-Young Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Heilbron%2C+F+C">Fabian Caba Heilbron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.01667v1-abstract-short" style="display: inline;"> Humans are arguably one of the most important subjects in video streams, many real-world applications such as video summarization or video editing workflows often require the automatic search and retrieval of a person of interest. Despite tremendous efforts in the person reidentification and retrieval domains, few works have developed audiovisual search strategies. In this paper, we present the Au&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01667v1-abstract-full').style.display = 'inline'; document.getElementById('2106.01667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.01667v1-abstract-full" style="display: none;"> Humans are arguably one of the most important subjects in video streams, many real-world applications such as video summarization or video editing workflows often require the automatic search and retrieval of a person of interest. Despite tremendous efforts in the person reidentification and retrieval domains, few works have developed audiovisual search strategies. In this paper, we present the Audiovisual Person Search dataset (APES), a new dataset composed of untrimmed videos whose audio (voices) and visual (faces) streams are densely annotated. APES contains over 1.9K identities labeled along 36 hours of video, making it the largest dataset available for untrimmed audiovisual person search. A key property of APES is that it includes dense temporal annotations that link faces to speech segments of the same identity. To showcase the potential of our new dataset, we propose an audiovisual baseline and benchmark for person retrieval. Our study shows that modeling audiovisual cues benefits the recognition of people&#39;s identities. To enable reproducibility and promote future research, the dataset annotations and baseline code are available at: https://github.com/fuankarion/audiovisual-person-search <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01667v1-abstract-full').style.display = 'none'; document.getElementById('2106.01667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.13111">arXiv:2103.13111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.13111">pdf</a>, <a href="https://arxiv.org/format/2103.13111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MIcro-Surgical Anastomose Workflow recognition challenge report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huaulm%C3%A9%2C+A">Arnaud Huaulm茅</a>, <a href="/search/cs?searchtype=author&amp;query=Sarikaya%2C+D">Duygu Sarikaya</a>, <a href="/search/cs?searchtype=author&amp;query=Mut%2C+K+L">K茅vin Le Mut</a>, <a href="/search/cs?searchtype=author&amp;query=Despinoy%2C+F">Fabien Despinoy</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+Y">Yonghao Long</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+Q">Qi Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Chng%2C+C">Chin-Boon Chng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wenjun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Kondo%2C+S">Satoshi Kondo</a>, <a href="/search/cs?searchtype=author&amp;query=Bravo-S%C3%A1nchez%2C+L">Laura Bravo-S谩nchez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Reiter%2C+W">Wolfgang Reiter</a>, <a href="/search/cs?searchtype=author&amp;query=Mitsuishi%2C+M">Manoru Mitsuishi</a>, <a href="/search/cs?searchtype=author&amp;query=Harada%2C+K">Kanako Harada</a>, <a href="/search/cs?searchtype=author&amp;query=Jannin%2C+P">Pierre Jannin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.13111v1-abstract-short" style="display: inline;"> The &#34;MIcro-Surgical Anastomose Workflow recognition on training sessions&#34; (MISAW) challenge provided a data set of 27 sequences of micro-surgical anastomosis on artificial blood vessels. This data set was composed of videos, kinematics, and workflow annotations described at three different granularity levels: phase, step, and activity. The participants were given the option to use kinematic data a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.13111v1-abstract-full').style.display = 'inline'; document.getElementById('2103.13111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.13111v1-abstract-full" style="display: none;"> The &#34;MIcro-Surgical Anastomose Workflow recognition on training sessions&#34; (MISAW) challenge provided a data set of 27 sequences of micro-surgical anastomosis on artificial blood vessels. This data set was composed of videos, kinematics, and workflow annotations described at three different granularity levels: phase, step, and activity. The participants were given the option to use kinematic data and videos to develop workflow recognition models. Four tasks were proposed to the participants: three of them were related to the recognition of surgical workflow at three different granularity levels, while the last one addressed the recognition of all granularity levels in the same model. One ranking was made for each task. We used the average application-dependent balanced accuracy (AD-Accuracy) as the evaluation metric. This takes unbalanced classes into account and it is more clinically relevant than a frame-by-frame score. Six teams, including a non-competing team, participated in at least one task. All models employed deep learning models, such as CNN or RNN. The best models achieved more than 95% AD-Accuracy for phase recognition, 80% for step recognition, 60% for activity recognition, and 75% for all granularity levels. For high levels of granularity (i.e., phases and steps), the best models had a recognition rate that may be sufficient for applications such as prediction of remaining surgical time or resource management. However, for activities, the recognition rate was still low for applications that can be employed clinically. The MISAW data set is publicly available to encourage further research in surgical workflow recognition. It can be found at www.synapse.org/MISAW <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.13111v1-abstract-full').style.display = 'none'; document.getElementById('2103.13111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI2020 challenge report, 36 pages including 15 for supplementary material (complet results for each participating teams), 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.05533">arXiv:2007.05533</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.05533">pdf</a>, <a href="https://arxiv.org/format/2007.05533">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ISINet: An Instance-Based Approach for Surgical Instrument Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">Cristina Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Bravo-S%C3%A1nchez%2C+L">Laura Bravo-S谩nchez</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.05533v1-abstract-short" style="display: inline;"> We study the task of semantic segmentation of surgical instruments in robotic-assisted surgery scenes. We propose the Instance-based Surgical Instrument Segmentation Network (ISINet), a method that addresses this task from an instance-based segmentation perspective. Our method includes a temporal consistency module that takes into account the previously overlooked and inherent temporal information&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05533v1-abstract-full').style.display = 'inline'; document.getElementById('2007.05533v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.05533v1-abstract-full" style="display: none;"> We study the task of semantic segmentation of surgical instruments in robotic-assisted surgery scenes. We propose the Instance-based Surgical Instrument Segmentation Network (ISINet), a method that addresses this task from an instance-based segmentation perspective. Our method includes a temporal consistency module that takes into account the previously overlooked and inherent temporal information of the problem. We validate our approach on the existing benchmark for the task, the Endoscopic Vision 2017 Robotic Instrument Segmentation Dataset, and on the 2018 version of the dataset, whose annotations we extended for the fine-grained version of instrument segmentation. Our results show that ISINet significantly outperforms state-of-the-art methods, with our baseline version duplicating the Intersection over Union (IoU) of previous methods and our complete model triplicating the IoU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05533v1-abstract-full').style.display = 'none'; document.getElementById('2007.05533v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.05454">arXiv:2007.05454</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.05454">pdf</a>, <a href="https://arxiv.org/format/2007.05454">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SIMBA: Specific Identity Markers for Bone Age Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">Cristina Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Escobar%2C+M">Mar铆a Escobar</a>, <a href="/search/cs?searchtype=author&amp;query=Daza%2C+L">Laura Daza</a>, <a href="/search/cs?searchtype=author&amp;query=Torres%2C+F">Felipe Torres</a>, <a href="/search/cs?searchtype=author&amp;query=Triana%2C+G">Gustavo Triana</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.05454v2-abstract-short" style="display: inline;"> Bone Age Assessment (BAA) is a task performed by radiologists to diagnose abnormal growth in a child. In manual approaches, radiologists take into account different identity markers when calculating bone age, i.e., chronological age and gender. However, the current automated Bone Age Assessment methods do not completely exploit the information present in the patient&#39;s metadata. With this lack of a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05454v2-abstract-full').style.display = 'inline'; document.getElementById('2007.05454v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.05454v2-abstract-full" style="display: none;"> Bone Age Assessment (BAA) is a task performed by radiologists to diagnose abnormal growth in a child. In manual approaches, radiologists take into account different identity markers when calculating bone age, i.e., chronological age and gender. However, the current automated Bone Age Assessment methods do not completely exploit the information present in the patient&#39;s metadata. With this lack of available methods as motivation, we present SIMBA: Specific Identity Markers for Bone Age Assessment. SIMBA is a novel approach for the task of BAA based on the use of identity markers. For this purpose, we build upon the state-of-the-art model, fusing the information present in the identity markers with the visual features created from the original hand radiograph. We then use this robust representation to estimate the patient&#39;s relative bone age: the difference between chronological age and bone age. We validate SIMBA on the Radiological Hand Pose Estimation dataset and find that it outperforms previous state-of-the-art methods. SIMBA sets a trend of a new wave of Computer-aided Diagnosis methods that incorporate all of the data that is available regarding a patient. To promote further research in this area and ensure reproducibility we will provide the source code as well as the pre-trained models of SIMBA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05454v2-abstract-full').style.display = 'none'; document.getElementById('2007.05454v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.13163">arXiv:2006.13163</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.13163">pdf</a>, <a href="https://arxiv.org/format/2006.13163">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3847/1538-4365/aba267">10.3847/1538-4365/aba267 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MANTRA: A Machine Learning reference lightcurve dataset for astronomical transient event recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Neira%2C+M">Mauricio Neira</a>, <a href="/search/cs?searchtype=author&amp;query=G%C3%B3mez%2C+C">Catalina G贸mez</a>, <a href="/search/cs?searchtype=author&amp;query=Su%C3%A1rez-P%C3%A9rez%2C+J+F">John F. Su谩rez-P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=G%C3%B3mez%2C+D+A">Diego A. G贸mez</a>, <a href="/search/cs?searchtype=author&amp;query=Reyes%2C+J+P">Juan Pablo Reyes</a>, <a href="/search/cs?searchtype=author&amp;query=Hoyos%2C+M+H">Marcela Hern谩ndez Hoyos</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Forero-Romero%2C+J+E">Jaime E. Forero-Romero</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.13163v2-abstract-short" style="display: inline;"> We introduce MANTRA, an annotated dataset of 4869 transient and 71207 non-transient object lightcurves built from the Catalina Real Time Transient Survey. We provide public access to this dataset as a plain text file to facilitate standardized quantitative comparison of astronomical transient event recognition algorithms. Some of the classes included in the dataset are: supernovae, cataclysmic var&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.13163v2-abstract-full').style.display = 'inline'; document.getElementById('2006.13163v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.13163v2-abstract-full" style="display: none;"> We introduce MANTRA, an annotated dataset of 4869 transient and 71207 non-transient object lightcurves built from the Catalina Real Time Transient Survey. We provide public access to this dataset as a plain text file to facilitate standardized quantitative comparison of astronomical transient event recognition algorithms. Some of the classes included in the dataset are: supernovae, cataclysmic variables, active galactic nuclei, high proper motion stars, blazars and flares. As an example of the tasks that can be performed on the dataset we experiment with multiple data pre-processing methods, feature selection techniques and popular machine learning algorithms (Support Vector Machines, Random Forests and Neural Networks). We assess quantitative performance in two classification tasks: binary (transient/non-transient) and eight-class classification. The best performing algorithm in both tasks is the Random Forest Classifier. It achieves an F1-score of 96.25% in the binary classification and 52.79% in the eight-class classification. For the eight-class classification, non-transients ( 96.83% ) is the class with the highest F1-score, while the lowest corresponds to high-proper-motion stars ( 16.79% ); for supernovae it achieves a value of 54.57% , close to the average across classes. The next release of MANTRA includes images and benchmarks with deep learning models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.13163v2-abstract-full').style.display = 'none'; document.getElementById('2006.13163v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ApJS accepted, 17 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.07682">arXiv:2006.07682</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.07682">pdf</a>, <a href="https://arxiv.org/format/2006.07682">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Clustering for Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.07682v3-abstract-short" style="display: inline;"> This paper studies how encouraging semantically-aligned features during deep neural network training can increase network robustness. Recent works observed that Adversarial Training leads to robust models, whose learnt features appear to correlate with human perception. Inspired by this connection from robustness to semantics, we study the complementary connection: from semantics to robustness. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07682v3-abstract-full').style.display = 'inline'; document.getElementById('2006.07682v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.07682v3-abstract-full" style="display: none;"> This paper studies how encouraging semantically-aligned features during deep neural network training can increase network robustness. Recent works observed that Adversarial Training leads to robust models, whose learnt features appear to correlate with human perception. Inspired by this connection from robustness to semantics, we study the complementary connection: from semantics to robustness. To do so, we provide a robustness certificate for distance-based classification models (clustering-based classifiers). Moreover, we show that this certificate is tight, and we leverage it to propose ClusTR (Clustering Training for Robustness), a clustering-based and adversary-free training framework to learn robust models. Interestingly, \textit{ClusTR} outperforms adversarially-trained networks by up to $4\%$ under strong PGD attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07682v3-abstract-full').style.display = 'none'; document.getElementById('2006.07682v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 32nd British Machine Vision Conference (BMVC&#39;21)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.09812">arXiv:2005.09812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.09812">pdf</a>, <a href="https://arxiv.org/format/2005.09812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Active Speakers in Context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alcazar%2C+J+L">Juan Leon Alcazar</a>, <a href="/search/cs?searchtype=author&amp;query=Heilbron%2C+F+C">Fabian Caba Heilbron</a>, <a href="/search/cs?searchtype=author&amp;query=Mai%2C+L">Long Mai</a>, <a href="/search/cs?searchtype=author&amp;query=Perazzi%2C+F">Federico Perazzi</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Joon-Young Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.09812v1-abstract-short" style="display: inline;"> Current methods for active speak er detection focus on modeling short-term audiovisual information from a single speaker. Although this strategy can be enough for addressing single-speaker scenarios, it prevents accurate detection when the task is to identify who of many candidate speakers are talking. This paper introduces the Active Speaker Context, a novel representation that models relationshi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09812v1-abstract-full').style.display = 'inline'; document.getElementById('2005.09812v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.09812v1-abstract-full" style="display: none;"> Current methods for active speak er detection focus on modeling short-term audiovisual information from a single speaker. Although this strategy can be enough for addressing single-speaker scenarios, it prevents accurate detection when the task is to identify who of many candidate speakers are talking. This paper introduces the Active Speaker Context, a novel representation that models relationships between multiple speakers over long time horizons. Our Active Speaker Context is designed to learn pairwise and temporal relations from an structured ensemble of audio-visual observations. Our experiments show that a structured feature ensemble already benefits the active speaker detection performance. Moreover, we find that the proposed Active Speaker Context improves the state-of-the-art on the AVA-ActiveSpeaker dataset achieving a mAP of 87.1%. We present ablation studies that verify that this result is a direct consequence of our long-term multi-speaker analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09812v1-abstract-full').style.display = 'none'; document.getElementById('2005.09812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.13877">arXiv:2004.13877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2004.13877">pdf</a>, <a href="https://arxiv.org/format/2004.13877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1093/mnras/staa2973">10.1093/mnras/staa2973 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Classifying Image Sequences of Astronomical Transients with Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=G%C3%B3mez%2C+C">Catalina G贸mez</a>, <a href="/search/cs?searchtype=author&amp;query=Neira%2C+M">Mauricio Neira</a>, <a href="/search/cs?searchtype=author&amp;query=Hoyos%2C+M+H">Marcela Hern谩ndez Hoyos</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Forero-Romero%2C+J+E">Jaime E. Forero-Romero</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.13877v2-abstract-short" style="display: inline;"> Supervised classification of temporal sequences of astronomical images into meaningful transient astrophysical phenomena has been considered a hard problem because it requires the intervention of human experts. The classifier uses the expert&#39;s knowledge to find heuristic features to process the images, for instance, by performing image subtraction or by extracting sparse information such as flux t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.13877v2-abstract-full').style.display = 'inline'; document.getElementById('2004.13877v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.13877v2-abstract-full" style="display: none;"> Supervised classification of temporal sequences of astronomical images into meaningful transient astrophysical phenomena has been considered a hard problem because it requires the intervention of human experts. The classifier uses the expert&#39;s knowledge to find heuristic features to process the images, for instance, by performing image subtraction or by extracting sparse information such as flux time series, also known as light curves. We present a successful deep learning approach that learns directly from imaging data. Our method models explicitly the spatio-temporal patterns with Deep Convolutional Neural Networks and Gated Recurrent Units. We train these deep neural networks using 1.3 million real astronomical images from the Catalina Real-Time Transient Survey to classify the sequences into five different types of astronomical transient classes. The TAO-Net (for Transient Astronomical Objects Network) architecture outperforms the results from random forest classification on light curves by 10 percentage points as measured by the F1 score for each class; the average F1 over classes goes from $45\%$ with random forest classification to $55\%$ with TAO-Net. This achievement with TAO-Net opens the possibility to develop new deep learning architectures for early transient detection. We make available the training dataset and trained models of TAO-Net to allow for future extensions of this work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.13877v2-abstract-full').style.display = 'none'; document.getElementById('2004.13877v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures. MNRAS accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.10299">arXiv:2003.10299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2003.10299">pdf</a>, <a href="https://arxiv.org/format/2003.10299">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Medical Instrument Segmentation Challenge 2019 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ross%2C+T">Tobias Ross</a>, <a href="/search/cs?searchtype=author&amp;query=Reinke%2C+A">Annika Reinke</a>, <a href="/search/cs?searchtype=author&amp;query=Full%2C+P+M">Peter M. Full</a>, <a href="/search/cs?searchtype=author&amp;query=Wagner%2C+M">Martin Wagner</a>, <a href="/search/cs?searchtype=author&amp;query=Kenngott%2C+H">Hannes Kenngott</a>, <a href="/search/cs?searchtype=author&amp;query=Apitz%2C+M">Martin Apitz</a>, <a href="/search/cs?searchtype=author&amp;query=Hempe%2C+H">Hellena Hempe</a>, <a href="/search/cs?searchtype=author&amp;query=Filimon%2C+D+M">Diana Mindroc Filimon</a>, <a href="/search/cs?searchtype=author&amp;query=Scholz%2C+P">Patrick Scholz</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+T+N">Thuy Nuong Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Bruno%2C+P">Pierangela Bruno</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Bian%2C+G">Gui-Bin Bian</a>, <a href="/search/cs?searchtype=author&amp;query=Bodenstedt%2C+S">Sebastian Bodenstedt</a>, <a href="/search/cs?searchtype=author&amp;query=Bolmgren%2C+J+L">Jon Lindstr枚m Bolmgren</a>, <a href="/search/cs?searchtype=author&amp;query=Bravo-S%C3%A1nchez%2C+L">Laura Bravo-S谩nchez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hua-Bin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gonz%C3%A1lez%2C+C">Cristina Gonz谩lez</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Halvorsen%2C+P">P氓l Halvorsen</a>, <a href="/search/cs?searchtype=author&amp;query=Heng%2C+P">Pheng-Ann Heng</a>, <a href="/search/cs?searchtype=author&amp;query=Hosgor%2C+E">Enes Hosgor</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Z">Zeng-Guang Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Isensee%2C+F">Fabian Isensee</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+D">Debesh Jha</a> , et al. (25 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.10299v2-abstract-short" style="display: inline;"> Intraoperative tracking of laparoscopic instruments is often a prerequisite for computer and robotic-assisted interventions. While numerous methods for detecting, segmenting and tracking of medical instruments based on endoscopic video images have been proposed in the literature, key limitations remain to be addressed: Firstly, robustness, that is, the reliable performance of state-of-the-art meth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.10299v2-abstract-full').style.display = 'inline'; document.getElementById('2003.10299v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.10299v2-abstract-full" style="display: none;"> Intraoperative tracking of laparoscopic instruments is often a prerequisite for computer and robotic-assisted interventions. While numerous methods for detecting, segmenting and tracking of medical instruments based on endoscopic video images have been proposed in the literature, key limitations remain to be addressed: Firstly, robustness, that is, the reliable performance of state-of-the-art methods when run on challenging images (e.g. in the presence of blood, smoke or motion artifacts). Secondly, generalization; algorithms trained for a specific intervention in a specific hospital should generalize to other interventions or institutions. In an effort to promote solutions for these limitations, we organized the Robust Medical Instrument Segmentation (ROBUST-MIS) challenge as an international benchmarking competition with a specific focus on the robustness and generalization capabilities of algorithms. For the first time in the field of endoscopic image processing, our challenge included a task on binary segmentation and also addressed multi-instance detection and segmentation. The challenge was based on a surgical data set comprising 10,040 annotated images acquired from a total of 30 surgical procedures from three different types of surgery. The validation of the competing methods for the three tasks (binary segmentation, multi-instance detection and multi-instance segmentation) was performed in three different stages with an increasing domain gap between the training and the test data. The results confirm the initial hypothesis, namely that algorithm performance degrades with an increasing domain gap. While the average detection and segmentation quality of the best-performing algorithms is high, future research should concentrate on detection and segmentation of small, crossing, moving and transparent instrument(s) (parts). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.10299v2-abstract-full').style.display = 'none'; document.getElementById('2003.10299v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A pre-print</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.05661">arXiv:1912.05661</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.05661">pdf</a>, <a href="https://arxiv.org/format/1912.05661">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gabor Layers Enhance Network Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Alfarra%2C+M">Motasem Alfarra</a>, <a href="/search/cs?searchtype=author&amp;query=Jeanneret%2C+G">Guillaume Jeanneret</a>, <a href="/search/cs?searchtype=author&amp;query=Bibi%2C+A">Adel Bibi</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.05661v2-abstract-short" style="display: inline;"> We revisit the benefits of merging classical vision concepts with deep learning models. In particular, we explore the effect on robustness against adversarial attacks of replacing the first layers of various deep architectures with Gabor layers, i.e. convolutional layers with filters that are based on learnable Gabor parameters. We observe that architectures enhanced with Gabor layers gain a consi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.05661v2-abstract-full').style.display = 'inline'; document.getElementById('1912.05661v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.05661v2-abstract-full" style="display: none;"> We revisit the benefits of merging classical vision concepts with deep learning models. In particular, we explore the effect on robustness against adversarial attacks of replacing the first layers of various deep architectures with Gabor layers, i.e. convolutional layers with filters that are based on learnable Gabor parameters. We observe that architectures enhanced with Gabor layers gain a consistent boost in robustness over regular models and preserve high generalizing test performance, even though these layers come at a negligible increase in the number of parameters. We then exploit the closed form expression of Gabor filters to derive an expression for a Lipschitz constant of such filters, and harness this theoretical result to develop a regularizer we use during training to further enhance network robustness. We conduct extensive experiments with various architectures (LeNet, AlexNet, VGG16 and WideResNet) on several datasets (MNIST, SVHN, CIFAR10 and CIFAR100) and demonstrate large empirical robustness gains. Furthermore, we experimentally show how our regularizer provides consistent robustness improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.05661v2-abstract-full').style.display = 'none'; document.getElementById('1912.05661v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 23 figures, 14 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.05847">arXiv:1904.05847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.05847">pdf</a>, <a href="https://arxiv.org/format/1904.05847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MAIN: Multi-Attention Instance Network for Video Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alcazar%2C+J+L">Juan Leon Alcazar</a>, <a href="/search/cs?searchtype=author&amp;query=Bravo%2C+M+A">Maria A. Bravo</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A+K">Ali K. Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Jeanneret%2C+G">Guillaume Jeanneret</a>, <a href="/search/cs?searchtype=author&amp;query=Brox%2C+T">Thomas Brox</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.05847v1-abstract-short" style="display: inline;"> Instance-level video segmentation requires a solid integration of spatial and temporal information. However, current methods rely mostly on domain-specific information (online learning) to produce accurate instance-level segmentations. We propose a novel approach that relies exclusively on the integration of generic spatio-temporal attention cues. Our strategy, named Multi-Attention Instance Netwo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.05847v1-abstract-full').style.display = 'inline'; document.getElementById('1904.05847v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.05847v1-abstract-full" style="display: none;"> Instance-level video segmentation requires a solid integration of spatial and temporal information. However, current methods rely mostly on domain-specific information (online learning) to produce accurate instance-level segmentations. We propose a novel approach that relies exclusively on the integration of generic spatio-temporal attention cues. Our strategy, named Multi-Attention Instance Network (MAIN), overcomes challenging segmentation scenarios over arbitrary videos without modelling sequence- or instance-specific knowledge. We design MAIN to segment multiple instances in a single forward pass, and optimize it with a novel loss function that favors class agnostic predictions and assigns instance-specific penalties. We achieve state-of-the-art performance on the challenging Youtube-VOS dataset and benchmark, improving the unseen Jaccard and F-Metric by 6.8% and 12.7% respectively, while operating at real-time (30.3 FPS). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.05847v1-abstract-full').style.display = 'none'; document.getElementById('1904.05847v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.05443">arXiv:1904.05443</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.05443">pdf</a>, <a href="https://arxiv.org/format/1904.05443">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BAOD: Budget-Aware Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pardo%2C+A">Alejandro Pardo</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengmeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Thabet%2C+A">Ali Thabet</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.05443v2-abstract-short" style="display: inline;"> We study the problem of object detection from a novel perspective in which annotation budget constraints are taken into consideration, appropriately coined Budget Aware Object Detection (BAOD). When provided with a fixed budget, we propose a strategy for building a diverse and informative dataset that can be used to optimally train a robust detector. We investigate both optimization and learning-b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.05443v2-abstract-full').style.display = 'inline'; document.getElementById('1904.05443v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.05443v2-abstract-full" style="display: none;"> We study the problem of object detection from a novel perspective in which annotation budget constraints are taken into consideration, appropriately coined Budget Aware Object Detection (BAOD). When provided with a fixed budget, we propose a strategy for building a diverse and informative dataset that can be used to optimally train a robust detector. We investigate both optimization and learning-based methods to sample which images to annotate and what type of annotation (strongly or weakly supervised) to annotate them with. We adopt a hybrid supervised learning framework to train the object detector from both these types of annotation. We conduct a comprehensive empirical study showing that a handcrafted optimization method outperforms other selection techniques including random sampling, uncertainty sampling and active learning. By combining an optimal image/annotation selection scheme with hybrid supervised learning to solve the BAOD problem, we show that one can achieve the performance of a strongly supervised detector on PASCAL-VOC 2007 while saving 12.8% of its original annotation budget. Furthermore, when $100\%$ of the budget is used, it surpasses this performance by 2.0 mAP percentage points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.05443v2-abstract-full').style.display = 'none'; document.getElementById('1904.05443v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.03704">arXiv:1812.03704</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1812.03704">pdf</a>, <a href="https://arxiv.org/format/1812.03704">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SMIT: Stochastic Multi-Label Image-to-Image Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Romero%2C+A">Andr茅s Romero</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a>, <a href="/search/cs?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.03704v3-abstract-short" style="display: inline;"> Cross-domain mapping has been a very active topic in recent years. Given one image, its main purpose is to translate it to the desired target domain, or multiple domains in the case of multiple labels. This problem is highly challenging due to three main reasons: (i) unpaired datasets, (ii) multiple attributes, and (iii) the multimodality (e.g., style) associated with the translation. Most of the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.03704v3-abstract-full').style.display = 'inline'; document.getElementById('1812.03704v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.03704v3-abstract-full" style="display: none;"> Cross-domain mapping has been a very active topic in recent years. Given one image, its main purpose is to translate it to the desired target domain, or multiple domains in the case of multiple labels. This problem is highly challenging due to three main reasons: (i) unpaired datasets, (ii) multiple attributes, and (iii) the multimodality (e.g., style) associated with the translation. Most of the existing state-of-the-art has focused only on two reasons, i.e. either on (i) and (ii), or (i) and (iii). In this work, we propose a joint framework (i, ii, iii) of diversity and multi-mapping image-to-image translations, using a single generator to conditionally produce countless and unique fake images that hold the underlying characteristics of the source image. Our system does not use style regularization, instead, it uses an embedding representation that we call domain embedding for both domain and style. Extensive experiments over different datasets demonstrate the effectiveness of our proposed approach in comparison with the state-of-the-art in both multi-label and multimodal problems. Additionally, our method is able to generalize under different scenarios: continuous style interpolation, continuous label interpolation, and fine-grained mapping. Code and pretrained models are available at https://github.com/BCV-Uniandes/SMIT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.03704v3-abstract-full').style.display = 'none'; document.getElementById('1812.03704v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV Workshops, 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.02629">arXiv:1811.02629</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.02629">pdf</a>, <a href="https://arxiv.org/format/1811.02629">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Identifying the Best Machine Learning Algorithms for Brain Tumor Segmentation, Progression Assessment, and Overall Survival Prediction in the BRATS Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bakas%2C+S">Spyridon Bakas</a>, <a href="/search/cs?searchtype=author&amp;query=Reyes%2C+M">Mauricio Reyes</a>, <a href="/search/cs?searchtype=author&amp;query=Jakab%2C+A">Andras Jakab</a>, <a href="/search/cs?searchtype=author&amp;query=Bauer%2C+S">Stefan Bauer</a>, <a href="/search/cs?searchtype=author&amp;query=Rempfler%2C+M">Markus Rempfler</a>, <a href="/search/cs?searchtype=author&amp;query=Crimi%2C+A">Alessandro Crimi</a>, <a href="/search/cs?searchtype=author&amp;query=Shinohara%2C+R+T">Russell Takeshi Shinohara</a>, <a href="/search/cs?searchtype=author&amp;query=Berger%2C+C">Christoph Berger</a>, <a href="/search/cs?searchtype=author&amp;query=Ha%2C+S+M">Sung Min Ha</a>, <a href="/search/cs?searchtype=author&amp;query=Rozycki%2C+M">Martin Rozycki</a>, <a href="/search/cs?searchtype=author&amp;query=Prastawa%2C+M">Marcel Prastawa</a>, <a href="/search/cs?searchtype=author&amp;query=Alberts%2C+E">Esther Alberts</a>, <a href="/search/cs?searchtype=author&amp;query=Lipkova%2C+J">Jana Lipkova</a>, <a href="/search/cs?searchtype=author&amp;query=Freymann%2C+J">John Freymann</a>, <a href="/search/cs?searchtype=author&amp;query=Kirby%2C+J">Justin Kirby</a>, <a href="/search/cs?searchtype=author&amp;query=Bilello%2C+M">Michel Bilello</a>, <a href="/search/cs?searchtype=author&amp;query=Fathallah-Shaykh%2C+H">Hassan Fathallah-Shaykh</a>, <a href="/search/cs?searchtype=author&amp;query=Wiest%2C+R">Roland Wiest</a>, <a href="/search/cs?searchtype=author&amp;query=Kirschke%2C+J">Jan Kirschke</a>, <a href="/search/cs?searchtype=author&amp;query=Wiestler%2C+B">Benedikt Wiestler</a>, <a href="/search/cs?searchtype=author&amp;query=Colen%2C+R">Rivka Colen</a>, <a href="/search/cs?searchtype=author&amp;query=Kotrotsou%2C+A">Aikaterini Kotrotsou</a>, <a href="/search/cs?searchtype=author&amp;query=Lamontagne%2C+P">Pamela Lamontagne</a>, <a href="/search/cs?searchtype=author&amp;query=Marcus%2C+D">Daniel Marcus</a>, <a href="/search/cs?searchtype=author&amp;query=Milchenko%2C+M">Mikhail Milchenko</a> , et al. (402 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.02629v3-abstract-short" style="display: inline;"> Gliomas are the most common primary brain malignancies, with different degrees of aggressiveness, variable prognosis and various heterogeneous histologic sub-regions, i.e., peritumoral edematous/invaded tissue, necrotic core, active and non-enhancing core. This intrinsic heterogeneity is also portrayed in their radio-phenotype, as their sub-regions are depicted by varying intensity profiles dissem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.02629v3-abstract-full').style.display = 'inline'; document.getElementById('1811.02629v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.02629v3-abstract-full" style="display: none;"> Gliomas are the most common primary brain malignancies, with different degrees of aggressiveness, variable prognosis and various heterogeneous histologic sub-regions, i.e., peritumoral edematous/invaded tissue, necrotic core, active and non-enhancing core. This intrinsic heterogeneity is also portrayed in their radio-phenotype, as their sub-regions are depicted by varying intensity profiles disseminated across multi-parametric magnetic resonance imaging (mpMRI) scans, reflecting varying biological properties. Their heterogeneous shape, extent, and location are some of the factors that make these tumors difficult to resect, and in some cases inoperable. The amount of resected tumor is a factor also considered in longitudinal scans, when evaluating the apparent tumor for potential diagnosis of progression. Furthermore, there is mounting evidence that accurate segmentation of the various tumor sub-regions can offer the basis for quantitative image analysis towards prediction of patient overall survival. This study assesses the state-of-the-art machine learning (ML) methods used for brain tumor image analysis in mpMRI scans, during the last seven instances of the International Brain Tumor Segmentation (BraTS) challenge, i.e., 2012-2018. Specifically, we focus on i) evaluating segmentations of the various glioma sub-regions in pre-operative mpMRI scans, ii) assessing potential tumor progression by virtue of longitudinal growth of tumor sub-regions, beyond use of the RECIST/RANO criteria, and iii) predicting the overall survival from pre-operative mpMRI scans of patients that underwent gross total resection. Finally, we investigate the challenge of identifying the best ML algorithms for each of these tasks, considering that apart from being diverse on each instance of the challenge, the multi-institutional mpMRI BraTS dataset has also been a continuously evolving/growing dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.02629v3-abstract-full').style.display = 'none'; document.getElementById('1811.02629v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The International Multimodal Brain Tumor Segmentation (BraTS) Challenge</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1807.02257">arXiv:1807.02257</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1807.02257">pdf</a>, <a href="https://arxiv.org/format/1807.02257">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Multimodal Instance Segmentation guided by natural language queries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Margffoy-Tuay%2C+E">Edgar Margffoy-Tuay</a>, <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rez%2C+J+C">Juan C. P茅rez</a>, <a href="/search/cs?searchtype=author&amp;query=Botero%2C+E">Emilio Botero</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1807.02257v2-abstract-short" style="display: inline;"> We address the problem of segmenting an object given a natural language expression that describes it. Current techniques tackle this task by either (\textit{i}) directly or recursively merging linguistic and visual information in the channel dimension and then performing convolutions; or by (\textit{ii}) mapping the expression to a space in which it can be thought of as a filter, whose response is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1807.02257v2-abstract-full').style.display = 'inline'; document.getElementById('1807.02257v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1807.02257v2-abstract-full" style="display: none;"> We address the problem of segmenting an object given a natural language expression that describes it. Current techniques tackle this task by either (\textit{i}) directly or recursively merging linguistic and visual information in the channel dimension and then performing convolutions; or by (\textit{ii}) mapping the expression to a space in which it can be thought of as a filter, whose response is directly related to the presence of the object at a given spatial coordinate in the image, so that a convolution can be applied to look for the object. We propose a novel method that integrates these two insights in order to fully exploit the recursive nature of language. Additionally, during the upsampling process, we take advantage of the intermediate information generated when downsampling the image, so that detailed segmentations can be obtained. We compare our method against the state-of-the-art approaches in four standard datasets, in which it surpasses all previous methods in six of eight of the splits for this task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1807.02257v2-abstract-full').style.display = 'none'; document.getElementById('1807.02257v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.07863">arXiv:1704.07863</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1704.07863">pdf</a>, <a href="https://arxiv.org/format/1704.07863">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Dynamic Facial Action Unit Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Romero%2C+A">Andres Romero</a>, <a href="/search/cs?searchtype=author&amp;query=Leon%2C+J">Juan Leon</a>, <a href="/search/cs?searchtype=author&amp;query=Arbelaez%2C+P">Pablo Arbelaez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.07863v2-abstract-short" style="display: inline;"> We propose a novel convolutional neural network approach to address the fine-grained recognition problem of multi-view dynamic facial action unit detection. We leverage recent gains in large-scale object recognition by formulating the task of predicting the presence or absence of a specific action unit in a still image of a human face as holistic classification. We then explore the design space of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.07863v2-abstract-full').style.display = 'inline'; document.getElementById('1704.07863v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.07863v2-abstract-full" style="display: none;"> We propose a novel convolutional neural network approach to address the fine-grained recognition problem of multi-view dynamic facial action unit detection. We leverage recent gains in large-scale object recognition by formulating the task of predicting the presence or absence of a specific action unit in a still image of a human face as holistic classification. We then explore the design space of our approach by considering both shared and independent representations for separate action units, and also different CNN architectures for combining color and motion information. We then move to the novel setup of the FERA 2017 Challenge, in which we propose a multi-view extension of our approach that operates by first predicting the viewpoint from which the video was taken, and then evaluating an ensemble of action unit detectors that were trained for that specific viewpoint. Our approach is holistic, efficient, and modular, since new action units can be easily included in the overall system. Our approach significantly outperforms the baseline of the FERA 2017 Challenge, with an absolute improvement of 14% on the F1-metric. Additionally, it compares favorably against the winner of the FERA 2017 challenge. Code source is available at https://github.com/BCV-Uniandes/AUNets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.07863v2-abstract-full').style.display = 'none'; document.getElementById('1704.07863v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.00675">arXiv:1704.00675</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1704.00675">pdf</a>, <a href="https://arxiv.org/format/1704.00675">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The 2017 DAVIS Challenge on Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Perazzi%2C+F">Federico Perazzi</a>, <a href="/search/cs?searchtype=author&amp;query=Caelles%2C+S">Sergi Caelles</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Sorkine-Hornung%2C+A">Alex Sorkine-Hornung</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.00675v3-abstract-short" style="display: inline;"> We present the 2017 DAVIS Challenge on Video Object Segmentation, a public dataset, benchmark, and competition specifically designed for the task of video object segmentation. Following the footsteps of other successful initiatives, such as ILSVRC and PASCAL VOC, which established the avenue of research in the fields of scene classification and semantic segmentation, the DAVIS Challenge comprises&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.00675v3-abstract-full').style.display = 'inline'; document.getElementById('1704.00675v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.00675v3-abstract-full" style="display: none;"> We present the 2017 DAVIS Challenge on Video Object Segmentation, a public dataset, benchmark, and competition specifically designed for the task of video object segmentation. Following the footsteps of other successful initiatives, such as ILSVRC and PASCAL VOC, which established the avenue of research in the fields of scene classification and semantic segmentation, the DAVIS Challenge comprises a dataset, an evaluation methodology, and a public competition with a dedicated workshop co-located with CVPR 2017. The DAVIS Challenge follows up on the recent publication of DAVIS (Densely-Annotated VIdeo Segmentation), which has fostered the development of several novel state-of-the-art video object segmentation techniques. In this paper we describe the scope of the benchmark, highlight the main characteristics of the dataset, define the evaluation metrics of the competition, and present a detailed analysis of the results of the participants to the challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.00675v3-abstract-full').style.display = 'none'; document.getElementById('1704.00675v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Challenge website: http://davischallenge.org</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1701.04658">arXiv:1701.04658</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1701.04658">pdf</a>, <a href="https://arxiv.org/format/1701.04658">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Convolutional Oriented Boundaries: From Image Segmentation to High-Level Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maninis%2C+K">Kevis-Kokitsi Maninis</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1701.04658v2-abstract-short" style="display: inline;"> We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for multi-scale contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1701.04658v2-abstract-full').style.display = 'inline'; document.getElementById('1701.04658v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1701.04658v2-abstract-full" style="display: none;"> We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for multi-scale contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it gives a significant leap in performance over the state-of-the-art, and it generalizes very well to unseen categories and datasets. Particularly, we show that learning to estimate not only contour strength but also orientation provides more accurate results. We perform extensive experiments for low-level applications on BSDS, PASCAL Context, PASCAL Segmentation, and NYUD to evaluate boundary detection performance, showing that COB provides state-of-the-art contours and region hierarchies in all datasets. We also evaluate COB on high-level tasks when coupled with multiple pipelines for object proposals, semantic contours, semantic segmentation, and object detection on MS-COCO, SBD, and PASCAL; showing that COB also improves the results for all tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1701.04658v2-abstract-full').style.display = 'none'; document.getElementById('1701.04658v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 January, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by T-PAMI. Extended version of &#34;Convolutional Oriented Boundaries&#34;, ECCV 2016 (arXiv:1608.02755). Project page: http://www.vision.ee.ethz.ch/~cvlsegmentation/cob/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1609.01103">arXiv:1609.01103</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1609.01103">pdf</a>, <a href="https://arxiv.org/format/1609.01103">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-319-46723-8_17">10.1007/978-3-319-46723-8_17 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Retinal Image Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maninis%2C+K">Kevis-Kokitsi Maninis</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1609.01103v1-abstract-short" style="display: inline;"> This paper presents Deep Retinal Image Understanding (DRIU), a unified framework of retinal image analysis that provides both retinal vessel and optic disc segmentation. We make use of deep Convolutional Neural Networks (CNNs), which have proven revolutionary in other fields of computer vision such as object detection and image classification, and we bring their power to the study of eye fundus im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1609.01103v1-abstract-full').style.display = 'inline'; document.getElementById('1609.01103v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1609.01103v1-abstract-full" style="display: none;"> This paper presents Deep Retinal Image Understanding (DRIU), a unified framework of retinal image analysis that provides both retinal vessel and optic disc segmentation. We make use of deep Convolutional Neural Networks (CNNs), which have proven revolutionary in other fields of computer vision such as object detection and image classification, and we bring their power to the study of eye fundus images. DRIU uses a base network architecture on which two set of specialized layers are trained to solve both the retinal vessel and optic disc segmentation. We present experimental validation, both qualitative and quantitative, in four public datasets for these tasks. In all of them, DRIU presents super-human performance, that is, it shows results more consistent with a gold standard than a second human annotator used as control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1609.01103v1-abstract-full').style.display = 'none'; document.getElementById('1609.01103v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2016 Camera Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1608.02755">arXiv:1608.02755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1608.02755">pdf</a>, <a href="https://arxiv.org/format/1608.02755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-319-46448-0_35">10.1007/978-3-319-46448-0_35 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Convolutional Oriented Boundaries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maninis%2C+K">Kevis-Kokitsi Maninis</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P">Pablo Arbel谩ez</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1608.02755v1-abstract-short" style="display: inline;"> We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it gives a signi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1608.02755v1-abstract-full').style.display = 'inline'; document.getElementById('1608.02755v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1608.02755v1-abstract-full" style="display: none;"> We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it gives a significant leap in performance over the state-of-the-art, and it generalizes very well to unseen categories and datasets. Particularly, we show that learning to estimate not only contour strength but also orientation provides more accurate results. We perform extensive experiments on BSDS, PASCAL Context, PASCAL Segmentation, and MS-COCO, showing that COB provides state-of-the-art contours, region hierarchies, and object proposals in all datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1608.02755v1-abstract-full').style.display = 'none'; document.getElementById('1608.02755v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2016 Camera Ready</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Arbel%C3%A1ez%2C+P&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10