Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–24 of 24 results for author: <span class="mathjax">Simo-Serra, E</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/" aria-role="search"> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Simo-Serra, E"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Simo-Serra%2C+E&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Simo-Serra, E"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18421">arXiv:2412.18421</a> <span> [<a href="https://arxiv.org/pdf/2412.18421">pdf</a>, <a href="https://arxiv.org/format/2412.18421">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fashionability-Enhancing Outfit Image Editing with Conditional Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Qin%2C+Q">Qice Qin</a>, <a href="/search/?searchtype=author&query=Hirakawa%2C+Y">Yuki Hirakawa</a>, <a href="/search/?searchtype=author&query=Shimizu%2C+R">Ryotaro Shimizu</a>, <a href="/search/?searchtype=author&query=Furusawa%2C+T">Takuya Furusawa</a>, <a href="/search/?searchtype=author&query=Simo-Serra%2C+E">Edgar Simo-Serra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18421v1-abstract-short" style="display: inline;"> Image generation in the fashion domain has predominantly focused on preserving body characteristics or following input prompts, but little attention has been paid to improving the inherent fashionability of the output images. This paper presents a novel diffusion model-based approach that generates fashion images with improved fashionability while maintaining control over key attributes. Key compo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18421v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18421v1-abstract-full" style="display: none;"> Image generation in the fashion domain has predominantly focused on preserving body characteristics or following input prompts, but little attention has been paid to improving the inherent fashionability of the output images. This paper presents a novel diffusion model-based approach that generates fashion images with improved fashionability while maintaining control over key attributes. Key components of our method include: 1) fashionability enhancement, which ensures that the generated images are more fashionable than the input; 2) preservation of body characteristics, encouraging the generated images to maintain the original shape and proportions of the input; and 3) automatic fashion optimization, which does not rely on manual input or external prompts. We also employ two methods to collect training data for guidance while generating and evaluating the images. 
In particular, we rate outfit images using fashionability scores annotated by multiple fashion experts through OpenSkill-based and five critical aspect-based pairwise comparisons. These methods provide complementary perspectives for assessing and improving the fashionability of the generated images. The experimental results show that our approach outperforms the baseline Fashion++ in generating images with superior fashionability, demonstrating its effectiveness in producing more stylish and appealing fashion images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18421v1-abstract-full').style.display = 'none'; document.getElementById('2412.18421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19051">arXiv:2409.19051</a> <span> [<a href="https://arxiv.org/pdf/2409.19051">pdf</a>, <a href="https://arxiv.org/format/2409.19051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Markup Document Models for Graphic Design Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Kikuchi%2C+K">Kotaro Kikuchi</a>, <a href="/search/?searchtype=author&query=Inoue%2C+N">Naoto Inoue</a>, <a href="/search/?searchtype=author&query=Otani%2C+M">Mayu Otani</a>, <a href="/search/?searchtype=author&query=Simo-Serra%2C+E">Edgar Simo-Serra</a>, <a href="/search/?searchtype=author&query=Yamaguchi%2C+K">Kota Yamaguchi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19051v1-abstract-short" style="display: inline;"> This paper presents multimodal markup document models (MarkupDM) that can generate both markup language and images within interleaved multimodal documents. Unlike existing vision-and-language multimodal models, our MarkupDM tackles unique challenges critical to graphic design tasks: generating partial images that contribute to the overall appearance, often involving transparency and varying sizes,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19051v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19051v1-abstract-full" style="display: none;"> This paper presents multimodal markup document models (MarkupDM) that can generate both markup language and images within interleaved multimodal documents. 
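
As a rough illustration of how pairwise expert comparisons can be turned into scalar scores, the sketch below uses a simple Elo-style update; the paper above uses the OpenSkill rating system and five aspect-based comparisons, so this is only a generic stand-in with hypothetical outfit IDs.

```python
# Illustrative only: a minimal Elo-style rating update for turning expert
# pairwise outfit comparisons into scalar fashionability scores. The paper
# uses the OpenSkill rating system; this simpler stand-in just shows the idea.

def elo_update(r_winner: float, r_loser: float, k: float = 32.0) -> tuple[float, float]:
    """Update two ratings after one pairwise comparison (winner beats loser)."""
    expected_win = 1.0 / (1.0 + 10 ** ((r_loser - r_winner) / 400.0))
    r_winner += k * (1.0 - expected_win)
    r_loser -= k * (1.0 - expected_win)
    return r_winner, r_loser

# Hypothetical outfit IDs and comparison outcomes (winner, loser).
ratings = {"outfit_a": 1500.0, "outfit_b": 1500.0, "outfit_c": 1500.0}
comparisons = [("outfit_a", "outfit_b"), ("outfit_c", "outfit_a")]

for winner, loser in comparisons:
    ratings[winner], ratings[loser] = elo_update(ratings[winner], ratings[loser])

print(sorted(ratings.items(), key=lambda kv: -kv[1]))  # ranked fashionability scores
```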

2. arXiv:2409.19051 [cs.CV, cs.AI, cs.MM]
Multimodal Markup Document Models for Graphic Design Completion
Authors: Kotaro Kikuchi, Naoto Inoue, Mayu Otani, Edgar Simo-Serra, Kota Yamaguchi
Abstract: This paper presents multimodal markup document models (MarkupDM) that can generate both markup language and images within interleaved multimodal documents. Unlike existing vision-and-language multimodal models, our MarkupDM tackles unique challenges critical to graphic design tasks: generating partial images that contribute to the overall appearance, often involving transparency and varying sizes, and understanding the syntax and semantics of markup languages, which play a fundamental role as a representational format of graphic designs. To address these challenges, we design an image quantizer to tokenize images of diverse sizes with transparency and modify a code language model to process markup languages and incorporate image modalities. We provide in-depth evaluations of our approach on three graphic design completion tasks: generating missing attribute values, images, and texts in graphic design templates. Results corroborate the effectiveness of our MarkupDM for graphic design tasks. We also discuss the strengths and weaknesses in detail, providing insights for future research on multimodal document generation.
Submitted 27 September, 2024; originally announced September 2024.
Comments: Project page: https://cyberagentailab.github.io/MarkupDM/

3. arXiv:2407.17404 [cs.AI] (doi: 10.1109/TG.2024.3520214)
Grammar-based Game Description Generation using Large Language Models
Authors: Tsunehiko Tanaka, Edgar Simo-Serra
Abstract: Game Description Language (GDL) provides a standardized way to express diverse games in a machine-readable format, enabling automated game simulation and evaluation. While previous research has explored game description generation using search-based methods, generating GDL descriptions from natural language remains a challenging task. This paper presents a novel framework that leverages Large Language Models (LLMs) to generate grammatically accurate game descriptions from natural language. Our approach consists of two stages: first, we gradually generate a minimal grammar based on GDL specifications; second, we iteratively improve the game description through grammar-guided generation. Our framework employs a specialized parser that identifies valid subsequences and candidate symbols from LLM responses, enabling gradual refinement of the output to ensure grammatical correctness. Experimental results demonstrate that our iterative improvement approach significantly outperforms baseline methods that directly use LLM outputs. Our code is available at https://github.com/tsunehiko/ggdg
Submitted 22 January, 2025; v1 submitted 24 July, 2024; originally announced July 2024.
Comments: Accepted for publication in IEEE Transactions on Games
Journal ref: IEEE Transactions on Games, 2024 (early access)
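
To illustrate the general flavor of grammar-guided filtering of model outputs (not the paper's GDL parser), here is a toy sketch that keeps only candidate continuations that remain valid prefixes of a balanced-parentheses grammar; the prefix and candidates are hypothetical.

```python
# Illustrative only: grammar-guided filtering on a toy grammar of balanced
# parentheses rather than GDL. Candidate continuations proposed by a model are
# kept only if the extended string can still be completed into a valid
# expression. The paper's parser over GDL grammars is far richer; this just
# shows the "identify valid candidate symbols" step.

def is_valid_prefix(s: str) -> bool:
    """A string is a valid prefix iff no closing paren is ever unmatched."""
    depth = 0
    for ch in s:
        depth += 1 if ch == "(" else -1 if ch == ")" else 0
        if depth < 0:
            return False
    return True

prefix = "(()"
candidates = [")", "(", ")))", "()"]          # hypothetical model proposals
valid = [c for c in candidates if is_valid_prefix(prefix + c)]
print(valid)                                   # [')', '(', '()'] -- ')))' is rejected
```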

4. arXiv:2405.19056 [cs.GR]
Neural Scene Baking for Permutation Invariant Transparency Rendering with Real-time Global Illumination
Authors: Ziyang Zhang, Edgar Simo-Serra
Abstract: Neural rendering provides a fundamentally new way to render photorealistic images. Similar to traditional light-baking methods, neural rendering utilizes neural networks to bake representations of scenes, materials, and lights into latent vectors learned from path-tracing ground truths. However, existing neural rendering algorithms typically use G-buffers to provide position, normal, and texture information of scenes, which are prone to occlusion by transparent surfaces, leading to distortions and loss of detail in the rendered images. To address this limitation, we propose a novel neural rendering pipeline that accurately renders the scene behind transparent surfaces with global illumination and variable scenes. Our method separates the G-buffers of opaque and transparent objects, retaining G-buffer information behind transparent objects. Additionally, to render the transparent objects with permutation invariance, we designed a new permutation-invariant neural blending function. We integrate our algorithm into an efficient custom renderer to achieve real-time performance. Our results show that our method is capable of rendering photorealistic images with variable scenes and viewpoints, accurately capturing complex transparent structures along with global illumination. Our renderer can achieve real-time performance ($256\times 256$ at 63 FPS and $512\times 512$ at 32 FPS) on scenes with multiple variable transparent objects.
Submitted 29 May, 2024; originally announced May 2024.
Comments: This paper has been accepted by Computational Visual Media
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03923v5-abstract-full').style.display = 'none'; document.getElementById('2402.03923v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04794">arXiv:2312.04794</a> <span> [<a href="https://arxiv.org/pdf/2312.04794">pdf</a>, <a href="https://arxiv.org/format/2312.04794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-43904-9_59">10.1007/978-3-031-43904-9_59 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Visual Grounding of Whole Radiology Reports for 3D CT Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Ichinose%2C+A">Akimichi Ichinose</a>, <a href="/search/?searchtype=author&query=Hatsutani%2C+T">Taro Hatsutani</a>, <a href="/search/?searchtype=author&query=Nakamura%2C+K">Keigo Nakamura</a>, <a href="/search/?searchtype=author&query=Kitamura%2C+Y">Yoshiro Kitamura</a>, <a href="/search/?searchtype=author&query=Iizuka%2C+S">Satoshi Iizuka</a>, <a href="/search/?searchtype=author&query=Simo-Serra%2C+E">Edgar Simo-Serra</a>, <a href="/search/?searchtype=author&query=Kido%2C+S">Shoji Kido</a>, <a href="/search/?searchtype=author&query=Tomiyama%2C+N">Noriyuki Tomiyama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04794v1-abstract-short" style="display: inline;"> Building a large-scale training dataset is an essential problem in the development of medical image recognition systems. Visual grounding techniques, which automatically associate objects in images with corresponding descriptions, can facilitate labeling of large number of images. However, visual grounding of radiology reports for CT images remains challenging, because so many kinds of anomalies a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04794v1-abstract-full').style.display = 'inline'; document.getElementById('2312.04794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04794v1-abstract-full" style="display: none;"> Building a large-scale training dataset is an essential problem in the development of medical image recognition systems. Visual grounding techniques, which automatically associate objects in images with corresponding descriptions, can facilitate labeling of large number of images. 

6. arXiv:2312.04794 [cs.CV] (doi: 10.1007/978-3-031-43904-9_59)
Visual Grounding of Whole Radiology Reports for 3D CT Images
Authors: Akimichi Ichinose, Taro Hatsutani, Keigo Nakamura, Yoshiro Kitamura, Satoshi Iizuka, Edgar Simo-Serra, Shoji Kido, Noriyuki Tomiyama
Abstract: Building a large-scale training dataset is an essential problem in the development of medical image recognition systems. Visual grounding techniques, which automatically associate objects in images with corresponding descriptions, can facilitate labeling of a large number of images. However, visual grounding of radiology reports for CT images remains challenging, because so many kinds of anomalies are detectable via CT imaging, and the resulting report descriptions are long and complex. In this paper, we present the first visual grounding framework designed for CT image and report pairs covering various body parts and diverse anomaly types. Our framework combines two components: 1) anatomical segmentation of images, and 2) report structuring. The anatomical segmentation provides multiple organ masks of given CT images, and helps the grounding model recognize detailed anatomies. The report structuring helps to accurately extract information regarding the presence, location, and type of each anomaly described in the corresponding reports. Given the two additional image/report features, the grounding model can achieve better localization. In the verification process, we constructed a large-scale dataset with region-description correspondence annotations for 10,410 studies of 7,321 unique patients. We evaluated our framework using grounding accuracy, the percentage of correctly localized anomalies, as a metric, and demonstrated that the combination of the anatomical segmentation and the report structuring improves the performance by a large margin over the baseline model (66.0% vs 77.8%). Comparison with the prior techniques also showed higher performance of our method.
Submitted 7 December, 2023; originally announced December 2023.
Comments: 14 pages, 7 figures. Accepted at MICCAI 2023
Journal ref: Medical Image Computing and Computer Assisted Intervention, Lecture Notes in Computer Science 14224 (2023) 611-621

7. arXiv:2312.04779 [eess.IV, cs.CV, cs.LG]
Image Synthesis-based Late Stage Cancer Augmentation and Semi-Supervised Segmentation for MRI Rectal Cancer Staging
Authors: Saeko Sasuga, Akira Kudo, Yoshiro Kitamura, Satoshi Iizuka, Edgar Simo-Serra, Atsushi Hamabe, Masayuki Ishii, Ichiro Takemasa
Abstract: Rectal cancer is one of the most common diseases and a major cause of mortality. For deciding rectal cancer treatment plans, T-staging is important. However, evaluating the index from preoperative MRI images requires high radiologists' skill and experience. Therefore, the aim of this study is to segment the mesorectum, rectum, and rectal cancer region so that the system can predict T-stage from segmentation results. Generally, the shortage of large and diverse datasets and high-quality annotations is known to be a bottleneck in computer-aided diagnostics development. Regarding rectal cancer, advanced cancer images are very rare, and per-pixel annotation requires high radiologists' skill and time. Therefore, it is not feasible to collect comprehensive disease patterns in a training dataset. To tackle this, we propose two kinds of approaches: image synthesis-based late stage cancer augmentation, and semi-supervised learning designed for T-stage prediction. In the image synthesis data augmentation approach, we generated advanced cancer images from labels. The real cancer labels were deformed to resemble advanced cancer labels by artificial cancer progress simulation. Next, we introduce a T-staging loss which enables us to train segmentation models from per-image T-stage labels. The loss works to keep inclusion/invasion relationships between the rectum and cancer region consistent with the ground truth T-stage. The verification tests show that the proposed method obtains the best sensitivity (0.76) and specificity (0.80) in distinguishing between over T3 stage and under T2. In the ablation studies, our semi-supervised learning approach with the T-staging loss improved specificity by 0.13. Adding the image synthesis-based data augmentation improved the DICE score of the invasion cancer area by 0.08 from baseline.
Submitted 7 December, 2023; originally announced December 2023.
Comments: 10 pages, 7 figures, Accepted to Data Augmentation, Labeling, and Imperfections (DALI) at MICCAI 2022

8. arXiv:2309.14759 [cs.GR, cs.CV]
Diffusion-based Holistic Texture Rectification and Synthesis
Authors: Guoqing Hao, Satoshi Iizuka, Kensho Hara, Edgar Simo-Serra, Hirokatsu Kataoka, Kazuhiro Fukui
Abstract: We present a novel framework for rectifying occlusions and distortions in degraded texture samples from natural images. Traditional texture synthesis approaches focus on generating textures from pristine samples, which necessitate meticulous preparation by humans and are often unattainable in most natural images. These challenges stem from the frequent occlusions and distortions of texture samples in natural images due to obstructions and variations in object surface geometry. To address these issues, we propose a framework that synthesizes holistic textures from degraded samples in natural images, extending the applicability of exemplar-based texture synthesis techniques. Our framework utilizes a conditional Latent Diffusion Model (LDM) with a novel occlusion-aware latent transformer. This latent transformer not only effectively encodes texture features from partially-observed samples necessary for the generation process of the LDM, but also explicitly captures long-range dependencies in samples with large occlusions. To train our model, we introduce a method for generating synthetic data by applying geometric transformations and free-form mask generation to clean textures. Experimental results demonstrate that our framework significantly outperforms existing methods both qualitatively and quantitatively. Furthermore, we conduct comprehensive ablation studies to validate the different components of our proposed framework. Results are corroborated by a perceptual user study which highlights the efficiency of our proposed approach.
Submitted 26 September, 2023; originally announced September 2023.
Comments: SIGGRAPH Asia 2023 Conference Paper

9. arXiv:2308.10111 [cs.CV, cs.GR]
Controllable Multi-domain Semantic Artwork Synthesis
Authors: Yuantian Huang, Satoshi Iizuka, Edgar Simo-Serra, Kazuhiro Fukui
Abstract: We present a novel framework for multi-domain synthesis of artwork from semantic layouts. One of the main limitations of this challenging task is the lack of publicly available segmentation datasets for art synthesis. To address this problem, we propose a dataset, which we call ArtSem, that contains 40,000 images of artwork from 4 different domains with their corresponding semantic label maps. We generate the dataset by first extracting semantic maps from landscape photography and then propose a conditional Generative Adversarial Network (GAN)-based approach to generate high-quality artwork from the semantic maps without necessitating paired training data. Furthermore, we propose an artwork synthesis model that uses domain-dependent variational encoders for high-quality multi-domain synthesis. The model is improved and complemented with a simple but effective normalization method, based on normalizing both the semantic and style jointly, which we call Spatially STyle-Adaptive Normalization (SSTAN). In contrast to previous methods that only take semantic layout as input, our model is able to learn a joint representation of both style and semantic information, which leads to better generation quality for synthesizing artistic images. Results indicate that our model learns to separate the domains in the latent space, and thus, by identifying the hyperplanes that separate the different domains, we can also perform fine-grained control of the synthesized artwork. By combining our proposed dataset and approach, we are able to generate user-controllable artwork that is of higher quality than existing approaches.
Submitted 19 August, 2023; originally announced August 2023.
Comments: 15 pages, accepted by CVMJ, to appear

10. arXiv:2303.18248 [cs.CV]
Towards Flexible Multi-modal Document Models
Authors: Naoto Inoue, Kotaro Kikuchi, Edgar Simo-Serra, Mayu Otani, Kota Yamaguchi
Abstract: Creative workflows for generating graphical documents involve complex inter-related tasks, such as aligning elements, choosing appropriate fonts, or employing aesthetically harmonious colors. In this work, we attempt at building a holistic model that can jointly solve many different design tasks. Our model, which we denote by FlexDM, treats vector graphic documents as a set of multi-modal elements, and learns to predict masked fields such as element type, position, styling attributes, image, or text, using a unified architecture. Through the use of explicit multi-task learning and in-domain pre-training, our model can better capture the multi-modal relationships among the different document fields. Experimental results corroborate that our single FlexDM is able to successfully solve a multitude of different design tasks, while achieving performance that is competitive with task-specific and costly baselines.
Submitted 31 March, 2023; originally announced March 2023.
Comments: To be published in CVPR2023 (highlight), project page: https://cyberagentailab.github.io/flex-dm

11. arXiv:2303.08137 [cs.CV, cs.GR]
LayoutDM: Discrete Diffusion Model for Controllable Layout Generation
Authors: Naoto Inoue, Kotaro Kikuchi, Edgar Simo-Serra, Mayu Otani, Kota Yamaguchi
Abstract: Controllable layout generation aims at synthesizing plausible arrangement of element bounding boxes with optional constraints, such as type or position of a specific element. In this work, we try to solve a broad range of layout generation tasks in a single model that is based on discrete state-space diffusion models. Our model, named LayoutDM, naturally handles the structured layout data in the discrete representation and learns to progressively infer a noiseless layout from the initial input, where we model the layout corruption process by modality-wise discrete diffusion. For conditional generation, we propose to inject layout constraints in the form of masking or logit adjustment during inference. We show in the experiments that our LayoutDM successfully generates high-quality layouts and outperforms both task-specific and task-agnostic baselines on several layout tasks.
Submitted 14 March, 2023; originally announced March 2023.
Comments: To be published in CVPR2023, project page: https://cyberagentailab.github.io/layout-dm/
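
A minimal sketch of the generic logit-masking idea mentioned in the abstract (constraint injection during sampling) follows; this is not the LayoutDM implementation, and the logits and constraint shown are hypothetical.

```python
# Illustrative only: constraint injection by logit masking during sampling.
# Tokens that violate a user constraint get their logits set to -inf before
# the softmax, so the sampler can only pick allowed values. Generic version
# of the masking idea, not the LayoutDM code.
import numpy as np

def sample_with_mask(logits: np.ndarray, allowed: np.ndarray, rng) -> int:
    masked = np.where(allowed, logits, -np.inf)   # forbid disallowed tokens
    probs = np.exp(masked - masked.max())
    probs /= probs.sum()
    return int(rng.choice(len(logits), p=probs))

rng = np.random.default_rng(0)
logits = np.array([2.0, 0.5, 1.0, -1.0])           # hypothetical token logits
allowed = np.array([False, True, True, False])     # e.g. element type must be 1 or 2
print(sample_with_mask(logits, allowed, rng))       # always returns 1 or 2
```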

arXiv:2212.11541 [pdf, other] cs.CV cs.MM
Generative Colorization of Structured Mobile Web Pages
Authors: Kotaro Kikuchi, Naoto Inoue, Mayu Otani, Edgar Simo-Serra, Kota Yamaguchi
Abstract: Color is a critical design factor for web pages, affecting important factors such as viewer emotions and the overall trust and satisfaction of a website. Effective coloring requires design knowledge and expertise, but if this process could be automated through data-driven modeling, efficient exploration and alternative workflows would be possible. However, this direction remains underexplored due to the lack of a formalization of the web page colorization problem, datasets, and evaluation protocols. In this work, we propose a new dataset consisting of e-commerce mobile web pages in a tractable format, which are created by simplifying the pages and extracting canonical color styles with a common web browser. The web page colorization problem is then formalized as a task of estimating plausible color styles for a given web page content with a given hierarchical structure of the elements. We present several Transformer-based methods that are adapted to this task by prepending structural message passing to capture hierarchical relationships between elements. Experimental results, including a quantitative evaluation designed for this task, demonstrate the advantages of our methods over statistical and image colorization methods. The code is available at https://github.com/CyberAgentAILab/webcolor.
Submitted 23 January, 2023; v1 submitted 22 December, 2022; originally announced December 2022.
Comments: Accepted to WACV 2023
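
A minimal sketch of the "structural message passing before a Transformer" idea: hierarchy is injected by mixing each element's features with its parent's before the sequence model scores a color palette. The toy DOM tree, dimensions, and module names are assumptions, not the released webcolor code.

```python
import torch
import torch.nn as nn

class ParentChildMessagePassing(nn.Module):
    """One round of message passing along parent->child edges of a DOM tree."""
    def __init__(self, dim):
        super().__init__()
        self.update = nn.Linear(2 * dim, dim)

    def forward(self, node_feats, parent_idx):
        # node_feats: (num_nodes, dim); parent_idx[i] is the parent of node i (root points to itself)
        parent_feats = node_feats[parent_idx]
        return torch.relu(self.update(torch.cat([node_feats, parent_feats], dim=-1)))

# Hypothetical pipeline: message passing to encode hierarchy, then a Transformer encoder,
# then a per-element head over a discrete color palette.
dim, palette_size = 64, 32
mp = ParentChildMessagePassing(dim)
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True), num_layers=2)
color_head = nn.Linear(dim, palette_size)

node_feats = torch.randn(7, dim)                      # 7 elements of one toy page
parent_idx = torch.tensor([0, 0, 0, 1, 1, 2, 2])      # node 0 is the root
h = mp(node_feats, parent_idx)                        # inject hierarchical context
logits = color_head(encoder(h.unsqueeze(0)))          # (1, 7, palette_size) color-style scores
```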

arXiv:2212.00567 [pdf, other] cs.CV cs.RO
P2Net: A Post-Processing Network for Refining Semantic Segmentation of LiDAR Point Cloud based on Consistency of Consecutive Frames
Authors: Yutaka Momma, Weimin Wang, Edgar Simo-Serra, Satoshi Iizuka, Ryosuke Nakamura, Hiroshi Ishikawa
Abstract: We present a lightweight post-processing method to refine the semantic segmentation results of point cloud sequences. Most existing methods usually segment frame by frame and encounter the inherent ambiguity of the problem: based on a measurement in a single frame, labels are sometimes difficult to predict even for humans. To remedy this problem, we propose to explicitly train a network to refine these results predicted by an existing segmentation method. The network, which we call the P2Net, learns the consistency constraints between coincident points from consecutive frames after registration. We evaluate the proposed post-processing method both qualitatively and quantitatively on the SemanticKITTI dataset that consists of real outdoor scenes. The effectiveness of the proposed method is validated by comparing the results predicted by two representative networks with and without the refinement by the post-processing network. Specifically, qualitative visualization validates the key idea that labels of the points that are difficult to predict can be corrected with P2Net. Quantitatively, overall mIoU is improved from 10.5% to 11.7% for PointNet [1] and from 10.8% to 15.9% for PointNet++ [2].
Submitted 1 December, 2022; originally announced December 2022.
Comments: 2020 IEEE International Conference on Systems, Man, and Cybernetics (SMC)
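
To make the consistency idea concrete, here is a hand-crafted stand-in (not the learned P2Net): after registration, coincident points are matched by nearest neighbour and their label probabilities are blended. The radius, blend weight, and random data below are assumptions.

```python
import numpy as np
from scipy.spatial import cKDTree

def consistency_refine(points_t, probs_t, points_prev, probs_prev, radius=0.1):
    """Blend label probabilities of points that coincide with points from the previous
    (already registered) frame. Illustrates the consistency constraint only; P2Net itself
    learns this refinement with a network."""
    tree = cKDTree(points_prev)
    dist, idx = tree.query(points_t, k=1)
    refined = probs_t.copy()
    matched = dist < radius                      # coincident points after registration
    refined[matched] = 0.5 * probs_t[matched] + 0.5 * probs_prev[idx[matched]]
    return refined / refined.sum(axis=1, keepdims=True)

# Toy usage: 1000 points, 20 classes, previous frame already aligned to the current one.
rng = np.random.default_rng(0)
pts_t, pts_prev = rng.normal(size=(1000, 3)), rng.normal(size=(1000, 3))
p_t = rng.dirichlet(np.ones(20), size=1000)
p_prev = rng.dirichlet(np.ones(20), size=1000)
labels = consistency_refine(pts_t, p_t, pts_prev, p_prev).argmax(axis=1)
```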

arXiv:2108.00871 [pdf, other] cs.CV cs.MM
doi: 10.1145/3474085.3475497
Constrained Graphic Layout Generation via Latent Optimization
Authors: Kotaro Kikuchi, Edgar Simo-Serra, Mayu Otani, Kota Yamaguchi
Abstract: It is common in graphic design that humans visually arrange various elements according to their design intent and semantics. For example, a title text almost always appears on top of other elements in a document. In this work, we generate graphic layouts that can flexibly incorporate such design semantics, either specified implicitly or explicitly by a user. We optimize using the latent space of an off-the-shelf layout generation model, allowing our approach to be complementary to and used with existing layout generation models. Our approach builds on a generative layout model based on a Transformer architecture, and formulates the layout generation as a constrained optimization problem where design constraints are used for element alignment, overlap avoidance, or any other user-specified relationship. We show in the experiments that our approach is capable of generating realistic layouts in both constrained and unconstrained generation tasks with a single model. The code is available at https://github.com/ktrk115/const_layout.
Submitted 2 August, 2021; originally announced August 2021.
Comments: Accepted by ACM Multimedia 2021
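
A minimal sketch of latent optimization against differentiable design constraints, assuming a stand-in MLP generator and only two penalties (overlap and left-edge alignment); the actual method searches the latent space of a pretrained Transformer layout model with a richer constraint set.

```python
import torch

def overlap_penalty(boxes):
    """Sum of pairwise intersection areas; boxes are (N, 4) as (x1, y1, x2, y2)."""
    x1, y1, x2, y2 = boxes.unbind(-1)
    ix = (torch.min(x2[:, None], x2) - torch.max(x1[:, None], x1)).clamp(min=0)
    iy = (torch.min(y2[:, None], y2) - torch.max(y1[:, None], y1)).clamp(min=0)
    inter = ix * iy
    return inter.sum() - inter.diagonal().sum()   # exclude each box's overlap with itself

def alignment_penalty(boxes):
    """Encourage each left edge to agree with its nearest neighbour's left edge."""
    x1 = boxes[:, 0]
    diff = (x1[:, None] - x1).abs() + torch.eye(len(x1)) * 1e6
    return diff.min(dim=1).values.sum()

# Stand-in for a pretrained layout generator: maps a latent code to 5 boxes in [0, 1].
generator = torch.nn.Sequential(
    torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 5 * 4), torch.nn.Sigmoid())

z = torch.randn(32, requires_grad=True)
opt = torch.optim.Adam([z], lr=0.05)
for _ in range(200):                              # optimize the latent code, not the weights
    boxes = generator(z).view(5, 4)
    loss = overlap_penalty(boxes) + 0.1 * alignment_penalty(boxes)
    opt.zero_grad(); loss.backward(); opt.step()
```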

arXiv:2009.13798 [pdf, other] eess.IV cs.AI
Automatic Segmentation, Localization, and Identification of Vertebrae in 3D CT Images Using Cascaded Convolutional Neural Networks
Authors: Naoto Masuzawa, Yoshiro Kitamura, Keigo Nakamura, Satoshi Iizuka, Edgar Simo-Serra
Abstract: This paper presents a method for automatic segmentation, localization, and identification of vertebrae in arbitrary 3D CT images. Many previous works do not perform the three tasks simultaneously, and moreover require a priori knowledge of which part of the anatomy is visible in the 3D CT images. Our method tackles all these tasks in a single multi-stage framework without any assumptions. In the first stage, we train a 3D Fully Convolutional Network to find the bounding boxes of the cervical, thoracic, and lumbar vertebrae. In the second stage, we train an iterative 3D Fully Convolutional Network to segment individual vertebrae in the bounding box. The input to the second network has an auxiliary channel in addition to the 3D CT images. Given the segmented vertebra regions in the auxiliary channel, the network outputs the next vertebra. The proposed method is evaluated in terms of segmentation, localization, and identification accuracy with two public datasets of 15 3D CT images from the MICCAI CSI 2014 workshop challenge and 302 3D CT images with various pathologies introduced in [1]. Our method achieved a mean Dice score of 96%, a mean localization error of 8.3 mm, and a mean identification rate of 84%. In summary, our method achieved better performance than all existing works in all three metrics.
Submitted 29 September, 2020; originally announced September 2020.
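
The iterative second stage can be illustrated as a loop in which the previously segmented vertebra is fed back through the auxiliary channel; the dummy 3D convolution, threshold, and stopping rule below are assumptions made only for illustration.

```python
import torch

def segment_vertebrae_iteratively(ct_volume, model, max_vertebrae=24, threshold=0.5):
    """Illustrative loop for the second stage: the previously segmented vertebra is fed back
    as an auxiliary input channel and the network predicts the next vertebra.
    `model` is a stand-in taking a (1, 2, D, H, W) tensor and returning (1, 1, D, H, W) mask logits."""
    aux = torch.zeros_like(ct_volume)            # empty auxiliary channel to start
    masks = []
    for _ in range(max_vertebrae):
        inp = torch.stack([ct_volume, aux], dim=0).unsqueeze(0)   # (1, 2, D, H, W)
        prob = torch.sigmoid(model(inp))[0, 0]
        mask = prob > threshold
        if mask.sum() == 0:                      # no further vertebra found; stop
            break
        masks.append(mask)
        aux = mask.float()                       # condition the next step on this result
    return masks

# Toy usage with a random volume and a dummy 3D convolution as the "network".
model = torch.nn.Conv3d(2, 1, kernel_size=3, padding=1)
ct = torch.randn(16, 32, 32)                     # (D, H, W)
vertebra_masks = segment_vertebrae_iteratively(ct, model)
```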

arXiv:2009.08692 [pdf, other] cs.CV cs.GR
DeepRemaster: Temporal Source-Reference Attention Networks for Comprehensive Video Enhancement
Authors: Satoshi Iizuka, Edgar Simo-Serra
Abstract: The remastering of vintage film comprises a diversity of sub-tasks including super-resolution, noise removal, and contrast enhancement which aim to restore the deteriorated film medium to its original state. Additionally, due to the technical limitations of the time, most vintage film is either recorded in black and white, or has low-quality colors, for which colorization becomes necessary. In this work, we propose a single framework to tackle the entire remastering task semi-interactively. Our work is based on temporal convolutional neural networks with attention mechanisms trained on videos with data-driven deterioration simulation. Our proposed source-reference attention allows the model to handle an arbitrary number of reference color images to colorize long videos without the need for segmentation while maintaining temporal consistency. Quantitative analysis shows that our framework outperforms existing approaches, and that, in contrast to existing approaches, the performance of our framework increases with longer videos and more reference color images.
Submitted 18 September, 2020; originally announced September 2020.
Comments: Accepted to SIGGRAPH Asia 2019. Project page: http://iizuka.cs.tsukuba.ac.jp/projects/remastering/
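
The property highlighted above, handling an arbitrary number of references, follows from cross-attention in which queries come from the source video and keys/values from the concatenated reference features. Below is a generic sketch of that pattern with assumed shapes; it is not the DeepRemaster architecture itself.

```python
import torch
import torch.nn as nn

class SourceReferenceAttention(nn.Module):
    """Attention where queries come from source-video features and keys/values come from
    an arbitrary number of reference color images. Shapes and dimensions are illustrative."""
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, source_feats, reference_feats):
        # source_feats: (num_source_tokens, dim), reference_feats: (num_ref_tokens, dim)
        q, k, v = self.q(source_feats), self.k(reference_feats), self.v(reference_feats)
        attn = torch.softmax(q @ k.t() / q.size(-1) ** 0.5, dim=-1)
        return source_feats + attn @ v            # reference colors pulled into the source features

# The number of references can vary freely because they are just concatenated tokens.
layer = SourceReferenceAttention(dim=64)
src = torch.randn(100, 64)                        # e.g. flattened features of one frame
refs = torch.randn(3 * 50, 64)                    # three reference images, 50 tokens each
out = layer(src, refs)
```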

arXiv:2009.08674 [pdf, other] cs.CV
TopNet: Topology Preserving Metric Learning for Vessel Tree Reconstruction and Labelling
Authors: Deepak Keshwani, Yoshiro Kitamura, Satoshi Ihara, Satoshi Iizuka, Edgar Simo-Serra
Abstract: Reconstructing Portal Vein and Hepatic Vein trees from contrast enhanced abdominal CT scans is a prerequisite for preoperative liver surgery simulation. Existing deep learning based methods treat vascular tree reconstruction as a semantic segmentation problem. However, vessels such as the hepatic and portal vein look very similar locally and need to be traced to their source for robust label assignment. Therefore, semantic segmentation by looking at a local 3D patch results in noisy misclassifications. To tackle this, we propose a novel multi-task deep learning architecture for vessel tree reconstruction. The network architecture simultaneously solves the task of detecting voxels on vascular centerlines (i.e. nodes) and estimates connectivity between center-voxels (edges) in the tree structure to be reconstructed. Further, we propose a novel connectivity metric which considers both inter-class distance and intra-class topological distance between center-voxel pairs. Vascular trees are reconstructed starting from the vessel source using the learned connectivity metric and the shortest path tree algorithm. A thorough evaluation on the public IRCAD dataset shows that the proposed method considerably outperforms existing semantic segmentation based methods. To the best of our knowledge, this is the first deep learning based approach which learns multi-label tree structure connectivity from images.
Submitted 18 September, 2020; originally announced September 2020.
Comments: Accepted in MICCAI 2020
Report number: 603
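
The reconstruction step lends itself to a short sketch: build a sparse graph over detected centerline voxels, weight the edges with a connectivity cost, and grow a shortest-path tree from the source. Here plain Euclidean distance stands in for the learned metric, and the k-nearest-candidate graph is an assumption.

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import dijkstra

def shortest_path_tree(center_voxels, connectivity_cost, source_idx, k=8):
    """Build a tree over centerline voxels by running Dijkstra from the vessel source.
    `connectivity_cost(i, j)` stands in for the learned connectivity metric."""
    n = len(center_voxels)
    rows, cols, costs = [], [], []
    for i in range(n):
        d = np.linalg.norm(center_voxels - center_voxels[i], axis=1)
        for j in np.argsort(d)[1:k + 1]:          # connect each node to its k nearest candidates
            rows.append(i); cols.append(j); costs.append(connectivity_cost(i, j))
    graph = csr_matrix((costs, (rows, cols)), shape=(n, n))
    _, predecessors = dijkstra(graph, directed=False, indices=source_idx, return_predecessors=True)
    return predecessors                            # predecessors[v] is the parent of v in the tree

# Toy usage: Euclidean distance as a placeholder for the learned metric.
rng = np.random.default_rng(0)
pts = rng.uniform(size=(50, 3))
parents = shortest_path_tree(pts, lambda i, j: float(np.linalg.norm(pts[i] - pts[j])), source_idx=0)
```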

arXiv:2003.11211 [pdf, other] cs.CV
Two-stage Discriminative Re-ranking for Large-scale Landmark Retrieval
Authors: Shuhei Yokoo, Kohei Ozaki, Edgar Simo-Serra, Satoshi Iizuka
Abstract: We propose an efficient pipeline for large-scale landmark image retrieval that addresses the diversity of the dataset through two-stage discriminative re-ranking. Our approach is based on embedding the images in a feature space using a convolutional neural network trained with a cosine softmax loss. Due to the variance of the images, which include extreme viewpoint changes such as having to retrieve images of the exterior of a landmark from images of the interior, this is very challenging for approaches based exclusively on visual similarity. Our proposed re-ranking approach improves the results in two steps: in the sort-step, we use k-nearest neighbor search with soft-voting to sort the retrieved results based on their label similarity to the query images, and in the insert-step, we add additional samples from the dataset that were not retrieved by image similarity. This approach allows overcoming the low visual diversity in retrieved images. In-depth experimental results show that the proposed approach significantly outperforms existing approaches on the challenging Google Landmarks Datasets. Using our methods, we achieved 1st place in the Google Landmark Retrieval 2019 challenge and 3rd place in the Google Landmark Recognition 2019 challenge on Kaggle. Our code is publicly available here: https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution
Submitted 25 March, 2020; originally announced March 2020.
Comments: 10 pages, 5 figures
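
The sort-step can be pictured with a small numpy sketch: estimate the query's landmark label by similarity-weighted voting over its k nearest labelled training images, then move gallery results that share that label to the front. The random embeddings, the value of k, and the hard partition below are assumptions, not the competition code.

```python
import numpy as np

def sort_step(query_vec, gallery_vecs, gallery_labels, train_vecs, train_labels, k=5):
    """Re-rank retrieved gallery images by label similarity to the query.
    The query's landmark label is estimated by soft-voting over its k nearest
    labelled training images (cosine similarity on L2-normalised embeddings)."""
    sims = train_vecs @ query_vec
    top = np.argsort(-sims)[:k]
    votes = {}
    for i in top:                                  # soft vote: weight by similarity
        votes[train_labels[i]] = votes.get(train_labels[i], 0.0) + float(sims[i])
    query_label = max(votes, key=votes.get)
    order = np.argsort(-(gallery_vecs @ query_vec))
    # Stable partition: results sharing the query's estimated label come first.
    same = [i for i in order if gallery_labels[i] == query_label]
    other = [i for i in order if gallery_labels[i] != query_label]
    return np.concatenate([same, other]).astype(int)

# Toy usage with random unit embeddings and integer landmark labels.
rng = np.random.default_rng(0)
norm = lambda x: x / np.linalg.norm(x, axis=-1, keepdims=True)
q = norm(rng.normal(size=128))
gal, tr = norm(rng.normal(size=(200, 128))), norm(rng.normal(size=(500, 128)))
ranking = sort_step(q, gal, rng.integers(0, 20, 200), tr, rng.integers(0, 20, 500))
```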

arXiv:1909.04021 [pdf, other] cs.CV cs.LG stat.ML
Understanding the Effects of Pre-Training for Object Detectors via Eigenspectrum
Authors: Yosuke Shinya, Edgar Simo-Serra, Taiji Suzuki
Abstract: ImageNet pre-training has been regarded as essential for training accurate object detectors for a long time. Recently, it has been shown that object detectors trained from randomly initialized weights can be on par with those fine-tuned from ImageNet pre-trained models. However, the effects of pre-training and the differences caused by pre-training are still not fully understood. In this paper, we analyze the eigenspectrum dynamics of the covariance matrix of each feature map in object detectors. Based on our analysis on ResNet-50, Faster R-CNN with FPN, and Mask R-CNN, we show that object detectors trained from ImageNet pre-trained models and those trained from scratch behave differently from each other even if both object detectors have similar accuracy. Furthermore, we propose a method for automatically determining the widths (the numbers of channels) of object detectors based on the eigenspectrum. We train Faster R-CNN with FPN from randomly initialized weights, and show that our method can reduce ~27% of the parameters of ResNet-50 without increasing Multiply-Accumulate operations or losing accuracy. Our results indicate that we should develop more appropriate methods for transferring knowledge from image classification to object detection (or other tasks).
Submitted 9 September, 2019; originally announced September 2019.
Comments: ICCV 2019 Workshop on Neural Architects (Oral)
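
The quantity analyzed above, the eigenspectrum of a feature map's channel covariance, is straightforward to compute; a small sketch follows, with random activations standing in for real backbone features.

```python
import torch

def feature_covariance_eigenspectrum(feature_map):
    """Eigenvalues of the channel covariance of one feature map.
    feature_map: (batch, channels, height, width) activations from a detector backbone."""
    b, c, h, w = feature_map.shape
    x = feature_map.permute(0, 2, 3, 1).reshape(-1, c)    # every spatial position is a sample
    x = x - x.mean(dim=0, keepdim=True)
    cov = x.t() @ x / (x.shape[0] - 1)                    # (channels, channels) covariance
    return torch.linalg.eigvalsh(cov).flip(0)             # eigenvalues in descending order

# Toy usage: random activations standing in for a ResNet stage output.
eig = feature_covariance_eigenspectrum(torch.randn(2, 256, 14, 14))
explained = eig.cumsum(0) / eig.sum()                     # how quickly the variance concentrates
```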

arXiv:1908.11506 [pdf, other] eess.IV cs.CV
Virtual Thin Slice: 3D Conditional GAN-based Super-resolution for CT Slice Interval
Authors: Akira Kudo, Yoshiro Kitamura, Yuanzhong Li, Satoshi Iizuka, Edgar Simo-Serra
Abstract: Many CT slice images are stored with large slice intervals to reduce storage size in clinical practice. This leads to low resolution perpendicular to the slice images (i.e., the z-axis), which is insufficient for 3D visualization or image analysis. In this paper, we present a novel architecture based on conditional Generative Adversarial Networks (cGANs) with the goal of generating high resolution images of main body parts including the head, chest, abdomen and legs. However, GANs are known to have difficulty generating a diversity of patterns due to a phenomenon known as mode collapse. To overcome the lack of generated pattern variety, we propose to condition the discriminator on the different body parts. Furthermore, our generator networks are extended to be three dimensional fully convolutional neural networks, allowing for the generation of high resolution images from arbitrary fields of view. In our verification tests, we show that the proposed method obtains the best scores by PSNR/SSIM metrics and a Visual Turing Test, allowing for accurate reproduction of the principal anatomy in high resolution. We expect that the proposed method contributes to effective utilization of the existing vast amounts of thick CT images stored in hospitals.
Submitted 1 September, 2019; v1 submitted 29 August, 2019; originally announced August 2019.
Comments: 10 pages, 6 figures, Accepted to Machine Learning for Medical Image Reconstruction (MLMIR) at MICCAI 2019
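
One generic way to realize "condition the discriminator on the body part" is to broadcast a body-part embedding as extra input channels; the sketch below shows that pattern with assumed layer sizes and is not the paper's network.

```python
import torch
import torch.nn as nn

class BodyPartConditionalDiscriminator(nn.Module):
    """Discriminator conditioned on a body-part label (head/chest/abdomen/legs) by
    concatenating a broadcast label embedding to the volume; a sketch, not the paper's model."""
    def __init__(self, num_parts=4, emb_dim=8):
        super().__init__()
        self.embed = nn.Embedding(num_parts, emb_dim)
        self.net = nn.Sequential(
            nn.Conv3d(1 + emb_dim, 16, 4, stride=2, padding=1), nn.LeakyReLU(0.2),
            nn.Conv3d(16, 1, 4, stride=2, padding=1))

    def forward(self, volume, part_label):
        # volume: (B, 1, D, H, W); part_label: (B,) integer body-part ids
        e = self.embed(part_label)[:, :, None, None, None].expand(-1, -1, *volume.shape[2:])
        return self.net(torch.cat([volume, e], dim=1)).mean(dim=(1, 2, 3, 4))  # per-sample realism score

d = BodyPartConditionalDiscriminator()
score = d(torch.randn(2, 1, 16, 64, 64), torch.tensor([0, 2]))
```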

arXiv:1703.08966 [pdf, other] cs.CV
Mastering Sketching: Adversarial Augmentation for Structured Prediction
Authors: Edgar Simo-Serra, Satoshi Iizuka, Hiroshi Ishikawa
Abstract: We present an integral framework for training sketch simplification networks that convert challenging rough sketches into clean line drawings. Our approach augments a simplification network with a discriminator network, training both networks jointly so that the discriminator network discerns whether a line drawing is real training data or the output of the simplification network, which in turn tries to fool it. This approach has two major advantages. First, because the discriminator network learns the structure in line drawings, it encourages the output sketches of the simplification network to be more similar in appearance to the training sketches. Second, we can also train the simplification network with additional unsupervised data, using the discriminator network as a substitute teacher. Thus, by adding only rough sketches without simplified line drawings, or only line drawings without the original rough sketches, we can improve the quality of the sketch simplification. We show how our framework can be used to train models that significantly outperform the state of the art in the sketch simplification task, despite using the same architecture for inference. We additionally present an approach to optimize for a single image, which improves accuracy at the cost of additional computation time. Finally, we show that, using the same framework, it is possible to train the network to perform the inverse problem, i.e., convert simple line sketches into pencil drawings, which is not possible using the standard mean squared error loss. We validate our framework with two user tests, where our approach is preferred to the state of the art in sketch simplification 92.3% of the time and obtains 1.2 more points on a scale of 1 to 5.
Submitted 27 March, 2017; originally announced March 2017.
Comments: 12 pages, 14 figures
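
The training objective described above, supervised reconstruction plus an adversarial term that also covers unpaired rough sketches, can be sketched as a single generator loss; the stand-in networks, the BCE adversarial formulation, and the weighting are assumptions rather than the paper's exact setup.

```python
import torch
import torch.nn.functional as F

def generator_loss(simplifier, discriminator, rough, clean_gt, rough_unsup, alpha=1.0):
    """Supervised reconstruction plus adversarial terms; the unpaired rough sketches are
    trained only through the discriminator ("substitute teacher"). All networks are stand-ins."""
    pred = simplifier(rough)
    sup = F.mse_loss(pred, clean_gt)                      # paired rough -> clean supervision
    d_pred = discriminator(pred)
    d_unsup = discriminator(simplifier(rough_unsup))      # no clean target for these sketches
    adv = F.binary_cross_entropy_with_logits(d_pred, torch.ones_like(d_pred)) \
        + F.binary_cross_entropy_with_logits(d_unsup, torch.ones_like(d_unsup))
    return sup + alpha * adv                              # the discriminator is trained separately

# Toy usage with tiny convolutional "networks" on 64x64 sketches.
simplifier = torch.nn.Conv2d(1, 1, 1)
discriminator = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 4, stride=4), torch.nn.Flatten())
loss = generator_loss(simplifier, discriminator,
                      torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64))
```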

arXiv:1604.08164 [pdf, other] cs.CV
Understanding Human-Centric Images: From Geometry to Fashion
Authors: Edgar Simo-Serra
Abstract: Understanding humans from photographs has always been a fundamental goal of computer vision. In this thesis we have developed a hierarchy of tools that cover a wide range of topics with the objective of understanding humans from a monocular RGB image: from low-level feature point descriptors to high-level fashion-aware conditional random field models. In order to build these high-level models it is paramount to have a battery of robust and reliable low- and mid-level cues. Along these lines, we have proposed two low-level keypoint descriptors: one based on the theory of heat diffusion on images, and the other that uses a convolutional neural network to learn discriminative image patch representations. We also introduce distinct low-level generative models for representing human pose: in particular we present a discrete model based on a directed acyclic graph and a continuous model that consists of poses clustered on a Riemannian manifold. As mid-level cues we propose two 3D human pose estimation algorithms: one that estimates the 3D pose given a noisy 2D estimation, and an approach that simultaneously estimates both the 2D and 3D pose. Finally, we formulate higher-level models built upon low- and mid-level cues for understanding humans from single images. Concretely, we focus on two different tasks in the context of fashion: semantic segmentation of clothing, and predicting fashionability from images with metadata to ultimately provide fashion advice to the user. For all presented approaches we present extensive results and comparisons against the state of the art and show significant improvements on the entire variety of tasks we tackle.
Submitted 13 December, 2015; originally announced April 2016.
Comments: PhD Thesis, May 2015. BarcelonaTech. 169 pages

arXiv:1509.02130 [pdf, other] cs.CV
Structured Prediction with Output Embeddings for Semantic Image Annotation
Authors: Ariadna Quattoni, Arnau Ramisa, Pranava Swaroop Madhyastha, Edgar Simo-Serra, Francesc Moreno-Noguer
Abstract: We address the task of annotating images with semantic tuples. Solving this problem requires an algorithm which is able to deal with hundreds of classes for each argument of the tuple. In such contexts, data sparsity becomes a key challenge, as there will be a large number of classes for which only a few examples are available. We propose handling this by incorporating feature representations of both the inputs (images) and outputs (argument classes) into a factorized log-linear model, and exploiting the flexibility of scoring functions based on bilinear forms. Experiments show that integrating feature representations of the outputs in the structured prediction model leads to better overall predictions. We also conclude that the best output representation is specific for each type of argument.
Submitted 7 September, 2015; originally announced September 2015.
Comments: 11 pages
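
The bilinear scoring idea, pairing an input feature with an output (class) embedding through a learned matrix, is easy to write down; the dimensions, random vectors, and single-argument setting below are assumptions made for illustration.

```python
import numpy as np

def bilinear_score(image_feat, class_feat, W):
    """Score of assigning one argument class to an image: phi(x)^T W psi(y).
    The paper's factorized log-linear model combines several such terms,
    one per argument of the semantic tuple."""
    return image_feat @ W @ class_feat

# Toy usage: rank candidate classes for one argument by their bilinear score.
rng = np.random.default_rng(0)
d_img, d_cls, num_classes = 512, 100, 300
W = rng.normal(scale=0.01, size=(d_img, d_cls))          # learned during training
phi_x = rng.normal(size=d_img)                           # image representation
psi_Y = rng.normal(size=(num_classes, d_cls))            # output (class) embeddings
scores = psi_Y @ W.T @ phi_x                             # the same bilinear form, all classes at once
best_class = int(np.argmax(scores))
```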

arXiv:1412.6537 [pdf, other] cs.CV
Fracking Deep Convolutional Image Descriptors
Authors: Edgar Simo-Serra, Eduard Trulls, Luis Ferraz, Iasonas Kokkinos, Francesc Moreno-Noguer
Abstract: In this paper we propose a novel framework for learning local image descriptors in a discriminative manner. For this purpose we explore a siamese architecture of Deep Convolutional Neural Networks (CNN), with a Hinge embedding loss on the L2 distance between descriptors. Since a siamese architecture uses pairs rather than single image patches to train, there exist a large number of positive samples and an exponential number of negative samples. We propose to explore this space with a stochastic sampling of the training set, in combination with an aggressive mining strategy over both the positive and negative samples which we denote as "fracking". We perform a thorough evaluation of the architecture hyper-parameters, and demonstrate large performance gains compared to both standard CNN learning strategies, hand-crafted image descriptors like SIFT, and the state-of-the-art on learned descriptors: up to 2.5x vs SIFT and 1.5x vs the state-of-the-art in terms of the area under the curve (AUC) of the Precision-Recall curve.
Submitted 25 February, 2015; v1 submitted 19 December, 2014; originally announced December 2014.
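
A compact sketch of the two ingredients named in the abstract, a hinge embedding loss on the L2 distance between paired descriptors and aggressive mining of the hardest pairs; the margin, keep ratio, and random descriptors are assumptions rather than the paper's settings.

```python
import torch
import torch.nn.functional as F

def hinge_embedding_pair_loss(desc_a, desc_b, is_match, margin=2.0):
    """Hinge embedding loss on the L2 distance between paired descriptors:
    matching pairs are pulled together, non-matching pairs pushed beyond the margin."""
    d = F.pairwise_distance(desc_a, desc_b)
    return torch.where(is_match, d, F.relu(margin - d)).mean()

def mine_hard_pairs(desc_a, desc_b, is_match, keep_ratio=0.5, margin=2.0):
    """Aggressive mining ("fracking"-style): keep only the hardest fraction of pairs,
    i.e. the ones with the largest per-pair loss."""
    d = F.pairwise_distance(desc_a, desc_b)
    per_pair = torch.where(is_match, d, F.relu(margin - d))
    k = max(1, int(keep_ratio * len(per_pair)))
    hard = per_pair.topk(k).indices
    return desc_a[hard], desc_b[hard], is_match[hard]

# Toy usage with random 128-D descriptors standing in for the siamese CNN outputs.
a, b = torch.randn(64, 128), torch.randn(64, 128)
match = torch.rand(64) > 0.5
loss = hinge_embedding_pair_loss(*mine_hard_pairs(a, b, match))
```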