Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 85 results for author: <span class="mathjax">Huo, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Huo%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huo, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huo%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huo, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11916">arXiv:2502.11916</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11916">pdf</a>, <a href="https://arxiv.org/format/2502.11916">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EssayJudge: A Multi-Granular Benchmark for Assessing Automated Essay Scoring Capabilities of Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jiamin Su</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yibo Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+F">Fangteng Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jingheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiahao Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Huiyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xuming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11916v1-abstract-short" style="display: inline;"> Automated Essay Scoring (AES) plays a crucial role in educational assessment by providing scalable and consistent evaluations of writing tasks. 
However, traditional AES systems face three major challenges: (1) reliance on handcrafted features that limit generalizability, (2) difficulty in capturing fine-grained traits like coherence and argumentation, and (3) inability to handle multimodal context&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11916v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11916v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11916v1-abstract-full" style="display: none;"> Automated Essay Scoring (AES) plays a crucial role in educational assessment by providing scalable and consistent evaluations of writing tasks. However, traditional AES systems face three major challenges: (1) reliance on handcrafted features that limit generalizability, (2) difficulty in capturing fine-grained traits like coherence and argumentation, and (3) inability to handle multimodal contexts. In the era of Multimodal Large Language Models (MLLMs), we propose EssayJudge, the first multimodal benchmark to evaluate AES capabilities across lexical-, sentence-, and discourse-level traits. By leveraging MLLMs&#39; strengths in trait-specific scoring and multimodal context understanding, EssayJudge aims to offer precise, context-rich evaluations without manual feature engineering, addressing longstanding AES limitations. Our experiments with 18 representative MLLMs reveal gaps in AES performance compared to human evaluation, particularly in discourse-level traits, highlighting the need for further advancements in MLLM-based AES research. Our dataset and code will be available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11916v1-abstract-full').style.display = 'none'; document.getElementById('2502.11916v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">JS and YY are co-first authors. 
XH is the corresponding author</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11090">arXiv:2502.11090</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11090">pdf</a>, <a href="https://arxiv.org/format/2502.11090">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+H">Hongye Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanming Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+S">Sijia Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Ziyue Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Z">Zhixin Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Z">Zhe Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Fan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Boyan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tianpei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+F">Fanyu Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11090v2-abstract-short" style="display: inline;"> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM&#39;s capability of identifying and handling unsafe information in detail. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11090v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11090v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11090v2-abstract-full" style="display: none;"> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM&#39;s capability of identifying and handling unsafe information in detail. 
To address these issues, we propose a fine-grained benchmark SafeDialBench for evaluating the safety of LLMs across various jailbreak attacks in multi-turn dialogues. Specifically, we design a two-tier hierarchical safety taxonomy that considers 6 safety dimensions and generates more than 4000 multi-turn dialogues in both Chinese and English under 22 dialogue scenarios. We employ 7 jailbreak attack strategies, such as reference attack and purpose reverse, to enhance the dataset quality for dialogue generation. Notably, we construct an innovative assessment framework of LLMs, measuring capabilities in detecting, and handling unsafe information and maintaining consistency when facing jailbreak attacks. Experimental results across 17 LLMs reveal that Yi-34B-Chat and GLM4-9B-Chat demonstrate superior safety performance, while Llama3.1-8B-Instruct and o3-mini exhibit safety vulnerabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11090v2-abstract-full').style.display = 'none'; document.getElementById('2502.11090v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11051">arXiv:2502.11051</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11051">pdf</a>, <a href="https://arxiv.org/format/2502.11051">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MMUNLEARNER: Reformulating Multimodal Machine Unlearning in the Era of Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiahao Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yibo Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xu Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">Yuanhuiyi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+X">Xin Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhihua Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xuming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11051v1-abstract-short" style="display: inline;"> Recent progress in Machine Unlearning (MU) has introduced solutions for the selective removal of private or sensitive information encoded within deep neural networks. Nonetheless, MU for Multimodal Large Language Models (MLLMs) remains in its nascent phase. 
Therefore, we propose to reformulate the task of multimodal MU in the era of MLLMs, which aims to erase only the visual patterns associated wi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11051v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11051v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11051v1-abstract-full" style="display: none;"> Recent progress in Machine Unlearning (MU) has introduced solutions for the selective removal of private or sensitive information encoded within deep neural networks. Nonetheless, MU for Multimodal Large Language Models (MLLMs) remains in its nascent phase. Therefore, we propose to reformulate the task of multimodal MU in the era of MLLMs, which aims to erase only the visual patterns associated with a given entity while preserving the corresponding textual knowledge encoded within the original parameters of the language model backbone. Furthermore, we develop a novel geometry-constrained gradient descent method MMUnlearner. It updates the weights of MLLMs with a weight saliency map jointly restricted by the remaining concepts and textual knowledge during unlearning, thereby preserving parameters essential for non-target knowledge. Extensive experiments demonstrate that MMUnlearner surpasses baselines that finetuning MLLMs with VQA data directly through Gradient Ascent (GA) or Negative Preference Optimization (NPO), across all evaluation dimensions. Our code will be released upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11051v1-abstract-full').style.display = 'none'; document.getElementById('2502.11051v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
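For readers unfamiliar with the baselines named in the abstract above, the sketch below shows the general shape of a Gradient Ascent (GA) unlearning step with a per-parameter mask restricting which weights may change. It is a hypothetical illustration assuming a HuggingFace-style model whose forward pass returns a .loss; it is not the MMUnlearner algorithm, and the saliency masks here are placeholders rather than the paper's saliency-map construction.

```python
import torch

def ga_unlearning_step(model, forget_batch, saliency_masks, lr=1e-5):
    """One gradient-ascent step on a forget-set batch, applied only where the mask is 1."""
    model.zero_grad()
    loss = model(**forget_batch).loss       # assumes a HuggingFace-style forward returning .loss
    loss.backward()
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.grad is None:
                continue
            # Hypothetical per-parameter mask: 1 = free to update, 0 = frozen.
            mask = saliency_masks.get(name, torch.ones_like(param))
            param.add_(lr * mask * param.grad)   # ascend the loss on the forget set
    return loss.item()
```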
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10097">arXiv:2502.10097</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10097">pdf</a>, <a href="https://arxiv.org/format/2502.10097">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Causal Information Prioritization for Efficient Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+H">Hongye Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Fan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tianpei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10097v1-abstract-short" style="display: inline;"> Current Reinforcement Learning (RL) methods often suffer from sample-inefficiency, resulting from blind exploration strategies that neglect causal relationships among states, actions, and rewards. Although recent causal approaches aim to address this problem, they lack grounded modeling of reward-guided causal understanding of states and actions for goal-orientation, thus impairing learning effici&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10097v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10097v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10097v1-abstract-full" style="display: none;"> Current Reinforcement Learning (RL) methods often suffer from sample-inefficiency, resulting from blind exploration strategies that neglect causal relationships among states, actions, and rewards. Although recent causal approaches aim to address this problem, they lack grounded modeling of reward-guided causal understanding of states and actions for goal-orientation, thus impairing learning efficiency. To tackle this issue, we propose a novel method named Causal Information Prioritization (CIP) that improves sample efficiency by leveraging factored MDPs to infer causal relationships between different dimensions of states and actions with respect to rewards, enabling the prioritization of causal information. Specifically, CIP identifies and leverages causal relationships between states and rewards to execute counterfactual data augmentation to prioritize high-impact state features under the causal understanding of the environments. Moreover, CIP integrates a causality-aware empowerment learning objective, which significantly enhances the agent&#39;s execution of reward-guided actions for more efficient exploration in complex environments. To fully assess the effectiveness of CIP, we conduct extensive experiments across 39 tasks in 5 diverse continuous control environments, encompassing both locomotion and manipulation skills learning with pixel-based and sparse reward settings. 
Experimental results demonstrate that CIP consistently outperforms existing RL methods across a wide range of scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10097v1-abstract-full').style.display = 'none'; document.getElementById('2502.10097v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10077">arXiv:2502.10077</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10077">pdf</a>, <a href="https://arxiv.org/format/2502.10077">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Empowerment Gain through Causal Structure Learning in Model-Based RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+H">Hongye Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Fan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+S">Shaokang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tianpei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10077v1-abstract-short" style="display: inline;"> In Model-Based Reinforcement Learning (MBRL), incorporating causal structures into dynamics models provides agents with a structured understanding of the environments, enabling efficient decision. Empowerment as an intrinsic motivation enhances the ability of agents to actively control their environments by maximizing the mutual information between future states and actions. We posit that empowerm&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10077v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10077v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10077v1-abstract-full" style="display: none;"> In Model-Based Reinforcement Learning (MBRL), incorporating causal structures into dynamics models provides agents with a structured understanding of the environments, enabling efficient decision. Empowerment as an intrinsic motivation enhances the ability of agents to actively control their environments by maximizing the mutual information between future states and actions. We posit that empowerment coupled with causal understanding can improve controllability, while enhanced empowerment gain can further facilitate causal reasoning in MBRL. 
To improve learning efficiency and controllability, we propose a novel framework, Empowerment through Causal Learning (ECL), where an agent with the awareness of causal dynamics models achieves empowerment-driven exploration and optimizes its causal structure for task learning. Specifically, ECL operates by first training a causal dynamics model of the environment based on collected data. We then maximize empowerment under the causal structure for exploration, simultaneously using data gathered through exploration to update causal dynamics model to be more controllable than dense dynamics model without causal structure. In downstream task learning, an intrinsic curiosity reward is included to balance the causality, mitigating overfitting. Importantly, ECL is method-agnostic and is capable of integrating various causal discovery methods. We evaluate ECL combined with 3 causal discovery methods across 6 environments including pixel-based tasks, demonstrating its superior performance compared to other causal MBRL methods, in terms of causal discovery, sample efficiency, and asymptotic performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10077v1-abstract-full').style.display = 'none'; document.getElementById('2502.10077v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02871">arXiv:2502.02871</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02871">pdf</a>, <a href="https://arxiv.org/format/2502.02871">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Position: Multimodal Large Language Models Can Significantly Advance Scientific Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yibo Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiahao Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jingheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+Z">Zhendong Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xuming Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Gomes%2C+C">Carla Gomes</a>, <a href="/search/cs?searchtype=author&amp;query=Selman%2C+B">Bart Selman</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02871v1-abstract-short" style="display: inline;"> Scientific reasoning, the process through which humans apply logic, evidence, and critical thinking to explore and interpret scientific phenomena, is essential in advancing knowledge reasoning across diverse fields. 
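As background for the empowerment objective referred to in the two preceding abstracts, the standard one-step definition is the maximal mutual information between the agent's action and the resulting next state (a textbook formulation, not the specific estimator used in either paper):

$$\mathcal{E}(s_t) \;=\; \max_{p(a_t)}\; I\!\left(S_{t+1};\, A_t \mid s_t\right).$$

States from which actions lead to many distinguishable futures receive high empowerment, which is why it is used as an exploration-oriented intrinsic signal in both CIP and ECL.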
6. arXiv:2502.02871 [pdf, other] (cs.CL, cs.AI)
Position: Multimodal Large Language Models Can Significantly Advance Scientific Reasoning
Authors: Yibo Yan, Shen Wang, Jiahao Huo, Jingheng Ye, Zhendong Chu, Xuming Hu, Philip S. Yu, Carla Gomes, Bart Selman, Qingsong Wen
Abstract: Scientific reasoning, the process through which humans apply logic, evidence, and critical thinking to explore and interpret scientific phenomena, is essential in advancing knowledge reasoning across diverse fields. However, despite significant progress, current scientific reasoning models still struggle with generalization across domains and often fall short of multimodal perception. Multimodal Large Language Models (MLLMs), which integrate text, images, and other modalities, present an exciting opportunity to overcome these limitations and enhance scientific reasoning. Therefore, this position paper argues that MLLMs can significantly advance scientific reasoning across disciplines such as mathematics, physics, chemistry, and biology. First, we propose a four-stage research roadmap of scientific reasoning capabilities, and highlight the current state of MLLM applications in scientific reasoning, noting their ability to integrate and reason over diverse data types. Second, we summarize the key challenges that remain obstacles to achieving MLLM's full potential. To address these challenges, we propose actionable insights and suggestions for the future. Overall, our work offers a novel perspective on MLLM integration with scientific reasoning, providing the LLM community with a valuable vision for achieving Artificial General Intelligence (AGI).
Submitted 4 February, 2025; originally announced February 2025.

7. arXiv:2501.06605 [pdf, other] (cs.RO)
RoboHorizon: An LLM-Assisted Multi-View World Model for Long-Horizon Robotic Manipulation
Authors: Zixuan Chen, Jing Huo, Yangtao Chen, Yang Gao
Abstract: Efficient control in long-horizon robotic manipulation is challenging due to complex representation and policy learning requirements. Model-based visual reinforcement learning (RL) has shown great potential in addressing these challenges but still faces notable limitations, particularly in handling sparse rewards and complex visual features in long-horizon environments. To address these limitations, we propose the Recognize-Sense-Plan-Act (RSPA) pipeline for long-horizon tasks and further introduce RoboHorizon, an LLM-assisted multi-view world model tailored for long-horizon robotic manipulation. In RoboHorizon, pre-trained LLMs generate dense reward structures for multi-stage sub-tasks based on task language instructions, enabling robots to better recognize long-horizon tasks. Keyframe discovery is then integrated into the multi-view masked autoencoder (MAE) architecture to enhance the robot's ability to sense critical task sequences, strengthening its multi-stage perception of long-horizon processes. Leveraging these dense rewards and multi-view representations, a robotic world model is constructed to efficiently plan long-horizon tasks, enabling the robot to reliably act through RL algorithms. Experiments on two representative benchmarks, RLBench and FurnitureBench, show that RoboHorizon outperforms state-of-the-art visual model-based RL methods, achieving a 23.35% improvement in task success rates on RLBench's 4 short-horizon tasks and a 29.23% improvement on 6 long-horizon tasks from RLBench and 3 furniture assembly tasks from FurnitureBench.
Submitted 24 January, 2025; v1 submitted 11 January, 2025; originally announced January 2025.
Comments: Under review.

8. arXiv:2412.17316 [pdf, ps, other] (cs.LG, cs.AI, cs.CC, cs.CL)
Fast Gradient Computation for RoPE Attention in Almost Linear Time
Authors: Yifang Chen, Jiayan Huo, Xiaoyu Li, Yingyu Liang, Zhenmei Shi, Zhao Song
Abstract: The Rotary Position Embedding (RoPE) mechanism has become a powerful enhancement to the Transformer architecture, which enables models to capture token relationships when encoding positional information. However, RoPE makes the attention computation more complicated, which makes efficient algorithms challenging. Earlier research introduced almost linear time, i.e., $n^{1+o(1)}$ where $n$ is the number of input tokens, algorithms for the forward computation under specific parameter settings. However, achieving a subquadratic time algorithm for other parameter regimes remains impossible unless the widely accepted Strong Exponential Time Hypothesis (SETH) is disproven. In this work, we develop the first almost linear time algorithm for backward computations in RoPE-based attention under bounded entries. Our approach builds on recent advancements in fast RoPE attention computations, utilizing a novel combination of the polynomial method and the Fast Fourier Transform. Furthermore, we show that with lower bounds derived from the SETH, the bounded entry condition is necessary for subquadratic performance.
Submitted 31 December, 2024; v1 submitted 23 December, 2024; originally announced December 2024.
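As background for the entry above, the rotation that RoPE applies to query and key vectors is simple to state. The NumPy sketch below shows one common convention (the half-split pairing used by several open-source implementations); it illustrates only the embedding itself, not the paper's almost-linear-time gradient algorithm.

```python
import numpy as np

def rope_rotate(x, pos, base=10000.0):
    """Rotate a query/key vector x (even dimension d) by position-dependent angles."""
    d = x.shape[-1]
    half = d // 2
    freqs = base ** (-np.arange(half) / half)   # theta_i = base^(-2i/d)
    angles = pos * freqs
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[:half], x[half:]                 # half-split pairing of dimensions
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos])

# After rotation, the dot product of a query at position m with a key at position n
# depends on the positions only through the offset m - n.
rng = np.random.default_rng(0)
q, k = rng.standard_normal(64), rng.standard_normal(64)
score = rope_rotate(q, pos=5) @ rope_rotate(k, pos=2)
```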
The convergence of these technologies has catalyzed the rise of multimodal AI, enabling richer, cross-modal understanding that spans text, vision, audi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02104v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02104v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02104v1-abstract-full" style="display: none;"> The rapid development of Artificial Intelligence (AI) has revolutionized numerous fields, with large language models (LLMs) and computer vision (CV) systems driving advancements in natural language understanding and visual processing, respectively. The convergence of these technologies has catalyzed the rise of multimodal AI, enabling richer, cross-modal understanding that spans text, vision, audio, and video modalities. Multimodal large language models (MLLMs), in particular, have emerged as a powerful framework, demonstrating impressive capabilities in tasks like image-text generation, visual question answering, and cross-modal retrieval. Despite these advancements, the complexity and scale of MLLMs introduce significant challenges in interpretability and explainability, essential for establishing transparency, trustworthiness, and reliability in high-stakes applications. This paper provides a comprehensive survey on the interpretability and explainability of MLLMs, proposing a novel framework that categorizes existing research across three perspectives: (I) Data, (II) Model, (III) Training \&amp; Inference. We systematically analyze interpretability from token-level to embedding-level representations, assess approaches related to both architecture analysis and design, and explore training and inference strategies that enhance transparency. By comparing various methodologies, we identify their strengths and limitations and propose future research directions to address unresolved challenges in multimodal explainability. This survey offers a foundational resource for advancing interpretability and transparency in MLLMs, guiding researchers and practitioners toward developing more accountable and robust multimodal AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02104v1-abstract-full').style.display = 'none'; document.getElementById('2412.02104v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17605">arXiv:2411.17605</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17605">pdf</a>, <a href="https://arxiv.org/format/2411.17605">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Distractor-free Generalizable 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yanqi Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+J">Jing Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17605v1-abstract-short" style="display: inline;"> We present DGGS, a novel framework addressing the previously unexplored challenge of Distractor-free Generalizable 3D Gaussian Splatting (3DGS). It accomplishes two key objectives: fortifying generalizable 3DGS against distractor-laden data during both training and inference phases, while successfully extending cross-scene adaptation capabilities to conventional distractor-free approaches. To achi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17605v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17605v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17605v1-abstract-full" style="display: none;"> We present DGGS, a novel framework addressing the previously unexplored challenge of Distractor-free Generalizable 3D Gaussian Splatting (3DGS). It accomplishes two key objectives: fortifying generalizable 3DGS against distractor-laden data during both training and inference phases, while successfully extending cross-scene adaptation capabilities to conventional distractor-free approaches. To achieve these objectives, DGGS introduces a scene-agnostic reference-based mask prediction and refinement methodology during training phase, coupled with a training view selection strategy, effectively improving distractor prediction accuracy and training stability. Moreover, to address distractor-induced voids and artifacts during inference stage, we propose a two-stage inference framework for better reference selection based on the predicted distractor masks, complemented by a distractor pruning module to eliminate residual distractor effects. Extensive generalization experiments demonstrate DGGS&#39;s advantages under distractor-laden conditions. Additionally, experimental results show that our scene-agnostic mask inference achieves accuracy comparable to scene-specific trained methods. Homepage is \url{https://github.com/bbbbby-99/DGGS}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17605v1-abstract-full').style.display = 'none'; document.getElementById('2411.17605v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12755">arXiv:2411.12755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12755">pdf</a>, <a href="https://arxiv.org/format/2411.12755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAM-I2I: Unleash the Power of Segment Anything Model for Medical Image Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12755v1-abstract-short" style="display: inline;"> Medical image translation is crucial for reducing the need for redundant and expensive multi-modal imaging in clinical field. However, current approaches based on Convolutional Neural Networks (CNNs) and Transformers often fail to capture fine-grain semantic features, resulting in suboptimal image quality. To address this challenge, we propose SAM-I2I, a novel image-to-image translation framework&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12755v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12755v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12755v1-abstract-full" style="display: none;"> Medical image translation is crucial for reducing the need for redundant and expensive multi-modal imaging in clinical field. However, current approaches based on Convolutional Neural Networks (CNNs) and Transformers often fail to capture fine-grain semantic features, resulting in suboptimal image quality. To address this challenge, we propose SAM-I2I, a novel image-to-image translation framework based on the Segment Anything Model 2 (SAM2). SAM-I2I utilizes a pre-trained image encoder to extract multiscale semantic features from the source image and a decoder, based on the mask unit attention module, to synthesize target modality images. Our experiments on multi-contrast MRI datasets demonstrate that SAM-I2I outperforms state-of-the-art methods, offering more efficient and accurate medical image translation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12755v1-abstract-full').style.display = 'none'; document.getElementById('2411.12755v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16714">arXiv:2410.16714</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16714">pdf</a>, <a href="https://arxiv.org/ps/2410.16714">ps</a>, <a href="https://arxiv.org/format/2410.16714">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Magnetic Preference Optimization: Achieving Last-iterate Convergence for Language Model Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mingzhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chengdong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qizhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Linjian Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jiancong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhaowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+W+J">Weijie J. Su</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yaodong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16714v2-abstract-short" style="display: inline;"> Self-play methods have demonstrated remarkable success in enhancing model capabilities across various domains. In the context of Reinforcement Learning from Human Feedback (RLHF), self-play not only boosts Large Language Model (LLM) performance but also overcomes the limitations of traditional Bradley-Terry (BT) model assumptions by finding the Nash equilibrium (NE) of a preference-based, two-play&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16714v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16714v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16714v2-abstract-full" style="display: none;"> Self-play methods have demonstrated remarkable success in enhancing model capabilities across various domains. In the context of Reinforcement Learning from Human Feedback (RLHF), self-play not only boosts Large Language Model (LLM) performance but also overcomes the limitations of traditional Bradley-Terry (BT) model assumptions by finding the Nash equilibrium (NE) of a preference-based, two-player constant-sum game. 

arXiv:2410.04819 (https://arxiv.org/abs/2410.04819) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: MINER: Mining the Underlying Pattern of Modality-Specific Neurons in Multimodal Large Language Models
Authors: Kaichen Huang, Jiahao Huo, Yibo Yan, Kun Wang, Yutao Yue, Xuming Hu
Abstract: In recent years, multimodal large language models (MLLMs) have significantly advanced, integrating more modalities into diverse applications. However, the lack of explainability remains a major barrier to their use in scenarios requiring decision transparency. Current neuron-level explanation paradigms mainly focus on knowledge localization or language- and domain-specific analyses, leaving the exploration of multimodality largely unaddressed. To tackle these challenges, we propose MINER, a transferable framework for mining modality-specific neurons (MSNs) in MLLMs, which comprises four stages: (1) modality separation, (2) importance score calculation, (3) importance score aggregation, and (4) modality-specific neuron selection. Extensive experiments across six benchmarks and two representative MLLMs show that (I) deactivating only 2% of MSNs significantly reduces MLLMs' performance (0.56 to 0.24 for Qwen2-VL, 0.69 to 0.31 for Qwen2-Audio), (II) different modalities mainly converge in the lower layers, (III) MSNs influence how key information from various modalities converges to the last token, and (IV) two intriguing phenomena, semantic probing and semantic telomeres, warrant further investigation. The source code is available at this URL.
Submitted: 7 October, 2024; originally announced October 2024.
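
As a toy illustration of the "score, aggregate, select" steps in the four-stage pipeline above, the snippet below ranks neurons by an aggregated importance score and keeps the top 2%; the random scores, the mean aggregation, and the threshold are assumptions for illustration, not MINER's actual criteria:

    # Toy sketch: aggregate per-example neuron importance scores and select the top 2%.
    # The scores here are random placeholders; MINER defines its own importance measures.
    import numpy as np

    rng = np.random.default_rng(0)
    num_examples, num_neurons = 128, 4096
    scores = rng.random((num_examples, num_neurons))      # stage 2: per-example importance scores

    aggregated = scores.mean(axis=0)                      # stage 3: aggregate across examples
    k = int(0.02 * num_neurons)                           # keep the top 2% as candidate MSNs
    msn_indices = np.argsort(aggregated)[-k:]

    print(f"selected {len(msn_indices)} candidate modality-specific neurons")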

arXiv:2410.04509 (https://arxiv.org/abs/2410.04509) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: ErrorRadar: Benchmarking Complex Mathematical Reasoning of Multimodal Large Language Models Via Error Detection
Authors: Yibo Yan, Shen Wang, Jiahao Huo, Hang Li, Boyan Li, Jiamin Su, Xiong Gao, Yi-Fan Zhang, Tianlong Xu, Zhendong Chu, Aoxiao Zhong, Kun Wang, Hui Xiong, Philip S. Yu, Xuming Hu, Qingsong Wen
Abstract: As the field of Multimodal Large Language Models (MLLMs) continues to evolve, their potential to revolutionize artificial intelligence is particularly promising, especially in addressing mathematical reasoning tasks. Current mathematical benchmarks predominantly focus on evaluating MLLMs' problem-solving ability, yet there is a crucial gap in addressing more complex scenarios such as error detection, which would enhance reasoning capability in complicated settings. To fill this gap, we formally formulate a new task, multimodal error detection, and introduce ErrorRadar, the first benchmark designed to assess MLLMs' capabilities in this task. ErrorRadar evaluates two sub-tasks: error step identification and error categorization, providing a comprehensive framework for evaluating MLLMs' complex mathematical reasoning ability. It consists of 2,500 high-quality multimodal K-12 mathematical problems, collected from real-world student interactions in an educational organization, with rigorous annotation and rich metadata such as problem type and error category. Through extensive experiments, we evaluated both open-source and closed-source representative MLLMs, benchmarking their performance against educational expert evaluators. Results indicate that significant challenges remain: GPT-4o, the best-performing model, is still around 10% behind human evaluation. The dataset will be available upon acceptance.
Submitted: 8 October, 2024; v1 submitted 6 October, 2024; originally announced October 2024.
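
A minimal way to score the two sub-tasks named above (error step identification and error categorization) is plain accuracy against annotated labels, as sketched here; the field names and data layout are hypothetical and not the ErrorRadar release format:

    # Hypothetical evaluation loop for two ErrorRadar-style sub-tasks.
    predictions = [{"step": 2, "category": "calculation"}, {"step": 1, "category": "visual"}]
    references  = [{"step": 2, "category": "reasoning"},   {"step": 1, "category": "visual"}]

    step_acc = sum(p["step"] == r["step"] for p, r in zip(predictions, references)) / len(references)
    cat_acc  = sum(p["category"] == r["category"] for p, r in zip(predictions, references)) / len(references)
    print(f"error step identification accuracy: {step_acc:.2f}")
    print(f"error categorization accuracy:      {cat_acc:.2f}")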

arXiv:2409.20154 (https://arxiv.org/abs/2409.20154) [pdf, other]
Subjects: cs.RO (Robotics)
Title: GravMAD: Grounded Spatial Value Maps Guided Action Diffusion for Generalized 3D Manipulation
Authors: Yangtao Chen, Zixuan Chen, Junhui Yin, Jing Huo, Pinzhuo Tian, Jieqi Shi, Yang Gao
Abstract: Robots' ability to follow language instructions and execute diverse 3D manipulation tasks is vital in robot learning. Traditional imitation learning-based methods perform well on seen tasks but struggle with novel, unseen ones due to variability. Recent approaches leverage large foundation models to assist in understanding novel tasks, thereby mitigating this issue. However, these methods lack a task-specific learning process, which is essential for an accurate understanding of 3D environments, often leading to execution failures. In this paper, we introduce GravMAD, a sub-goal-driven, language-conditioned action diffusion framework that combines the strengths of imitation learning and foundation models. Our approach breaks tasks into sub-goals based on language instructions, allowing auxiliary guidance during both training and inference. During training, we introduce Sub-goal Keypose Discovery to identify key sub-goals from demonstrations. Inference differs from training, as there are no demonstrations available, so we use pre-trained foundation models to bridge the gap and identify sub-goals for the current task. In both phases, GravMaps are generated from sub-goals, providing GravMAD with more flexible 3D spatial guidance compared to fixed 3D positions. Empirical evaluations on RLBench show that GravMAD significantly outperforms state-of-the-art methods, with a 28.63% improvement on novel tasks and a 13.36% gain on tasks encountered during training. Evaluations on real-world robotic tasks further show that GravMAD can reason about real-world tasks, associate them with relevant visual information, and generalize to novel tasks. These results demonstrate GravMAD's strong multi-task learning and generalization in 3D manipulation. Video demonstrations are available at: https://gravmad.github.io.
Submitted: 12 February, 2025; v1 submitted 30 September, 2024; originally announced September 2024.
Comments: ICLR 2025. The first two authors contributed equally.

arXiv:2408.11297 (https://arxiv.org/abs/2408.11297) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Making Large Vision Language Models to be Good Few-shot Learners
Authors: Fan Liu, Wenwen Cai, Jian Huo, Chuanyi Zhang, Delong Chen, Jun Zhou
Abstract: Few-shot classification (FSC) is a fundamental yet challenging task in computer vision that involves recognizing novel classes from limited data. While previous methods have focused on enhancing visual features or incorporating additional modalities, Large Vision Language Models (LVLMs) offer a promising alternative due to their rich knowledge and strong visual perception. However, LVLMs risk learning specific response formats rather than effectively extracting useful information from support data in FSC tasks. In this paper, we investigate LVLMs' performance in FSC and identify key issues such as insufficient learning and severe positional biases. To tackle these challenges, we adopt a meta-learning strategy to teach models to "learn to learn". By constructing a rich set of meta-tasks for instruction fine-tuning, LVLMs enhance their ability to extract information from few-shot support data for classification. Additionally, we further boost LVLMs' few-shot learning capabilities through label augmentation and candidate selection in the fine-tuning and inference stages, respectively. Label augmentation is implemented via a character perturbation strategy to ensure the model focuses on support information. Candidate selection leverages attribute descriptions to filter out unreliable candidates and simplify the task. Extensive experiments demonstrate that our approach achieves superior performance on both general and fine-grained datasets. Furthermore, our candidate selection strategy has been proven beneficial for training-free LVLMs.
Submitted: 20 August, 2024; originally announced August 2024.
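
The "character perturbation" idea for label augmentation mentioned above can be illustrated with a few lines that randomly alter characters in class names, so that a model must rely on the support examples rather than memorized label strings; the perturbation rate and strategy here are assumptions, not the paper's exact scheme:

    # Illustrative character-perturbation label augmentation (not the authors' implementation).
    import random
    import string

    def perturb_label(label: str, rate: float = 0.2, seed: int = 0) -> str:
        rng = random.Random(seed)
        chars = list(label)
        for i, c in enumerate(chars):
            if c.isalpha() and rng.random() < rate:
                chars[i] = rng.choice(string.ascii_lowercase)  # swap in a random letter
        return "".join(chars)

    print(perturb_label("golden retriever"))   # e.g. a lightly corrupted class name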

arXiv:2407.17418 (https://arxiv.org/abs/2407.17418) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: 3D Gaussian Splatting: Survey, Technologies, Challenges, and Opportunities
Authors: Yanqi Bao, Tianyu Ding, Jing Huo, Yaoli Liu, Yuxin Li, Wenbin Li, Yang Gao, Jiebo Luo
Abstract: 3D Gaussian Splatting (3DGS) has emerged as a prominent technique with the potential to become a mainstream method for 3D representations. It can effectively transform multi-view images into explicit 3D Gaussians through efficient training, and achieve real-time rendering of novel views. This survey aims to analyze existing 3DGS-related works from multiple intersecting perspectives, including related tasks, technologies, challenges, and opportunities. The primary objective is to provide newcomers with a rapid understanding of the field and to assist researchers in methodically organizing existing technologies and challenges. Specifically, we delve into the optimization, application, and extension of 3DGS, categorizing them based on their focuses or motivations. Additionally, we summarize and classify nine types of technical modules and corresponding improvements identified in existing works. Based on these analyses, we further examine the common challenges and technologies across various tasks, proposing potential research opportunities.
Submitted: 17 December, 2024; v1 submitted 24 July, 2024; originally announced July 2024.

arXiv:2406.14826 (https://arxiv.org/abs/2406.14826) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence)
Title: Self-supervised Brain Lesion Generation for Effective Data Augmentation of Medical Images
Authors: Jiayu Huo, Sebastien Ourselin, Rachel Sparks
Abstract: Accurate brain lesion delineation is important for planning neurosurgical treatment. Automatic brain lesion segmentation methods based on convolutional neural networks have demonstrated remarkable performance. However, neural network performance is constrained by the lack of large-scale, well-annotated training datasets. In this manuscript, we propose a comprehensive framework to efficiently generate new samples for training a brain lesion segmentation model. We first train a lesion generator, based on an adversarial autoencoder, in a self-supervised manner. Next, we utilize a novel image composition algorithm, Soft Poisson Blending, to seamlessly combine synthetic lesions and brain images to obtain training samples. Finally, to effectively train the brain lesion segmentation model with augmented images, we introduce a new prototype consistency regularization to align real and synthetic features. Our framework is validated by extensive experiments on two public brain lesion segmentation datasets: ATLAS v2.0 and Shift MS. Our method outperforms existing brain image data augmentation schemes. For instance, our method improves the Dice score from 50.36% to 60.23% compared to the U-Net with conventional data augmentation techniques on the ATLAS v2.0 dataset.
Submitted: 18 August, 2024; v1 submitted 20 June, 2024; originally announced June 2024.
Comments: 11 pages, 7 figures, 8 tables
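
For intuition about compositing a synthetic lesion into a brain image, here is a plain soft-mask alpha blend; this is not the paper's Soft Poisson Blending (which works in the gradient domain for seamless results), just the simplest possible stand-in with placeholder arrays:

    # Plain soft-mask compositing of a synthetic lesion into a brain slice.
    # Simplified stand-in for illustration only, not the Soft Poisson Blending of the paper.
    import numpy as np

    brain  = np.random.rand(128, 128)            # placeholder brain MRI slice
    lesion = np.random.rand(128, 128)            # placeholder synthetic lesion texture

    yy, xx = np.mgrid[0:128, 0:128]
    dist = np.sqrt((yy - 64) ** 2 + (xx - 64) ** 2)
    soft_mask = np.clip(1.0 - dist / 20.0, 0.0, 1.0)   # soft disk: 1 at the center, fading to 0

    composite = soft_mask * lesion + (1.0 - soft_mask) * brain
    print(composite.shape)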

arXiv:2406.11193 (https://arxiv.org/abs/2406.11193) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: MMNeuron: Discovering Neuron-Level Domain-Specific Interpretation in Multimodal Large Language Model
Authors: Jiahao Huo, Yibo Yan, Boren Hu, Yutao Yue, Xuming Hu
Abstract: Projecting visual features into the word embedding space has become a significant fusion strategy adopted by Multimodal Large Language Models (MLLMs). However, its internal mechanisms have yet to be explored. Inspired by multilingual research, we identify domain-specific neurons in multimodal large language models. Specifically, we investigate the distribution of domain-specific neurons and the mechanism by which MLLMs process features from diverse domains. Furthermore, we propose a three-stage mechanism for language model modules in MLLMs when handling projected image features, and verify this hypothesis using the logit lens. Extensive experiments indicate that while current MLLMs exhibit Visual Question Answering (VQA) capability, they may not fully utilize domain-specific information. Properly manipulating domain-specific neurons results in at most a 10% change in accuracy, shedding light on the development of cross-domain, all-encompassing MLLMs in the future. The source code is available at https://github.com/Z1zs/MMNeuron.
Submitted: 1 October, 2024; v1 submitted 16 June, 2024; originally announced June 2024.
Comments: Accepted by the Main Conference of Empirical Methods in Natural Language Processing (EMNLP) 2024
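
The logit-lens check mentioned above can be sketched as projecting an intermediate hidden state through the model's output (unembedding) matrix to see which tokens it already encodes; the tensors below are random placeholders for shape illustration only:

    # Logit-lens-style projection: decode an intermediate hidden state with the LM head.
    # Random tensors stand in for a real MLLM's hidden states and unembedding matrix.
    import torch

    hidden_dim, vocab_size = 512, 32000
    hidden_state = torch.randn(hidden_dim)            # activation at some intermediate layer
    lm_head = torch.randn(vocab_size, hidden_dim)     # output (unembedding) matrix

    logits = lm_head @ hidden_state                   # project into vocabulary space
    top_tokens = torch.topk(logits, k=5).indices      # token ids the layer is "leaning toward"
    print(top_tokens.tolist())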
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the Main Conference of Empirical Methods in Natural Language Processing (EMNLP) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04888">arXiv:2406.04888</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.04888">pdf</a>, <a href="https://arxiv.org/format/2406.04888">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Video Editing through Adaptive Sliding Score Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lianghan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yanqi Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jing Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+Y">Yu-Kun Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04888v2-abstract-short" style="display: inline;"> The rapidly evolving field of Text-to-Video generation (T2V) has catalyzed renewed interest in controllable video editing research. While the application of editing prompts to guide diffusion model denoising has gained prominence, mirroring advancements in image editing, this noise-based inference process inherently compromises the original video&#39;s integrity, resulting in unintended over-editing a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04888v2-abstract-full').style.display = 'inline'; document.getElementById('2406.04888v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04888v2-abstract-full" style="display: none;"> The rapidly evolving field of Text-to-Video generation (T2V) has catalyzed renewed interest in controllable video editing research. While the application of editing prompts to guide diffusion model denoising has gained prominence, mirroring advancements in image editing, this noise-based inference process inherently compromises the original video&#39;s integrity, resulting in unintended over-editing and temporal discontinuities. To address these challenges, this study proposes a novel paradigm of video-based score distillation, facilitating direct manipulation of original video content. Specifically, distinguishing it from image-based score distillation, we propose an Adaptive Sliding Score Distillation strategy, which incorporates both global and local video guidance to reduce the impact of editing errors. Combined with our proposed Image-based Joint Guidance mechanism, it has the ability to mitigate the inherent instability of the T2V model and single-step sampling. Additionally, we design a Weighted Attention Fusion module to further preserve the key features of the original video and avoid over-editing. 

arXiv:2405.10316 (https://arxiv.org/abs/2405.10316) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
Title: Analogist: Out-of-the-box Visual In-Context Learning with Image Diffusion Model
Authors: Zheng Gu, Shiyuan Yang, Jing Liao, Jing Huo, Yang Gao
Abstract: Visual In-Context Learning (ICL) has emerged as a promising research area due to its capability to accomplish various tasks with limited example pairs through analogical reasoning. However, training-based visual ICL has limited ability to generalize to unseen tasks and requires the collection of a diverse task dataset. On the other hand, existing methods in the inference-based visual ICL category rely solely on textual prompts, which fail to capture fine-grained contextual information from given examples and can be time-consuming when converting from images to text prompts. To address these challenges, we propose Analogist, a novel inference-based visual ICL approach that exploits both visual and textual prompting techniques using a text-to-image diffusion model pretrained for image inpainting. For visual prompting, we propose a self-attention cloning (SAC) method to guide the fine-grained structural-level analogy between image examples. For textual prompting, we leverage GPT-4V's visual reasoning capability to efficiently generate text prompts and introduce a cross-attention masking (CAM) operation to enhance the accuracy of semantic-level analogy guided by text prompts. Our method is out-of-the-box and does not require fine-tuning or optimization. It is also generic and flexible, enabling a wide range of visual tasks to be performed in an in-context manner. Extensive experiments demonstrate the superiority of our method over existing approaches, both qualitatively and quantitatively.
Submitted: 16 May, 2024; originally announced May 2024.
Comments: Project page: https://analogist2d.github.io
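
A grid-style visual prompt for inpainting-based visual ICL ("A is to A' as B is to ?") can be laid out as below; the cell size and masking convention are assumptions, and the actual Analogist pipeline adds the attention-level guidance described above on top of such a layout:

    # Lay out A, A', B in a 2x2 grid and mask the bottom-right cell for an inpainting model to fill.
    import numpy as np

    h = w = 64
    A, A_prime, B = (np.random.rand(h, w, 3) for _ in range(3))   # placeholder example images

    grid = np.zeros((2 * h, 2 * w, 3))
    grid[:h, :w] = A           # top-left: source example
    grid[:h, w:] = A_prime     # top-right: transformed example
    grid[h:, :w] = B           # bottom-left: query image

    mask = np.zeros((2 * h, 2 * w), dtype=bool)
    mask[h:, w:] = True        # bottom-right cell is what the inpainting model should complete
    print(grid.shape, int(mask.sum()))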

arXiv:2404.10160 (https://arxiv.org/abs/2404.10160) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: Reinforcement Learning from Multi-role Debates as Feedback for Bias Mitigation in LLMs
Authors: Ruoxi Cheng, Haoxuan Ma, Shuirong Cao, Jiaqi Li, Aihua Pei, Zhiqiang Wang, Pengliang Ji, Haoyu Wang, Jiaqi Huo
Abstract: Bias in LLMs can harm user experience and societal outcomes. However, current bias mitigation methods often require intensive human feedback, lack transferability to other topics, or yield overconfident and random outputs. We find that involving LLMs in role-playing scenarios boosts their ability to recognize and mitigate biases. Based on this, we propose Reinforcement Learning from Multi-role Debates as Feedback (RLDF), a novel approach for bias mitigation that replaces human feedback in traditional RLHF. We utilize LLMs in multi-role debates to create a dataset that includes both high-bias and low-bias instances for training the reward model in reinforcement learning. Our approach comprises two modes: (1) self-reflection, where the same LLM participates in multi-role debates, and (2) teacher-student, where a more advanced LLM like GPT-3.5-turbo guides the LLM to perform this task. Experimental results across different LLMs on BBQ and our datasets demonstrate the effectiveness of our approach in bias mitigation. Our source code and datasets are available at https://anonymous.4open.science/r/RLDF-E344.
Submitted: 16 August, 2024; v1 submitted 15 April, 2024; originally announced April 2024.
Comments: The first three authors contributed equally to this work
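
Training a reward model on "low-bias preferred over high-bias" pairs, as described above, typically uses a Bradley-Terry-style pairwise loss. The snippet shows that standard loss on dummy scores and is not tied to the RLDF codebase:

    # Standard pairwise (Bradley-Terry-style) reward-model loss on dummy reward scores.
    import torch
    import torch.nn.functional as F

    reward_low_bias  = torch.tensor([1.2, 0.3, 0.8])   # scores for the preferred (low-bias) responses
    reward_high_bias = torch.tensor([0.4, 0.5, -0.1])  # scores for the rejected (high-bias) responses

    loss = -F.logsigmoid(reward_low_bias - reward_high_bias).mean()
    print(float(loss))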
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first three authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08016">arXiv:2404.08016</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.08016">pdf</a>, <a href="https://arxiv.org/format/2404.08016">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ONNXPruner: ONNX-Based General Model Pruning Adapter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+D">Dongdong Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+T">Tianyu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Q">Qi Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+H">Hongbing Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08016v1-abstract-short" style="display: inline;"> Recent advancements in model pruning have focused on developing new algorithms and improving upon benchmarks. However, the practical application of these algorithms across various models and platforms remains a significant challenge. To address this challenge, we propose ONNXPruner, a versatile pruning adapter designed for the ONNX format models. ONNXPruner streamlines the adaptation process acros&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08016v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08016v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08016v1-abstract-full" style="display: none;"> Recent advancements in model pruning have focused on developing new algorithms and improving upon benchmarks. However, the practical application of these algorithms across various models and platforms remains a significant challenge. To address this challenge, we propose ONNXPruner, a versatile pruning adapter designed for the ONNX format models. ONNXPruner streamlines the adaptation process across diverse deep learning frameworks and hardware platforms. A novel aspect of ONNXPruner is its use of node association trees, which automatically adapt to various model architectures. These trees clarify the structural relationships between nodes, guiding the pruning process, particularly highlighting the impact on interconnected nodes. Furthermore, we introduce a tree-level evaluation method. By leveraging node association trees, this method allows for a comprehensive analysis beyond traditional single-node evaluations, enhancing pruning performance without the need for extra operations. Experiments across multiple models and datasets confirm ONNXPruner&#39;s strong adaptability and increased efficacy. 

arXiv:2404.00563 (https://arxiv.org/abs/2404.00563) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Exploiting Inter-sample and Inter-feature Relations in Dataset Distillation
Authors: Wenxiao Deng, Wenbin Li, Tianyu Ding, Lei Wang, Hongguang Zhang, Kuihua Huang, Jing Huo, Yang Gao
Abstract: Dataset distillation has emerged as a promising approach in deep learning, enabling efficient training with small synthetic datasets derived from larger real ones. In particular, distribution matching-based distillation methods have attracted attention thanks to their effectiveness and low computational cost. However, these methods face two primary limitations: the dispersed feature distribution within the same class in synthetic datasets, which reduces class discrimination, and an exclusive focus on mean feature consistency, which lacks precision and comprehensiveness. To address these challenges, we introduce two novel constraints: a class centralization constraint and a covariance matching constraint. The class centralization constraint aims to enhance class discrimination by clustering samples more closely within classes. The covariance matching constraint seeks to achieve more accurate feature distribution matching between real and synthetic datasets through local feature covariance matrices, which is particularly beneficial when sample sizes are much smaller than the number of features. Experiments demonstrate notable improvements with these constraints, yielding performance boosts of up to 6.6% on CIFAR10, 2.9% on SVHN, 2.5% on CIFAR100, and 2.5% on TinyImageNet compared to state-of-the-art relevant methods. In addition, our method maintains robust performance in cross-architecture settings, with a maximum performance drop of 1.7% across four architectures. Code is available at https://github.com/VincenDen/IID.
Submitted: 31 March, 2024; originally announced April 2024.
Comments: Accepted to CVPR 2024
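
The two constraints above can be written down directly as losses on feature batches: pull synthetic features toward their class mean (centralization) and match real versus synthetic feature covariances. The formulation below is our paraphrase for illustration, with random features, not the paper's exact objective:

    # Illustrative versions of a class-centralization loss and a covariance-matching loss.
    import torch

    real_feats = torch.randn(64, 128)        # features of real samples from one class
    syn_feats  = torch.randn(16, 128)        # features of synthetic samples from the same class

    # Class centralization: keep synthetic features tightly clustered around their own mean.
    centralization = ((syn_feats - syn_feats.mean(dim=0, keepdim=True)) ** 2).sum(dim=1).mean()

    # Covariance matching: align second-order statistics of real and synthetic features.
    def covariance(f):
        centered = f - f.mean(dim=0, keepdim=True)
        return centered.T @ centered / (f.shape[0] - 1)

    cov_match = (covariance(real_feats) - covariance(syn_feats)).pow(2).mean()
    loss = centralization + cov_match
    print(float(loss))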

arXiv:2403.19425 (https://arxiv.org/abs/2403.19425) [pdf, ps, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: Generalizability and Clinical Utility Beyond the ISLES Challenge
Authors: Ezequiel de la Rosa, Mauricio Reyes, Sook-Lei Liew, Alexandre Hutton, Roland Wiest, Johannes Kaesmacher, Uta Hanning, Arsany Hakim, Richard Zubal, Waldo Valenzuela, David Robben, Diana M. Sima, Vincenzo Anania, Arne Brys, James A. Meakin, Anne Mickan, Gabriel Broocks, Christian Heitkamp, Shengbo Gao, Kongming Liang, Ziji Zhang, Md Mahfuzur Rahman Siddiquee, Andriy Myronenko, Pooya Ashtari, Sabine Van Huffel, et al. (33 additional authors not shown)
Abstract: Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment decisions, and prognosis. However, image and disease variability hinder the development of generalizable AI algorithms with clinical value. We address this gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient scans with ischemic stroke from various medical centers, facilitating the development of a wide range of cutting-edge segmentation algorithms by the research community. Through collaboration with leading teams, we combined top-performing algorithms into an ensemble model that overcomes the limitations of individual solutions. Our ensemble model achieved superior ischemic lesion detection and segmentation accuracy on our internal test set compared to individual algorithms. This accuracy generalized well across diverse image and disease variables. Furthermore, the model excelled in extracting clinical biomarkers. Notably, in a Turing-like test, neuroradiologists consistently preferred the algorithm's segmentations over manual expert efforts, highlighting increased comprehensiveness and precision. Validation using a real-world external dataset (N=1686) confirmed the model's generalizability. The algorithm's outputs also demonstrated strong correlations with clinical scores (admission NIHSS and 90-day mRS), on par with or exceeding expert-derived results, underlining its clinical relevance. This study offers two key findings. First, we present an ensemble algorithm (https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments ischemic stroke lesions on DWI across diverse scenarios on par with expert (neuro)radiologists. Second, we show the potential for biomedical challenge outputs to extend beyond the challenge's initial objectives, demonstrating their real-world clinical applicability.
Submitted: 3 April, 2024; v1 submitted 28 March, 2024; originally announced March 2024.
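
A common way to fuse several segmentation models, and a reasonable mental model for an ensemble like the one above, is to average per-voxel lesion probabilities and threshold the result; the snippet is a generic sketch on random arrays, not the released ISLES'22 ensemble code:

    # Generic probability-averaging ensemble for binary lesion segmentation maps.
    import numpy as np

    rng = np.random.default_rng(42)
    member_probs = [rng.random((64, 64, 32)) for _ in range(3)]   # per-model voxel-wise probabilities

    mean_prob = np.mean(member_probs, axis=0)     # fuse by averaging
    lesion_mask = mean_prob > 0.5                 # final binary segmentation
    print(lesion_mask.shape, int(lesion_mask.sum()))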
First, we present an ensemble algorithm (https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments ischemic stroke lesions on DWI across diverse scenarios on par with expert (neuro)radiologists. Second, we show the potential for biomedical challenge outputs to extend beyond the challenge&#39;s initial objectives, demonstrating their real-world clinical applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19425v2-abstract-full').style.display = 'none'; document.getElementById('2403.19425v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.18211">arXiv:2403.18211</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.18211">pdf</a>, <a href="https://arxiv.org/format/2403.18211">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NeuroPictor: Refining fMRI-to-Image Reconstruction via Multi-individual Pretraining and Multi-level Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xuelin Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Jianfeng Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18211v2-abstract-short" style="display: inline;"> Recent fMRI-to-image approaches mainly focused on associating fMRI signals with specific conditions of pre-trained diffusion models. These approaches, while producing high-quality images, capture only a limited aspect of the complex information in fMRI signals and offer little detailed control over image creation. In contrast, this paper proposes to directly modulate the generation process of diff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18211v2-abstract-full').style.display = 'inline'; document.getElementById('2403.18211v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18211v2-abstract-full" style="display: none;"> Recent fMRI-to-image approaches mainly focused on associating fMRI signals with specific conditions of pre-trained diffusion models. These approaches, while producing high-quality images, capture only a limited aspect of the complex information in fMRI signals and offer little detailed control over image creation. 
In contrast, this paper proposes to directly modulate the generation process of diffusion models using fMRI signals. Our approach, NeuroPictor, divides the fMRI-to-image process into three steps: i) fMRI calibrated-encoding, to tackle multi-individual pre-training for a shared latent space to minimize individual differences and enable the subsequent multi-subject training; ii) fMRI-to-image multi-subject pre-training, perceptually learning to guide the diffusion model with high- and low-level conditions across different individuals; iii) fMRI-to-image single-subject refining, similar to step ii but focusing on adapting to a particular individual. NeuroPictor extracts high-level semantic features from fMRI signals that characterize the visual stimulus and incrementally fine-tunes the diffusion model with a low-level manipulation network to provide precise structural instructions. By training with about 67,000 fMRI-image pairs from various individuals, our model enjoys superior fMRI-to-image decoding capacity, particularly in the within-subject setting, as evidenced in benchmark datasets. Our code and model are available at https://jingyanghuo.github.io/neuropictor/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18211v2-abstract-full').style.display = 'none'; document.getElementById('2403.18211v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.18198">arXiv:2403.18198</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.18198">pdf</a>, <a href="https://arxiv.org/format/2403.18198">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Medical Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+X">Xi Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sébastien Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18198v2-abstract-short" style="display: inline;"> Rapid advancements in medical image segmentation performance have been significantly driven by the development of Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs). These models follow the discriminative pixel-wise classification learning paradigm and often have limited ability to generalize across diverse medical imaging datasets.
In this manuscript, we introduce Generative Medi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18198v2-abstract-full').style.display = 'inline'; document.getElementById('2403.18198v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18198v2-abstract-full" style="display: none;"> Rapid advancements in medical image segmentation performance have been significantly driven by the development of Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs). These models follow the discriminative pixel-wise classification learning paradigm and often have limited ability to generalize across diverse medical imaging datasets. In this manuscript, we introduce Generative Medical Segmentation (GMS), a novel approach leveraging a generative model to perform image segmentation. Concretely, GMS employs a robust pre-trained vision foundation model to extract latent representations for images and corresponding ground truth masks, followed by a model that learns a mapping function from the image to the mask in the latent space. Once trained, the model generates an estimated segmentation mask using the pre-trained vision foundation model to decode the predicted latent representation back into the image space. The design of GMS leads to fewer trainable parameters in the model which reduces the risk of overfitting and enhances its generalization capability. Our experimental analysis across five public datasets in different medical imaging domains demonstrates GMS outperforms existing discriminative and generative segmentation models. Furthermore, GMS is able to generalize well across datasets from different centers within the same imaging modality. Our experiments suggest GMS offers a scalable and effective solution for medical image segmentation. GMS implementation and trained model weights are available at https://github.com/King-HAW/GMS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18198v2-abstract-full').style.display = 'none'; document.getElementById('2403.18198v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15901">arXiv:2403.15901</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15901">pdf</a>, <a href="https://arxiv.org/format/2403.15901">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MatchSeg: Towards Better Segmentation via Reference Image Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+R">Ruiqiang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Haotian Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15901v3-abstract-short" style="display: inline;"> Recently, automated medical image segmentation methods based on deep learning have achieved great success. However, they heavily rely on large annotated datasets, which are costly and time-consuming to acquire. Few-shot learning aims to overcome the need for annotated data by using a small labeled dataset, known as a support set, to guide predicting labels for new, unlabeled images, known as the q&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15901v3-abstract-full').style.display = 'inline'; document.getElementById('2403.15901v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15901v3-abstract-full" style="display: none;"> Recently, automated medical image segmentation methods based on deep learning have achieved great success. However, they heavily rely on large annotated datasets, which are costly and time-consuming to acquire. Few-shot learning aims to overcome the need for annotated data by using a small labeled dataset, known as a support set, to guide predicting labels for new, unlabeled images, known as the query set. Inspired by this paradigm, we introduce MatchSeg, a novel framework that enhances medical image segmentation through strategic reference image matching. We leverage contrastive language-image pre-training (CLIP) to select highly relevant samples when defining the support set. Additionally, we design a joint attention module to strengthen the interaction between support and query features, facilitating a more effective knowledge transfer between support and query sets. We validated our method across four public datasets. Experimental results demonstrate superior segmentation performance and powerful domain generalization ability of MatchSeg against existing methods for domain-specific and cross-domain segmentation tasks. 
Our code is made available at https://github.com/keeplearning-again/MatchSeg <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15901v3-abstract-full').style.display = 'none'; document.getElementById('2403.15901v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">International Conference on Bioinformatics and Biomedicine (BIBM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15647">arXiv:2403.15647</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15647">pdf</a>, <a href="https://arxiv.org/format/2403.15647">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RetiGen: A Framework for Generalized Retinal Diagnosis Using Multi-View Fundus Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Ze Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gongyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Rio%2C+J+N+d">Joan Nunez do Rio</a>, <a href="/search/cs?searchtype=author&amp;query=Komninos%2C+C">Charalampos Komninos</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Bergeles%2C+C">Christos Bergeles</a>, <a href="/search/cs?searchtype=author&amp;query=Jackson%2C+T">Timothy Jackson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15647v1-abstract-short" style="display: inline;"> This study introduces a novel framework for enhancing domain generalization in medical imaging, specifically focusing on utilizing unlabelled multi-view colour fundus photographs. Unlike traditional approaches that rely on single-view imaging data and face challenges in generalizing across diverse clinical settings, our method leverages the rich information in the unlabelled multi-view imaging dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15647v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15647v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15647v1-abstract-full" style="display: none;"> This study introduces a novel framework for enhancing domain generalization in medical imaging, specifically focusing on utilizing unlabelled multi-view colour fundus photographs. 
Unlike traditional approaches that rely on single-view imaging data and face challenges in generalizing across diverse clinical settings, our method leverages the rich information in the unlabelled multi-view imaging data to improve model robustness and accuracy. By incorporating a class balancing method, a test-time adaptation technique and a multi-view optimization strategy, we address the critical issue of domain shift that often hampers the performance of machine learning models in real-world applications. Experiments comparing various state-of-the-art domain generalization and test-time optimization methodologies show that our approach consistently outperforms when combined with existing baseline and state-of-the-art methods. We also show our online method improves all existing techniques. Our framework demonstrates improvements in domain generalization capabilities and offers a practical solution for real-world deployment by facilitating online adaptation to new, unseen datasets. Our code is available at https://github.com/zgy600/RetiGen . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15647v1-abstract-full').style.display = 'none'; document.getElementById('2403.15647v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12787">arXiv:2403.12787</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12787">pdf</a>, <a href="https://arxiv.org/format/2403.12787">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DDSB: An Unsupervised and Training-free Method for Phase Detection in Echocardiography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bu%2C+Z">Zhenyu Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+J">Jingjing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kaini Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guangquan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a>, <a href="/search/cs?searchtype=author&amp;query=Dasgupta%2C+P">Prokar Dasgupta</a>, <a href="/search/cs?searchtype=author&amp;query=Granados%2C+A">Alejandro Granados</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12787v1-abstract-short" style="display: inline;"> Accurate identification of End-Diastolic (ED) and End-Systolic (ES) frames is key for cardiac function assessment through echocardiography. However, traditional methods face several limitations: they require extensive amounts of data, extensive annotations by medical experts, significant training resources, and often lack robustness. 
Addressing these challenges, we propose an unsupervised and tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12787v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12787v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12787v1-abstract-full" style="display: none;"> Accurate identification of End-Diastolic (ED) and End-Systolic (ES) frames is key for cardiac function assessment through echocardiography. However, traditional methods face several limitations: they require extensive amounts of data, extensive annotations by medical experts, significant training resources, and often lack robustness. Addressing these challenges, we propose an unsupervised and training-free method. Our approach leverages unsupervised segmentation to enhance fault tolerance against segmentation inaccuracies. By identifying anchor points and analyzing directional deformation, we effectively reduce dependence on the accuracy of the initial segmentation, further enhancing fault tolerance and robustness. Tested on the Echo-dynamic and CAMUS datasets, our method achieves accuracy comparable to learning-based models without their associated drawbacks. The code is available at https://github.com/MRUIL/DDSB <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12787v1-abstract-full').style.display = 'none'; document.getElementById('2403.12787v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11229">arXiv:2403.11229</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11229">pdf</a>, <a href="https://arxiv.org/format/2403.11229">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stitching, Fine-tuning, Re-training: A SAM-enabled Framework for Semi-supervised 3D Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shumeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+L">Lei Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yinghuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11229v2-abstract-short" style="display: inline;"> Segment Anything Model (SAM) fine-tuning has shown remarkable performance in medical image segmentation in a fully supervised manner, but requires precise annotations. To reduce the annotation cost and maintain satisfactory performance, in this work, we leverage the capabilities of SAM for establishing semi-supervised medical image segmentation models.
Rethinking the requirements of effectiveness,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11229v2-abstract-full').style.display = 'inline'; document.getElementById('2403.11229v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11229v2-abstract-full" style="display: none;"> Segment Anything Model (SAM) fine-tuning has shown remarkable performance in medical image segmentation in a fully supervised manner, but requires precise annotations. To reduce the annotation cost and maintain satisfactory performance, in this work, we leverage the capabilities of SAM for establishing semi-supervised medical image segmentation models. Rethinking the requirements of effectiveness, efficiency, and compatibility, we propose a three-stage framework, i.e., Stitching, Fine-tuning, and Re-training (SFR). The current fine-tuning approaches mostly involve 2D slice-wise fine-tuning that disregards the contextual information between adjacent slices. Our stitching strategy mitigates the mismatch between natural and 3D medical images. The stitched images are then used for fine-tuning SAM, providing robust initialization of pseudo-labels. Afterwards, we train a 3D semi-supervised segmentation model while maintaining the same parameter size as the conventional segmenter such as V-Net. Our SFR framework is plug-and-play, and easily compatible with various popular semi-supervised methods. We also develop an extended framework SFR$^+$ with selective fine-tuning and re-training through confidence estimation. Extensive experiments validate that our SFR and SFR$^+$ achieve significant improvements in both moderate annotation and scarce annotation across five datasets. In particular, SFR framework improves the Dice score of Mean Teacher from 29.68% to 74.40% with only one labeled data of LA dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11229v2-abstract-full').style.display = 'none'; document.getElementById('2403.11229v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10039">arXiv:2403.10039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10039">pdf</a>, <a href="https://arxiv.org/format/2403.10039">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Low-quality Optical Flow in Unsupervised Surgical Instrument Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+P">Peiran Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gongyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bergeles%2C+C">Christos Bergeles</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a>, <a href="/search/cs?searchtype=author&amp;query=Dasgupta%2C+P">Prokar Dasgupta</a>, <a href="/search/cs?searchtype=author&amp;query=Granados%2C+A">Alejandro Granados</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien Ourselin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10039v1-abstract-short" style="display: inline;"> Video-based surgical instrument segmentation plays an important role in robot-assisted surgeries. Unlike supervised settings, unsupervised segmentation relies heavily on motion cues, which are challenging to discern due to the typically lower quality of optical flow in surgical footage compared to natural scenes. This presents a considerable burden for the advancement of unsupervised segmentation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10039v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10039v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10039v1-abstract-full" style="display: none;"> Video-based surgical instrument segmentation plays an important role in robot-assisted surgeries. Unlike supervised settings, unsupervised segmentation relies heavily on motion cues, which are challenging to discern due to the typically lower quality of optical flow in surgical footage compared to natural scenes. This presents a considerable burden for the advancement of unsupervised segmentation techniques. In our work, we address the challenge of enhancing model performance despite the inherent limitations of low-quality optical flow. Our methodology employs a three-pronged approach: extracting boundaries directly from the optical flow, selectively discarding frames with inferior flow quality, and employing a fine-tuning process with variable frame rates. We thoroughly evaluate our strategy on the EndoVis2017 VOS dataset and Endovis2017 Challenge dataset, where our model demonstrates promising results, achieving a mean Intersection-over-Union (mIoU) of 0.75 and 0.72, respectively. 
Our findings suggest that our approach can greatly decrease the need for manual annotations in clinical environments and may facilitate the annotation process for new datasets. The code is available at https://github.com/wpr1018001/Rethinking-Low-quality-Optical-Flow.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10039v1-abstract-full').style.display = 'none'; document.getElementById('2403.10039v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.15746">arXiv:2402.15746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.15746">pdf</a>, <a href="https://arxiv.org/format/2402.15746">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Intelligent Director: An Automatic Framework for Dynamic Visual Composition using ChatGPT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+S">Sixiao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.15746v1-abstract-short" style="display: inline;"> With the rise of short video platforms represented by TikTok, the trend of users expressing their creativity through photos and videos has increased dramatically. However, ordinary users lack the professional skills to produce high-quality videos using professional creation software. To meet the demand for intelligent and user-friendly video creation tools, we propose the Dynamic Visual Compositio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15746v1-abstract-full').style.display = 'inline'; document.getElementById('2402.15746v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.15746v1-abstract-full" style="display: none;"> With the rise of short video platforms represented by TikTok, the trend of users expressing their creativity through photos and videos has increased dramatically. However, ordinary users lack the professional skills to produce high-quality videos using professional creation software. To meet the demand for intelligent and user-friendly video creation tools, we propose the Dynamic Visual Composition (DVC) task, an interesting and challenging task that aims to automatically integrate various media elements based on user requirements and create storytelling videos. 
We propose an Intelligent Director framework, utilizing LENS to generate descriptions for images and video frames and combining ChatGPT to generate coherent captions while recommending appropriate music names. Then, the best-matched music is obtained through music retrieval. Then, materials such as captions, images, videos, and music are integrated to seamlessly synthesize the video. Finally, we apply AnimeGANv2 for style transfer. We construct UCF101-DVC and Personal Album datasets and verified the effectiveness of our framework in solving DVC through qualitative and quantitative comparisons, along with user studies, demonstrating its substantial potential. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15746v1-abstract-full').style.display = 'none'; document.getElementById('2402.15746v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://sixiaozheng.github.io/IntelligentDirector/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00496">arXiv:2401.00496</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.00496">pdf</a>, <a href="https://arxiv.org/format/2401.00496">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SAR-RARP50: Segmentation of surgical instrumentation and Action Recognition on Robot-Assisted Radical Prostatectomy Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Psychogyios%2C+D">Dimitrios Psychogyios</a>, <a href="/search/cs?searchtype=author&amp;query=Colleoni%2C+E">Emanuele Colleoni</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Amsterdam%2C+B">Beatrice Van Amsterdam</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chih-Yang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shu-Yu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuchong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+F">Fucang Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+B">Baosheng Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guotai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Boels%2C+M">Maxence Boels</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiayu Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Sparks%2C+R">Rachel Sparks</a>, <a href="/search/cs?searchtype=author&amp;query=Dasgupta%2C+P">Prokar Dasgupta</a>, <a href="/search/cs?searchtype=author&amp;query=Granados%2C+A">Alejandro Granados</a>, <a href="/search/cs?searchtype=author&amp;query=Ourselin%2C+S">Sebastien 
Ourselin</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yanan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+H">Hongliang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yamada%2C+A">Atsushi Yamada</a>, <a href="/search/cs?searchtype=author&amp;query=Harai%2C+Y">Yuriko Harai</a>, <a href="/search/cs?searchtype=author&amp;query=Ishikawa%2C+Y">Yuto Ishikawa</a>, <a href="/search/cs?searchtype=author&amp;query=Hayashi%2C+K">Kazuyuki Hayashi</a> , et al. (25 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00496v2-abstract-short" style="display: inline;"> Surgical tool segmentation and action recognition are fundamental building blocks in many computer-assisted intervention applications, ranging from surgical skills assessment to decision support systems. Nowadays, learning-based action recognition and segmentation approaches outperform classical methods, relying, however, on large, annotated datasets. Furthermore, action recognition and tool segme&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00496v2-abstract-full').style.display = 'inline'; document.getElementById('2401.00496v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00496v2-abstract-full" style="display: none;"> Surgical tool segmentation and action recognition are fundamental building blocks in many computer-assisted intervention applications, ranging from surgical skills assessment to decision support systems. Nowadays, learning-based action recognition and segmentation approaches outperform classical methods, relying, however, on large, annotated datasets. Furthermore, action recognition and tool segmentation algorithms are often trained and make predictions in isolation from each other, without exploiting potential cross-task relationships. With the EndoVis 2022 SAR-RARP50 challenge, we release the first multimodal, publicly available, in-vivo, dataset for surgical action recognition and semantic instrumentation segmentation, containing 50 suturing video segments of Robotic Assisted Radical Prostatectomy (RARP). The aim of the challenge is twofold. First, to enable researchers to leverage the scale of the provided dataset and develop robust and highly accurate single-task action recognition and tool segmentation approaches in the surgical domain. Second, to further explore the potential of multitask-based learning approaches and determine their comparative advantage against their single-task counterparts. A total of 12 teams participated in the challenge, contributing 7 action recognition methods, 9 instrument segmentation techniques, and 4 multitask approaches that integrated both action recognition and instrument segmentation. 
The complete SAR-RARP50 dataset is available at: https://rdr.ucl.ac.uk/projects/SARRARP50_Segmentation_of_surgical_instrumentation_and_Action_Recognition_on_Robot-Assisted_Radical_Prostatectomy_Challenge/191091 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00496v2-abstract-full').style.display = 'none'; document.getElementById('2401.00496v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.00342">arXiv:2311.00342</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.00342">pdf</a>, <a href="https://arxiv.org/format/2311.00342">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> fMRI-PTE: A Large-scale fMRI Pretrained Transformer Encoder for Multi-Subject Brain Activity Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xuelin Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Jianfeng Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.00342v1-abstract-short" style="display: inline;"> The exploration of brain activity and its decoding from fMRI data has been a longstanding pursuit, driven by its potential applications in brain-computer interfaces, medical diagnostics, and virtual reality. Previous approaches have primarily focused on individual subject analysis, highlighting the need for a more universal and adaptable framework, which is the core motivation behind our work. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00342v1-abstract-full').style.display = 'inline'; document.getElementById('2311.00342v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.00342v1-abstract-full" style="display: none;"> The exploration of brain activity and its decoding from fMRI data has been a longstanding pursuit, driven by its potential applications in brain-computer interfaces, medical diagnostics, and virtual reality. Previous approaches have primarily focused on individual subject analysis, highlighting the need for a more universal and adaptable framework, which is the core motivation behind our work. In this work, we propose fMRI-PTE, an innovative auto-encoder approach for fMRI pre-training, with a focus on addressing the challenges of varying fMRI data dimensions due to individual brain differences. 
Our approach involves transforming fMRI signals into unified 2D representations, ensuring consistency in dimensions and preserving distinct brain activity patterns. We introduce a novel learning strategy tailored for pre-training 2D fMRI images, enhancing the quality of reconstruction. fMRI-PTE&#39;s adaptability with image generators enables the generation of well-represented fMRI features, facilitating various downstream tasks, including within-subject and cross-subject brain activity decoding. Our contributions encompass introducing fMRI-PTE, innovative data transformation, efficient training, a novel learning strategy, and the universal applicability of our approach. Extensive experiments validate and support our claims, offering a promising foundation for further research in this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00342v1-abstract-full').style.display = 'none'; document.getElementById('2311.00342v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.16299">arXiv:2309.16299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.16299">pdf</a>, <a href="https://arxiv.org/format/2309.16299">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CasIL: Cognizing and Imitating Skills via a Dual Cognition-Action Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zixuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Z">Ze Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.16299v1-abstract-short" style="display: inline;"> Enabling robots to effectively imitate expert skills in longhorizon tasks such as locomotion, manipulation, and more, poses a long-standing challenge. Existing imitation learning (IL) approaches for robots still grapple with sub-optimal performance in complex tasks. In this paper, we consider how this challenge can be addressed within the human cognitive priors. 
Heuristically, we extend the usual&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16299v1-abstract-full').style.display = 'inline'; document.getElementById('2309.16299v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.16299v1-abstract-full" style="display: none;"> Enabling robots to effectively imitate expert skills in longhorizon tasks such as locomotion, manipulation, and more, poses a long-standing challenge. Existing imitation learning (IL) approaches for robots still grapple with sub-optimal performance in complex tasks. In this paper, we consider how this challenge can be addressed within the human cognitive priors. Heuristically, we extend the usual notion of action to a dual Cognition (high-level)-Action (low-level) architecture by introducing intuitive human cognitive priors, and propose a novel skill IL framework through human-robot interaction, called Cognition-Action-based Skill Imitation Learning (CasIL), for the robotic agent to effectively cognize and imitate the critical skills from raw visual demonstrations. CasIL enables both cognition and action imitation, while high-level skill cognition explicitly guides low-level primitive actions, providing robustness and reliability to the entire skill IL process. We evaluated our method on MuJoCo and RLBench benchmarks, as well as on the obstacle avoidance and point-goal navigation tasks for quadrupedal robot locomotion. Experimental results show that our CasIL consistently achieves competitive and robust skill imitation capability compared to other counterparts in a variety of long-horizon robotic tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16299v1-abstract-full').style.display = 'none'; document.getElementById('2309.16299v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.13897">arXiv:2308.13897</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.13897">pdf</a>, <a href="https://arxiv.org/format/2308.13897">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yanqi Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+T">Tianyu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.13897v2-abstract-short" style="display: inline;"> Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant challenge that existing approaches struggle to address without extensive modifications to vanilla NeRF framework. We introduce InsertNeRF, a method for INStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play HyperNet modules, InsertNeRF dynamically tailors NeRF&#39;s weights to specific reference scenes, trans&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13897v2-abstract-full').style.display = 'inline'; document.getElementById('2308.13897v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.13897v2-abstract-full" style="display: none;"> Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant challenge that existing approaches struggle to address without extensive modifications to vanilla NeRF framework. We introduce InsertNeRF, a method for INStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play HyperNet modules, InsertNeRF dynamically tailors NeRF&#39;s weights to specific reference scenes, transforming multi-scale sampling-aware features into scene-specific representations. This novel design allows for more accurate and efficient representations of complex appearances and geometries. Experiments show that this method not only achieves superior generalization performance but also provides a flexible pathway for integration with other NeRF-like systems, even in sparse input settings. Code will be available https://github.com/bbbbby-99/InsertNeRF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13897v2-abstract-full').style.display = 'none'; document.getElementById('2308.13897v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work was accepted at ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.09923">arXiv:2308.09923</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.09923">pdf</a>, <a href="https://arxiv.org/format/2308.09923">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> East: Efficient and Accurate Secure Transformer Framework for Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yuanchao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Hua Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Y">Yewei Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weixin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jiarong Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Z">Zhenyu Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiyong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09923v1-abstract-short" style="display: inline;"> Transformer has been successfully used in practical applications, such as ChatGPT, due to its powerful advantages. However, users&#39; input is leaked to the model provider during the service. With people&#39;s attention to privacy, privacy-preserving Transformer inference is on the demand of such services. Secure protocols for non-linear functions are crucial in privacy-preserving Transformer inference,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09923v1-abstract-full').style.display = 'inline'; document.getElementById('2308.09923v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09923v1-abstract-full" style="display: none;"> Transformer has been successfully used in practical applications, such as ChatGPT, due to its powerful advantages. However, users&#39; input is leaked to the model provider during the service. With people&#39;s attention to privacy, privacy-preserving Transformer inference is on the demand of such services. Secure protocols for non-linear functions are crucial in privacy-preserving Transformer inference, which are not well studied. Thus, designing practical secure protocols for non-linear functions is hard but significant to model performance. In this work, we propose a framework \emph{East} to enable efficient and accurate secure Transformer inference. Firstly, we propose a new oblivious piecewise polynomial evaluation algorithm and apply it to the activation functions, which reduces the runtime and communication of GELU by over 1.5$\times$ and 2.5$\times$, compared to prior arts. 
Secondly, the secure protocols for softmax and layer normalization are carefully designed to faithfully maintain the desired functionality. Thirdly, several optimizations are conducted in detail to enhance the overall efficiency. We applied \emph{East} to BERT and the results show that the inference accuracy remains consistent with the plaintext inference without fine-tuning. Compared to Iron, we achieve about 1.8$\times$ lower communication within 1.2$\times$ lower runtime. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09923v1-abstract-full').style.display = 'none'; document.getElementById('2308.09923v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02908">arXiv:2308.02908</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02908">pdf</a>, <a href="https://arxiv.org/format/2308.02908">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Where and How: Mitigating Confusion in Neural Radiance Fields from Sparse Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yanqi Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+T">Tianyu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xinyue Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02908v1-abstract-short" style="display: inline;"> Neural Radiance Fields from Sparse input (NeRF-S) have shown great potential in synthesizing novel views with a limited number of observed viewpoints. However, due to the inherent limitations of sparse inputs and the gap between non-adjacent views, rendering results often suffer from over-fitting and foggy surfaces, a phenomenon we refer to as &#34;CONFUSION&#34; during volume rendering. In this paper, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02908v1-abstract-full').style.display = 'inline'; document.getElementById('2308.02908v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02908v1-abstract-full" style="display: none;"> Neural Radiance Fields from Sparse input (NeRF-S) have shown great potential in synthesizing novel views with a limited number of observed viewpoints. However, due to the inherent limitations of sparse inputs and the gap between non-adjacent views, rendering results often suffer from over-fitting and foggy surfaces, a phenomenon we refer to as &#34;CONFUSION&#34; during volume rendering.
arXiv:2307.01220 (https://arxiv.org/abs/2307.01220) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: ARHNet: Adaptive Region Harmonization for Lesion-aware Augmentation to Improve Segmentation Performance
Authors: Jiayu Huo, Yang Liu, Xi Ouyang, Alejandro Granados, Sebastien Ourselin, Rachel Sparks
Abstract: Accurately segmenting brain lesions in MRI scans is critical for providing patients with prognoses and neurological monitoring. However, the performance of CNN-based segmentation methods is constrained by the limited training set size. Advanced data augmentation is an effective strategy to improve a model's robustness, but it often introduces intensity disparities between foreground and background areas, as well as boundary artifacts, which weaken its effectiveness. In this paper, we propose a foreground harmonization framework (ARHNet) to tackle intensity disparities and make synthetic images look more realistic. In particular, we propose an Adaptive Region Harmonization (ARH) module that dynamically aligns foreground feature maps to the background with an attention mechanism. We demonstrate the efficacy of our method in improving segmentation performance using real and synthetic images. Experimental results on the ATLAS 2.0 dataset show that ARHNet outperforms other methods on image harmonization tasks and boosts downstream segmentation performance. Our code is publicly available at https://github.com/King-HAW/ARHNet.
Submitted: 2 July 2023; originally announced July 2023.
Comments: 9 pages, 4 figures, 3 tables.

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.11510">arXiv:2306.11510</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.11510">pdf</a>, <a href="https://arxiv.org/format/2306.11510">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pushing the Limits of 3D Shape Generation at Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xuelin Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Bo Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.11510v2-abstract-short" style="display: inline;"> We present a significant breakthrough in 3D shape generation by scaling it to unprecedented dimensions. Through the adaptation of the Auto-Regressive model and the utilization of large language models, we have developed a remarkable model with an astounding 3.6 billion trainable parameters, establishing it as the largest 3D shape generation model to date, named Argus-3D. Our approach addresses the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11510v2-abstract-full').style.display = 'inline'; document.getElementById('2306.11510v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.11510v2-abstract-full" style="display: none;"> We present a significant breakthrough in 3D shape generation by scaling it to unprecedented dimensions. Through the adaptation of the Auto-Regressive model and the utilization of large language models, we have developed a remarkable model with an astounding 3.6 billion trainable parameters, establishing it as the largest 3D shape generation model to date, named Argus-3D. Our approach addresses the limitations of existing methods by enhancing the quality and diversity of generated 3D shapes. To tackle the challenges of high-resolution 3D shape generation, our model incorporates tri-plane features as latent representations, effectively reducing computational complexity. Additionally, we introduce a discrete codebook for efficient quantization of these representations. Leveraging the power of transformers, we enable multi-modal conditional generation, facilitating the production of diverse and visually impressive 3D shapes. To train our expansive model, we leverage an ensemble of publicly-available 3D datasets, consisting of a comprehensive collection of approximately 900,000 objects from renowned repositories such as ModelNet40, ShapeNet, Pix3D, 3D-Future, and Objaverse. 
This diverse dataset empowers our model to learn from a wide range of object variations, bolstering its ability to generate high-quality and diverse 3D shapes. Extensive experimentation demonstrate the remarkable efficacy of our approach in significantly improving the visual quality of generated 3D shapes. By pushing the boundaries of 3D generation, introducing novel methods for latent representation learning, and harnessing the power of transformers for multi-modal conditional generation, our contributions pave the way for substantial advancements in the field. Our work unlocks new possibilities for applications in gaming, virtual reality, product design, and other domains that demand high-quality and diverse 3D objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11510v2-abstract-full').style.display = 'none'; document.getElementById('2306.11510v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://argus-3d.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17102">arXiv:2305.17102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.17102">pdf</a>, <a href="https://arxiv.org/format/2305.17102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jingyang Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qiang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Boyan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haitao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.17102v2-abstract-short" style="display: inline;"> Most existing works solving Room-to-Room VLN problem only utilize RGB images and do not consider local context around candidate views, which lack sufficient visual cues about surrounding environment. Moreover, natural language contains complex semantic information thus its correlations with visual inputs are hard to model merely with cross attention. 
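The discrete codebook mentioned above is, in spirit, a vector-quantization step: continuous latent features are replaced by their nearest codebook entries, and the resulting indices become the discrete tokens a generative prior can model. A minimal sketch, with an assumed codebook size and dimensionality rather than Argus-3D's actual configuration:

import numpy as np

rng = np.random.default_rng(0)
K, D = 512, 64                      # codebook size and latent dimension (illustrative)
codebook = rng.normal(size=(K, D))

def quantize(z):
    # Map each latent vector to its nearest codebook entry; return indices and quantized vectors.
    z = np.atleast_2d(z)                                          # (N, D)
    d2 = ((z[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)    # (N, K) squared distances
    idx = d2.argmin(axis=1)                                       # discrete codes for the prior
    return idx, codebook[idx]

latents = rng.normal(size=(4, D))   # e.g. tri-plane features sampled at four locations
codes, z_q = quantize(latents)
print(codes.shape, z_q.shape)       # (4,) and (4, 64)
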
arXiv:2305.17102 (https://arxiv.org/abs/2305.17102) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation
Authors: Jingyang Huo, Qiang Sun, Boyan Jiang, Haitao Lin, Yanwei Fu
Abstract: Most existing works on the Room-to-Room VLN problem utilize only RGB images and do not consider the local context around candidate views, and therefore lack sufficient visual cues about the surrounding environment. Moreover, natural language contains complex semantic information, so its correlations with visual inputs are hard to model with cross attention alone. In this paper, we propose GeoVLN, which learns geometry-enhanced visual representations based on slot attention for robust vision-and-language navigation. The RGB images are complemented with corresponding depth maps and normal maps predicted by Omnidata as visual inputs. Technically, we introduce a two-stage module that combines local slot attention and the CLIP model to produce geometry-enhanced representations from these inputs. We employ V&L BERT to learn a cross-modal representation that incorporates both language and vision information. Additionally, a novel multiway attention module is designed, encouraging different phrases of the input instruction to exploit the most relevant features of the visual input. Extensive experiments demonstrate the effectiveness of the newly designed modules and show the compelling performance of the proposed method.
Submitted: 2 October 2023; v1 submitted 26 May 2023; originally announced May 2023.
Comments: Accepted by CVPR 2023.

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15486v1-abstract-full').style.display = 'none'; document.getElementById('2211.15486v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Challenge Report, 1st place in 2022 MICCAI ATLAS Challenge</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.14516">arXiv:2211.14516</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.14516">pdf</a>, <a href="https://arxiv.org/format/2211.14516">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Unified Framework for Contrastive Learning from a Perspective of Affinity Matrix </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+M">Meihao Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xuesong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.14516v1-abstract-short" style="display: inline;"> In recent years, a variety of contrastive learning based unsupervised visual representation learning methods have been designed and achieved great success in many visual tasks. Generally, these methods can be roughly classified into four categories: (1) standard contrastive methods with an InfoNCE like loss, such as MoCo and SimCLR; (2) non-contrastive methods with only positive pairs, such as BYO&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14516v1-abstract-full').style.display = 'inline'; document.getElementById('2211.14516v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.14516v1-abstract-full" style="display: none;"> In recent years, a variety of contrastive learning based unsupervised visual representation learning methods have been designed and achieved great success in many visual tasks. Generally, these methods can be roughly classified into four categories: (1) standard contrastive methods with an InfoNCE like loss, such as MoCo and SimCLR; (2) non-contrastive methods with only positive pairs, such as BYOL and SimSiam; (3) whitening regularization based methods, such as W-MSE and VICReg; and (4) consistency regularization based methods, such as CO2. 
arXiv:2211.14516 (https://arxiv.org/abs/2211.14516) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: A Unified Framework for Contrastive Learning from a Perspective of Affinity Matrix
Authors: Wenbin Li, Meihao Kong, Xuesong Yang, Lei Wang, Jing Huo, Yang Gao, Jiebo Luo
Abstract: In recent years, a variety of contrastive-learning-based unsupervised visual representation learning methods have been designed and have achieved great success in many visual tasks. Generally, these methods can be roughly classified into four categories: (1) standard contrastive methods with an InfoNCE-like loss, such as MoCo and SimCLR; (2) non-contrastive methods with only positive pairs, such as BYOL and SimSiam; (3) whitening-regularization-based methods, such as W-MSE and VICReg; and (4) consistency-regularization-based methods, such as CO2. In this study, we present a new unified contrastive learning representation framework (named UniCLR) that covers all four kinds of methods from the novel perspective of a basic affinity matrix. Three variants, i.e., SimAffinity, SimWhitening and SimTrace, are presented based on UniCLR. In addition, a simple symmetric loss is proposed as a new consistency regularization term based on this framework: by symmetrizing the affinity matrix, we can effectively accelerate convergence of training. Extensive experiments show that (1) the proposed UniCLR framework achieves results on par with, and even better than, the state of the art, (2) the proposed symmetric loss significantly accelerates model convergence, and (3) SimTrace avoids mode collapse by maximizing the trace of a whitened affinity matrix, without relying on asymmetric designs or stop-gradients.
Submitted: 26 November 2022; originally announced November 2022.
Comments: 12 pages.

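A minimal sketch of the affinity-matrix view: embeddings of two augmented views give an N x N similarity matrix, and a symmetry term penalises the difference between that matrix and its transpose. The temperature and the exact form of the penalty are assumptions for illustration; the paper's own losses are not reproduced here.

import numpy as np

def l2_normalize(x, eps=1e-8):
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)

def affinity(z1, z2, tau=0.1):
    # Cosine-similarity affinity matrix between two augmented views (tau is illustrative).
    return l2_normalize(z1) @ l2_normalize(z2).T / tau            # (N, N)

def symmetry_penalty(A):
    # Consistency term encouraging A to equal its transpose (assumed form).
    return np.mean((A - A.T) ** 2)

rng = np.random.default_rng(0)
z1 = rng.normal(size=(8, 128))     # embeddings of view 1
z2 = rng.normal(size=(8, 128))     # embeddings of view 2
A = affinity(z1, z2)
print(A.shape, symmetry_penalty(A))   # (8, 8) and a non-negative scalar
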
arXiv:2210.03591 (https://arxiv.org/abs/2210.03591) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Modeling Inter-Class and Intra-Class Constraints in Novel Class Discovery
Authors: Wenbin Li, Zhichen Fan, Jing Huo, Yang Gao
Abstract: Novel class discovery (NCD) aims at learning a model that transfers common knowledge from a class-disjoint labelled dataset to another, unlabelled dataset and discovers new classes (clusters) within it. Many methods, together with elaborate training pipelines and appropriate objectives, have been proposed and have considerably boosted performance on NCD tasks. Despite all this, we find that existing methods do not sufficiently take advantage of the essence of the NCD setting. To this end, we propose to model both inter-class and intra-class constraints in NCD based on the symmetric Kullback-Leibler divergence (sKLD). Specifically, we propose an inter-class sKLD constraint to effectively exploit the disjoint relationship between labelled and unlabelled classes, enforcing separability of different classes in the embedding space. In addition, we present an intra-class sKLD constraint to explicitly constrain the relationship between a sample and its augmentations and, at the same time, ensure the stability of training. We conduct extensive experiments on the popular CIFAR10, CIFAR100 and ImageNet benchmarks and demonstrate that our method establishes a new state of the art, with significant performance improvements, e.g., 3.5%/3.7% clustering accuracy gains on the CIFAR100-50 dataset split under the task-aware/task-agnostic evaluation protocols, over previous state-of-the-art methods. Code is available at https://github.com/FanZhichen/NCD-IIC.
Submitted: 23 March 2023; v1 submitted 7 October 2022; originally announced October 2022.
Comments: Accepted to CVPR 2023.

arXiv:2208.09612 (https://arxiv.org/abs/2208.09612) [pdf, other]
Subjects: cs.IR (Information Retrieval)
Title: AntCritic: Argument Mining for Free-Form and Visually-Rich Financial Comments
Authors: Huadai Liu, Wenqiang Xu, Xuan Lin, Jingjing Huo, Hong Chen, Zhou Zhao
Abstract: Argument mining aims to automatically detect all possible argumentative components and identify their relationships. As a thriving task in natural language processing, the field has accumulated a large number of corpora for academic study and application development. However, research in this area is still constrained by the inherent limitations of existing datasets. Specifically, the publicly available datasets are relatively small in scale, and few of them provide information from other modalities to facilitate the learning process. Moreover, the statements and expressions in these corpora are usually in a compact form, which restricts the generalization ability of models. To this end, we collect a novel dataset, AntCritic, to serve as a helpful complement to this area; it consists of about 10k free-form and visually-rich financial comments and supports both argument component detection and argument relation prediction. Besides, to cope with the challenges brought by scenario expansion, we thoroughly explore a fine-grained relation prediction and structure reconstruction scheme and discuss the encoding mechanism for visual styles and layouts. On this basis, we design two simple but effective model architectures and conduct various experiments on this dataset to provide benchmark performances as a reference and to verify the practicability of the proposed architectures. We release our data and code in this link; the dataset follows the CC BY-NC-ND 4.0 license.
Submitted: 30 May 2024; v1 submitted 20 August 2022; originally announced August 2022.

arXiv:2208.03988 (https://arxiv.org/abs/2208.03988) [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: Fuzzing Microservices: A Series of User Studies in Industry on Industrial Systems with EvoMaster
Authors: Man Zhang, Andrea Arcuri, Yonggang Li, Yang Liu, Kaiming Xue, Zhao Wang, Jian Huo, Weiwei Huang
Abstract: With several microservice architectures comprising thousands of web services, used to serve 630 million customers, companies like Meituan face major challenges in the verification and validation of their software. This paper reports on our experience of integrating EvoMaster (a search-based white-box fuzzer) into the testing processes at Meituan over almost two years. Two user studies were carried out, in 2021 and in 2023, to evaluate two versions of EvoMaster on test generation for industrial web services that are part of a large e-commerce microservice system. The two user studies involve in total 321,131 lines of code from five APIs and 27 industrial participants at Meituan, with questionnaires and interviews conducted in both studies. They demonstrate clear advantages of EvoMaster (i.e., code coverage and fault detection) and the urgent need for such a fuzzer in industrial microservice testing. To study how these results generalize, a follow-up user study was carried out in 2024 with five engineers at five different companies. Our results show that, besides its clear usefulness, there are still many critical challenges that the research community needs to investigate to improve performance further.
Submitted: 22 August 2024; v1 submitted 8 August 2022; originally announced August 2022.

arXiv:2208.03203 (https://arxiv.org/abs/2208.03203) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: Brain Lesion Synthesis via Progressive Adversarial Variational Auto-Encoder
Authors: Jiayu Huo, Vejay Vakharia, Chengyuan Wu, Ashwini Sharan, Andrew Ko, Sebastien Ourselin, Rachel Sparks
Abstract: Laser interstitial thermal therapy (LITT) is a novel minimally invasive treatment used to ablate intracranial structures to treat mesial temporal lobe epilepsy (MTLE). Region-of-interest (ROI) segmentation before and after LITT would enable automated lesion quantification to objectively assess treatment efficacy. Deep learning techniques such as convolutional neural networks (CNNs) are the state of the art for ROI segmentation but require large amounts of annotated data for training. However, collecting large datasets for emerging treatments such as LITT is impractical. In this paper, we propose a progressive brain lesion synthesis framework (PAVAE) to expand both the quantity and the diversity of the training dataset. Concretely, our framework consists of two sequential networks: a mask synthesis network and a mask-guided lesion synthesis network. To better employ extrinsic information and provide additional supervision during network training, we design a condition embedding block (CEB) and a mask embedding block (MEB) to encode inherent conditions of masks into the feature space. Finally, a segmentation network is trained using raw and synthetic lesion images to evaluate the effectiveness of the proposed framework. Experimental results show that our method achieves realistic synthetic results and boosts downstream segmentation performance beyond traditional data augmentation techniques.
Submitted: 5 August 2022; originally announced August 2022.
Comments: 11 pages, 4 figures; accepted by the International Workshop on Simulation and Synthesis in Medical Imaging (SASHIMI 2022).

arXiv:2203.13802 (https://arxiv.org/abs/2203.13802) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: Playing Lottery Tickets in Style Transfer Models
Authors: Meihao Kong, Jing Huo, Wenbin Li, Jing Wu, Yu-Kun Lai, Yang Gao
Abstract: Style transfer has achieved great success and attracted wide attention from both the academic and industrial communities due to its flexible application scenarios. However, the dependence on a fairly large VGG-based autoencoder gives existing style transfer models high parameter complexity, which limits their use on resource-constrained devices. Compared with many other tasks, the compression of style transfer models has been little explored. Recently, the lottery ticket hypothesis (LTH) has shown great potential for finding extremely sparse matching subnetworks that, when trained in isolation, achieve performance on par with or even better than the original full networks. In this work, we perform, for the first time, an empirical study to verify whether such trainable matching subnetworks also exist in style transfer models. Specifically, we take the two most popular style transfer models, AdaIN and SANet, as the main testbeds; they represent global- and local-transformation-based style transfer methods, respectively. We carry out extensive experiments and comprehensive analysis, and draw the following conclusions. (1) Compared with fixing the VGG encoder, style transfer models benefit more from training the whole network together. (2) Using iterative magnitude pruning, we find matching subnetworks at 89.2% sparsity in AdaIN and 73.7% sparsity in SANet, which demonstrates that style transfer models can play lottery tickets too. (3) The feature transformation module should also be pruned to obtain a much sparser model without affecting the existence and quality of the matching subnetworks. (4) Besides AdaIN and SANet, other models such as LST, MANet, AdaAttN and MCCNet can also play lottery tickets, which shows that LTH generalizes to various style transfer models.
Submitted: 10 April 2022; v1 submitted 25 March 2022; originally announced March 2022.

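Iterative magnitude pruning, as named in conclusion (2), follows a generic recipe: train, prune the smallest-magnitude surviving weights, rewind to the initial weights, and repeat. A minimal sketch with a stand-in training step (the pruning fraction, number of rounds, and per-tensor pruning are assumptions, not the paper's exact setup):

import numpy as np

def iterative_magnitude_pruning(w_init, train, rounds=5, prune_frac=0.2):
    # w_init: initial weights to rewind to; train: callable(w, mask) -> trained weights under the mask.
    mask = np.ones_like(w_init)
    for _ in range(rounds):
        w = train(w_init * mask, mask)                # train the current subnetwork from the rewound weights
        alive = np.abs(w[mask == 1])
        thresh = np.quantile(alive, prune_frac)       # prune the smallest surviving magnitudes
        mask = mask * (np.abs(w) > thresh)
    return mask

rng = np.random.default_rng(0)
w0 = rng.normal(size=(64, 64))
dummy_train = lambda w, m: w + 0.01 * rng.normal(size=w.shape) * m   # stand-in for real training
mask = iterative_magnitude_pruning(w0, dummy_train)
print(round(1.0 - mask.mean(), 3))    # sparsity after 5 rounds of ~20% pruning (roughly 0.67)
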
arXiv:2112.12349 (https://arxiv.org/abs/2112.12349) [pdf, other]  doi: https://doi.org/10.1109/TMI.2020.3042773
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Learning Hierarchical Attention for Weakly-supervised Chest X-Ray Abnormality Localization and Diagnosis
Authors: Xi Ouyang, Srikrishna Karanam, Ziyan Wu, Terrence Chen, Jiayu Huo, Xiang Sean Zhou, Qian Wang, Jie-Zhi Cheng
Abstract: We consider the problem of abnormality localization for clinical applications. While deep learning has driven much recent progress in medical imaging, many clinical challenges are not fully addressed, limiting its broader usage. Recent methods report high diagnostic accuracies, yet physicians are reluctant to trust these results for diagnostic decision-making because the algorithms generally lack decision reasoning and interpretability. One potential way to address this problem is to further train these models to localize abnormalities in addition to classifying them. However, doing this accurately requires a large number of disease-localization annotations from clinical experts, which is prohibitively expensive for most applications. In this work, we take a step towards addressing these issues with a new attention-driven, weakly supervised algorithm comprising a hierarchical attention mining framework that unifies activation- and gradient-based visual attention in a holistic manner. Our key algorithmic innovations include explicit ordinal attention constraints, which enable principled model training in a weakly supervised fashion while also facilitating the generation of visual-attention-driven model explanations by means of localization cues. On two large-scale chest X-ray datasets (NIH ChestX-ray14 and CheXpert), we demonstrate significant localization performance improvements over the current state of the art while also achieving competitive classification performance. Our code is available at https://github.com/oyxhust/HAM.
Submitted: 22 December 2021; originally announced December 2021.
Journal reference: IEEE Transactions on Medical Imaging, 2021.

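Of the two attention ingredients named above, the gradient-based one is the easier to illustrate; the sketch below computes a Grad-CAM-style map from a layer's activations and the gradients of a class score with respect to them. This is a generic illustration of gradient-based visual attention, not the paper's hierarchical attention mining framework.

import numpy as np

def gradient_attention_map(features, grads):
    # features: (C, H, W) activations of a conv layer for one image.
    # grads:    (C, H, W) gradients of the target class score w.r.t. those activations.
    weights = grads.mean(axis=(1, 2))                                  # (C,) pooled gradients
    cam = np.maximum((weights[:, None, None] * features).sum(axis=0), 0.0)
    return cam / (cam.max() + 1e-8)                                    # coarse localisation map in [0, 1]

rng = np.random.default_rng(0)
feats = rng.random((256, 14, 14))
grads = rng.normal(size=(256, 14, 14))
print(gradient_attention_map(feats, grads).shape)   # (14, 14)
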
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Medical Imaging 2021 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huo%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 