
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 266 results for author: <span class="mathjax">Ye, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Ye%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ye, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ye%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ye, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+M&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14507">arXiv:2502.14507</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14507">pdf</a>, <a href="https://arxiv.org/format/2502.14507">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can LLMs Simulate L2-English Dialogue? 
An Information-Theoretic Analysis of L1-Dependent Biases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+R">Rena Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xuetong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kuribayashi%2C+T">Tatsuki Kuribayashi</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mingrui Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+S">Siya Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Roever%2C+C">Carsten Roever</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuanxing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Z">Zheng Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Lau%2C+J+H">Jey Han Lau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14507v1-abstract-short" style="display: inline;"> This study evaluates Large Language Models&#39; (LLMs) ability to simulate non-native-like English use observed in human second language (L2) learners interfered with by their native first language (L1). In dialogue-based interviews, we prompt LLMs to mimic L2 English learners with specific L1s (e.g., Japanese, Thai, Urdu) across seven languages, comparing their outputs to real L2 learner data. Our an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14507v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14507v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14507v1-abstract-full" style="display: none;"> This study evaluates Large Language Models&#39; (LLMs) ability to simulate non-native-like English use observed in human second language (L2) learners interfered with by their native first language (L1). In dialogue-based interviews, we prompt LLMs to mimic L2 English learners with specific L1s (e.g., Japanese, Thai, Urdu) across seven languages, comparing their outputs to real L2 learner data. Our analysis examines L1-driven linguistic biases, such as reference word usage and avoidance behaviors, using information-theoretic and distributional density measures. Results show that modern LLMs (e.g., Qwen2.5, LLAMA3.3, DeepseekV3, GPT-4o) replicate L1-dependent patterns observed in human L2 data, with distinct influences from various languages (e.g., Japanese, Korean, and Mandarin significantly affect tense agreement, and Urdu influences noun-verb collocations). Our results reveal the potential of LLMs for L2 dialogue generation and evaluation for future educational applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14507v1-abstract-full').style.display = 'none'; document.getElementById('2502.14507v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
2. arXiv:2502.01980 [pdf, other] cs.LG cs.AI
Title: Generative Data Mining with Longtail-Guided Diffusion
Authors: David S. Hayden, Mao Ye, Timur Garipov, Gregory P. Meyer, Carl Vondrick, Zhao Chen, Yuning Chai, Eric Wolff, Siddhartha S. Srinivasa
Abstract: It is difficult to anticipate the myriad challenges that a predictive model will encounter once deployed. Common practice entails a reactive, cyclical approach: model deployment, data mining, and retraining. We instead develop a proactive longtail discovery process by imagining additional data during training. In particular, we develop general model-based longtail signals, including a differentiable, single forward pass formulation of epistemic uncertainty that does not impact model parameters or predictive performance but can flag rare or hard inputs. We leverage these signals as guidance to generate additional training data from a latent diffusion model in a process we call Longtail Guidance (LTG). Crucially, we can perform LTG without retraining the diffusion model or the predictive model, and we do not need to expose the predictive model to intermediate diffusion states. Data generated by LTG exhibit semantically meaningful variation, yield significant generalization improvements on image classification benchmarks, and can be analyzed to proactively discover, explain, and address conceptual gaps in a predictive model.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 20 pages

3. arXiv:2501.06590 [pdf, other] cs.CL cs.AI
Title: ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning
Authors: Xiangru Tang, Tianyu Hu, Muyang Ye, Yanjun Shao, Xunjian Yin, Siru Ouyang, Wangchunshu Zhou, Pan Lu, Zhuosheng Zhang, Yilun Zhao, Arman Cohan, Mark Gerstein
Abstract: Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent
Submitted 11 January, 2025; originally announced January 2025.
4. arXiv:2501.03223 [pdf, other] cs.CV cs.DC cs.LG
Title: Rate-My-LoRA: Efficient and Adaptive Federated Model Tuning for Cardiac MRI Segmentation
Authors: Xiaoxiao He, Haizhou Shi, Ligong Han, Chaowei Tan, Bo Liu, Zihao Xu, Meng Ye, Leon Axel, Kang Li, Dimitris Metaxas
Abstract: Cardiovascular disease (CVD) and cardiac dyssynchrony are major public health problems in the United States. Precise cardiac image segmentation is crucial for extracting quantitative measures that help categorize cardiac dyssynchrony. However, achieving high accuracy often depends on centralizing large datasets from different hospitals, which can be challenging due to privacy concerns. To solve this problem, Federated Learning (FL) is proposed to enable decentralized model training on such data without exchanging sensitive information. However, bandwidth limitations and data heterogeneity remain significant challenges in conventional FL algorithms. In this paper, we propose a novel, efficient, and adaptive federated learning method for cardiac segmentation that improves model performance while reducing the bandwidth requirement. Our method leverages low-rank adaptation (LoRA) to regularize model weight updates and reduce communication overhead. We also propose a \mymethod{} aggregation technique to address data heterogeneity among clients. This technique adaptively penalizes the aggregated weights from different clients by comparing the validation accuracy in each client, allowing better generalization performance and fast local adaptation. In-client and cross-client evaluations on public cardiac MR datasets demonstrate the superiority of our method over other LoRA-based federated learning approaches.
Submitted 6 January, 2025; originally announced January 2025.
Comments: Accepted in ISBI 2025

5. arXiv:2412.16381 [pdf, other] cs.CV cs.AI cs.HC
Title: VerSe: Integrating Multiple Queries as Prompts for Versatile Cardiac MRI Segmentation
Authors: Bangwei Guo, Meng Ye, Yunhe Gao, Bingyu Xin, Leon Axel, Dimitris Metaxas
Abstract: Despite advances in learning-based image segmentation approaches, the accurate segmentation of cardiac structures from magnetic resonance imaging (MRI) remains a critical challenge. While existing automatic segmentation methods have shown promise, they still require extensive manual corrections of the segmentation results by human experts, particularly in complex regions such as the basal and apical parts of the heart. Recent efforts have been made to develop interactive image segmentation methods that enable human-in-the-loop learning. However, they are semi-automatic and inefficient, due to their reliance on click-based prompts, especially for 3D cardiac MRI volumes. To address these limitations, we propose VerSe, a Versatile Segmentation framework to unify automatic and interactive segmentation through multiple queries. Our key innovation lies in the joint learning of object and click queries as prompts for a shared segmentation backbone. VerSe supports both fully automatic segmentation, through object queries, and interactive mask refinement, by providing click queries when needed. With the proposed integrated prompting scheme, VerSe demonstrates significant improvement in performance and efficiency over existing methods, on both cardiac MRI and out-of-distribution medical imaging datasets. The code is available at https://github.com/bangwayne/Verse.
Submitted 20 December, 2024; originally announced December 2024.

6. arXiv:2412.15628 [pdf, other] cs.CL
Title: Can Input Attributions Interpret the Inductive Reasoning Process in In-Context Learning?
Authors: Mengyu Ye, Tatsuki Kuribayashi, Goro Kobayashi, Jun Suzuki
Abstract: Interpreting the internal process of neural models has long been a challenge. This challenge remains relevant in the era of large language models (LLMs) and in-context learning (ICL); for example, ICL poses a new issue of interpreting which example in the few-shot examples contributed to identifying/solving the task. To this end, in this paper, we design synthetic diagnostic tasks of inductive reasoning, inspired by the generalization tests in linguistics; here, most in-context examples are ambiguous w.r.t. their underlying rule, and one critical example disambiguates the task demonstrated. The question is whether conventional input attribution (IA) methods can track such a reasoning process, i.e., identify the influential example, in ICL. Our experiments provide several practical findings; for example, a certain simple IA method works the best, and the larger the model, the generally harder it is to interpret the ICL with gradient-based IA methods.
Submitted 18 February, 2025; v1 submitted 20 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02983">arXiv:2412.02983</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.02983">pdf</a>, <a href="https://arxiv.org/format/2412.02983">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Is Foreground Prototype Sufficient? Few-Shot Medical Image Segmentation with Background-Fused Prototype </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Song Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zu%2C+C">Chunxiao Zu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+W">Wenxin Su</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuan Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+Y">Yan Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiatian Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02983v1-abstract-short" style="display: inline;"> Few-shot Semantic Segmentation(FSS)aim to adapt a pre-trained model to new classes with as few as a single labeled training sample per class. The existing prototypical work used in natural image scenarios biasedly focus on capturing foreground&#39;s discrimination while employing a simplistic representation for background, grounded on the inherent observation separation between foreground and backgrou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02983v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02983v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02983v1-abstract-full" style="display: none;"> Few-shot Semantic Segmentation(FSS)aim to adapt a pre-trained model to new classes with as few as a single labeled training sample per class. The existing prototypical work used in natural image scenarios biasedly focus on capturing foreground&#39;s discrimination while employing a simplistic representation for background, grounded on the inherent observation separation between foreground and background. However, this paradigm is not applicable to medical images where the foreground and background share numerous visual features, necessitating a more detailed description for background. In this paper, we present a new pluggable Background-fused prototype(Bro)approach for FSS in medical images. Instead of finding a commonality of background subjects in support image, Bro incorporates this background with two pivot designs. Specifically, Feature Similarity Calibration(FeaC)initially reduces noise in the support image by employing feature cross-attention with the query image. Subsequently, Hierarchical Channel Adversarial Attention(HiCA)merges the background into comprehensive prototypes. 
We achieve this by a channel groups-based attention mechanism, where an adversarial Mean-Offset structure encourages a coarse-to-fine fusion. Extensive experiments show that previous state-of-the-art methods, when paired with Bro, experience significant performance improvements. This demonstrates a more integrated way to represent backgrounds specifically for medical image. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02983v1-abstract-full').style.display = 'none'; document.getElementById('2412.02983v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02270">arXiv:2412.02270</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.02270">pdf</a>, <a href="https://arxiv.org/format/2412.02270">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Sustainable Self-evolution Adversarial Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenglei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H">Huihui Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Menghao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+X">Xuelin Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02270v1-abstract-short" style="display: inline;"> With the wide application of deep neural network models in various computer vision tasks, there has been a proliferation of adversarial example generation strategies aimed at deeply exploring model security. However, existing adversarial training defense models, which rely on single or limited types of attacks under a one-time learning process, struggle to adapt to the dynamic and evolving nature&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02270v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02270v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02270v1-abstract-full" style="display: none;"> With the wide application of deep neural network models in various computer vision tasks, there has been a proliferation of adversarial example generation strategies aimed at deeply exploring model security. However, existing adversarial training defense models, which rely on single or limited types of attacks under a one-time learning process, struggle to adapt to the dynamic and evolving nature of attack methods. 
Therefore, to achieve defense performance improvements for models in long-term applications, we propose a novel Sustainable Self-Evolution Adversarial Training (SSEAT) framework. Specifically, we introduce a continual adversarial defense pipeline to realize learning from various kinds of adversarial examples across multiple stages. Additionally, to address the issue of model catastrophic forgetting caused by continual learning from ongoing novel attacks, we propose an adversarial data replay module to better select more diverse and key relearning data. Furthermore, we design a consistency regularization strategy to encourage current defense models to learn more from previously trained ones, guiding them to retain more past knowledge and maintain accuracy on clean samples. Extensive experiments have been conducted to verify the efficacy of the proposed SSEAT defense method, which demonstrates superior defense performance and classification accuracy compared to competitors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02270v1-abstract-full').style.display = 'none'; document.getElementById('2412.02270v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACMMM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01203">arXiv:2412.01203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01203">pdf</a>, <a href="https://arxiv.org/format/2412.01203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Domain Adaptive Diabetic Retinopathy Grading with Model Absence and Flowing Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Su%2C+W">Wenxin Su</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Song Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaofeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+X">Xiaojing Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zu%2C+C">Chunxiao Zu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiatian Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01203v1-abstract-short" style="display: inline;"> Domain shift (the difference between source and target domains) poses a significant challenge in clinical applications, e.g., Diabetic Retinopathy (DR) grading. Despite considering certain clinical requirements, like source data privacy, conventional transfer methods are predominantly model-centered and often struggle to prevent model-targeted attacks. 
Abstract: Domain shift (the difference between source and target domains) poses a significant challenge in clinical applications, e.g., Diabetic Retinopathy (DR) grading. Despite considering certain clinical requirements, like source data privacy, conventional transfer methods are predominantly model-centered and often struggle to prevent model-targeted attacks. In this paper, we address a challenging Online Model-aGnostic Domain Adaptation (OMG-DA) setting, driven by the demands of clinical environments. This setting is characterized by the absence of the model and the flow of target data. To tackle the new challenge, we propose a novel approach, Generative Unadversarial ExampleS (GUES), which enables adaptation from a data-centric perspective. Specifically, we first theoretically reformulate conventional perturbation optimization in a generative way: learning a perturbation generation function with a latent input variable. During model instantiation, we leverage a Variational AutoEncoder to express this function. The encoder with the reparameterization trick predicts the latent input, whilst the decoder is responsible for the generation. Furthermore, the saliency map is selected as the pseudo-perturbation label, because it not only captures potential lesions but also theoretically provides an upper bound on the function input, enabling the identification of the latent variable. Extensive comparative experiments on DR benchmarks with both frozen pre-trained models and trainable models demonstrate the superiority of GUES, showing robustness even with a small batch size.
Submitted 2 December, 2024; originally announced December 2024.

10. arXiv:2412.01095 [pdf, other] cs.AI cs.CV cs.LG
Title: VERA: Explainable Video Anomaly Detection via Verbalized Learning of Vision-Language Models
Authors: Muchao Ye, Weiyang Liu, Pan He
Abstract: The rapid advancement of vision-language models (VLMs) has established a new paradigm in video anomaly detection (VAD): leveraging VLMs to simultaneously detect anomalies and provide comprehensible explanations for the decisions. Existing work in this direction often assumes the complex reasoning required for VAD exceeds the capabilities of pretrained VLMs. Consequently, these approaches either incorporate specialized reasoning modules during inference or rely on instruction tuning datasets through additional training to adapt VLMs for VAD. However, such strategies often incur substantial computational costs or data annotation overhead. To address these challenges in explainable VAD, we introduce a verbalized learning framework named VERA that enables VLMs to perform VAD without model parameter modifications. Specifically, VERA automatically decomposes the complex reasoning required for VAD into reflections on simpler, more focused guiding questions capturing distinct abnormal patterns. It treats these reflective questions as learnable parameters and optimizes them through data-driven verbal interactions between learner and optimizer VLMs, using coarsely labeled training data. During inference, VERA embeds the learned questions into model prompts to guide VLMs in generating segment-level anomaly scores, which are then refined into frame-level scores via the fusion of scene and temporal contexts. Experimental results on challenging benchmarks demonstrate that the learned questions of VERA are highly adaptable, significantly improving both detection performance and explainability of VLMs for VAD.
Submitted 1 December, 2024; originally announced December 2024.

11. arXiv:2412.00115 [pdf, other] cs.CV
Title: OpenHumanVid: A Large-Scale High-Quality Dataset for Enhancing Human-Centric Video Generation
Authors: Hui Li, Mingwang Xu, Yun Zhan, Shan Mu, Jiaye Li, Kaihui Cheng, Yuxuan Chen, Tan Chen, Mao Ye, Jingdong Wang, Siyu Zhu
Abstract: Recent advancements in visual generation technologies have markedly increased the scale and availability of video datasets, which are crucial for training effective video generation models. However, a significant lack of high-quality, human-centric video datasets presents a challenge to progress in this field. To bridge this gap, we introduce OpenHumanVid, a large-scale and high-quality human-centric video dataset characterized by precise and detailed captions that encompass both human appearance and motion states, along with supplementary human motion conditions, including skeleton sequences and speech audio. To validate the efficacy of this dataset and the associated training strategies, we propose an extension of existing classical diffusion transformer architectures and conduct further pretraining of our models on the proposed dataset. Our findings yield two critical insights: First, the incorporation of a large-scale, high-quality dataset substantially enhances evaluation metrics for generated human videos while preserving performance in general video generation tasks. Second, the effective alignment of text with human appearance, human motion, and facial motion is essential for producing high-quality video outputs. Based on these insights and corresponding methodologies, the straightforward extended network trained on the proposed dataset demonstrates an obvious improvement in the generation of human-centric videos. Project page: https://fudan-generative-vision.github.io/OpenHumanVid
Submitted 4 January, 2025; v1 submitted 28 November, 2024; originally announced December 2024.
Comments: 11 pages, 8 figures, 5 tables

12. arXiv:2411.15233 [pdf, other] eess.IV cs.CV
Title: Learning Volumetric Neural Deformable Models to Recover 3D Regional Heart Wall Motion from Multi-Planar Tagged MRI
Authors: Meng Ye, Bingyu Xin, Bangwei Guo, Leon Axel, Dimitris Metaxas
However, accurate recovery of the 3D true heart wall motion from a set of 2D apparent motion cues is challenging, due to incomplete sampling of the true motion and difficulty in information fusion from apparent motion cues observed on multiple imaging planes. To solve these challenges, we introduce a novel clas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15233v2-abstract-full').style.display = 'inline'; document.getElementById('2411.15233v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15233v2-abstract-full" style="display: none;"> Multi-planar tagged MRI is the gold standard for regional heart wall motion evaluation. However, accurate recovery of the 3D true heart wall motion from a set of 2D apparent motion cues is challenging, due to incomplete sampling of the true motion and difficulty in information fusion from apparent motion cues observed on multiple imaging planes. To solve these challenges, we introduce a novel class of volumetric neural deformable models ($\upsilon$NDMs). Our $\upsilon$NDMs represent heart wall geometry and motion through a set of low-dimensional global deformation parameter functions and a diffeomorphic point flow regularized local deformation field. To learn such global and local deformation for 2D apparent motion mapping to 3D true motion, we design a hybrid point transformer, which incorporates both point cross-attention and self-attention mechanisms. While use of point cross-attention can learn to fuse 2D apparent motion cues into material point true motion hints, point self-attention hierarchically organised as an encoder-decoder structure can further learn to refine these hints and map them into 3D true motion. We have performed experiments on a large cohort of synthetic 3D regional heart wall motion dataset. The results demonstrated the high accuracy of our method for the recovery of dense 3D true motion from sparse 2D apparent motion cues. Project page is at https://github.com/DeepTag/VolumetricNeuralDeformableModels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15233v2-abstract-full').style.display = 'none'; document.getElementById('2411.15233v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13076">arXiv:2411.13076</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13076">pdf</a>, <a href="https://arxiv.org/format/2411.13076">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hints of Prompt: Enhancing Visual Representation for Multimodal LLMs in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhanning Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Maosheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhili Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+T">Tongyi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H">Honggang Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13076v1-abstract-short" style="display: inline;"> In light of the dynamic nature of autonomous driving environments and stringent safety requirements, general MLLMs combined with CLIP alone often struggle to represent driving-specific scenarios accurately, particularly in complex interactions and long-tail cases. To address this, we propose the Hints of Prompt (HoP) framework, which introduces three key enhancements: Affinity hint to emphasize in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13076v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13076v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13076v1-abstract-full" style="display: none;"> In light of the dynamic nature of autonomous driving environments and stringent safety requirements, general MLLMs combined with CLIP alone often struggle to represent driving-specific scenarios accurately, particularly in complex interactions and long-tail cases. To address this, we propose the Hints of Prompt (HoP) framework, which introduces three key enhancements: Affinity hint to emphasize instance-level structure by strengthening token-wise connections, Semantic hint to incorporate high-level information relevant to driving-specific cases, such as complex interactions among vehicles and traffic signs, and Question hint to align visual features with the query context, focusing on question-relevant regions. These hints are fused through a Hint Fusion module, enriching visual representations and enhancing multimodal reasoning for autonomous driving VQA tasks. Extensive experiments confirm the effectiveness of the HoP framework, showing it significantly outperforms previous state-of-the-art methods across all key metrics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13076v1-abstract-full').style.display = 'none'; document.getElementById('2411.13076v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10928">arXiv:2411.10928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10928">pdf</a>, <a href="https://arxiv.org/format/2411.10928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learn from Downstream and Be Yourself in Multimodal Large Language Model Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wenke Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jian Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zekun Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+D">Didi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+G">Guancheng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">He Li</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mang Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10928v1-abstract-short" style="display: inline;"> Multimodal Large Language Model (MLLM) have demonstrated strong generalization capabilities across diverse distributions and tasks, largely due to extensive pre-training datasets. Fine-tuning MLLM has become a common practice to improve performance on specific downstream tasks. However, during fine-tuning, MLLM often faces the risk of forgetting knowledge acquired during pre-training, which can re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10928v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10928v1-abstract-full" style="display: none;"> Multimodal Large Language Model (MLLM) have demonstrated strong generalization capabilities across diverse distributions and tasks, largely due to extensive pre-training datasets. Fine-tuning MLLM has become a common practice to improve performance on specific downstream tasks. However, during fine-tuning, MLLM often faces the risk of forgetting knowledge acquired during pre-training, which can result in a decline in generalization abilities. 
arXiv:2411.10484 [cs.HC, cs.DS] https://arxiv.org/abs/2411.10484
Title: iFlow: An Interactive Max-Flow/Min-Cut Algorithms Visualizer
Authors: Muyang Ye, Tianrui Xia, Tianxin Zu, Qian Wang, David Kempe
Abstract: The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate and beginning graduate algorithms classes. However, these algorithms -- and in particular the so-called residual graphs they utilize -- often pose significant challenges for students. To help students achieve a deeper understanding, we developed iFlow, an interactive visualization tool for the Ford-Fulkerson Algorithm and its variants. iFlow lets users design or import flow networks, and execute the algorithm by hand. In particular, the user can select an augmentation path and amount, and then update the residual graph. The user is given detailed feedback on mistakes, and can also have iFlow auto-complete each step, to use it as a demonstration tool while still in the initial learning stages. iFlow has been made publicly available and open-sourced. We deployed iFlow in an undergraduate algorithms class, and collected students' self-reported learning benefits via an optional survey. All respondents considered the tool at least somewhat useful and engaging, with most rating it either as useful/engaging or very useful/engaging. Students also generally reported a significant increase in understanding of the algorithm.
Submitted 13 November, 2024; originally announced November 2024.
Comments: This paper is accepted by SIGCSE 2025 TS. Due to the page limit we can not include the appendix in the SIGCSE version. So we decide to include them on arXiv so that the SIGCSE version can point to the arXiv version. Since the final SIGCSE version is due by Nov. 17, it would be really helpful if this submission can go online as soon as possible. Thanks!
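Since the entry above centers on the Ford-Fulkerson algorithm and its residual graphs, a minimal Edmonds-Karp-style sketch may help fix the bookkeeping that the tool visualizes; this is generic textbook code, not code from iFlow itself.

```python
from collections import deque

def max_flow(capacity, source, sink):
    """Edmonds-Karp (BFS-based Ford-Fulkerson).
    `capacity` is a dict of dicts: capacity[u][v] = edge capacity.
    Residual capacities are maintained in place; reverse edges start at 0."""
    residual = {u: dict(nbrs) for u, nbrs in capacity.items()}
    for u in capacity:
        for v in capacity[u]:
            residual.setdefault(v, {}).setdefault(u, 0)

    flow = 0
    while True:
        # BFS for a shortest augmenting path in the residual graph.
        parent = {source: None}
        queue = deque([source])
        while queue and sink not in parent:
            u = queue.popleft()
            for v, cap in residual[u].items():
                if cap > 0 and v not in parent:
                    parent[v] = u
                    queue.append(v)
        if sink not in parent:
            return flow  # no augmenting path left: the flow is maximum

        # Bottleneck capacity along the augmenting path.
        path, v = [], sink
        while parent[v] is not None:
            path.append((parent[v], v))
            v = parent[v]
        bottleneck = min(residual[u][v] for u, v in path)

        # Augment: decrease forward residual capacities, increase reverse ones.
        for u, v in path:
            residual[u][v] -= bottleneck
            residual[v][u] += bottleneck
        flow += bottleneck
```

For instance, max_flow({'s': {'a': 3, 'b': 2}, 'a': {'t': 2}, 'b': {'t': 3}}, 's', 't') evaluates to 4.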
arXiv:2411.05569 [cs.HC] https://arxiv.org/abs/2411.05569
DOI: 10.1145/3701571.3703381 (https://doi.org/10.1145/3701571.3703381)
Title: The Framework of NAVIS: Navigating Virtual Spaces with Immersive Scooters
Authors: Zhixun Lin, Wei He, Xinyi Liu, Mingchen Ye, Xiang Li, Ge Lin Kan
Abstract: Virtual reality (VR) environments have greatly expanded opportunities for immersive exploration, yet physically navigating these digital spaces remains a significant challenge. In this paper, we present the conceptual framework of NAVIS (Navigating Virtual Spaces with Immersive Scooters), a novel system that utilizes a scooter-based interface to enhance both navigation and interaction within virtual environments. NAVIS combines real-time physical mobility, haptic feedback, and CAVE-like (Cave Automatic Virtual Environment) technology to create a realistic sense of travel and movement, improving both spatial awareness and the overall immersive experience. By offering a more natural and physically engaging method of exploration, NAVIS addresses key limitations found in traditional VR locomotion techniques, such as teleportation or joystick control, which can detract from immersion and realism. This approach highlights the potential of combining physical movement with virtual environments to provide a more intuitive and enjoyable experience for users, opening up new possibilities for applications in gaming, education, and beyond.
Submitted 8 November, 2024; originally announced November 2024.
Journal ref: International Conference on Mobile and Ubiquitous Multimedia 2024

arXiv:2411.04469 [cs.CV] https://arxiv.org/abs/2411.04469
Title: FreeCap: Hybrid Calibration-Free Motion Capture in Open Environments
Authors: Aoru Xue, Yiming Ren, Zining Song, Mao Ye, Xinge Zhu, Yuexin Ma
Abstract: We propose a novel hybrid calibration-free method FreeCap to accurately capture global multi-person motions in open environments. Our system combines a single LiDAR with expandable moving cameras, allowing for flexible and precise motion estimation in a unified world coordinate system. In particular, we introduce a local-to-global pose-aware cross-sensor human-matching module that predicts the alignment among each sensor, even in the absence of calibration. Additionally, our coarse-to-fine sensor-expandable pose optimizer further refines the 3D human key points and the alignments; it is also capable of incorporating additional cameras to enhance accuracy. Extensive experiments on the Human-M3 and FreeMotion datasets demonstrate that our method significantly outperforms state-of-the-art single-modal methods, offering an expandable and efficient solution for multi-person motion capture across various applications.
Submitted 10 February, 2025; v1 submitted 7 November, 2024; originally announced November 2024.

arXiv:2410.23231 [cs.CV] https://arxiv.org/abs/2410.23231
Title: LGU-SLAM: Learnable Gaussian Uncertainty Matching with Deformable Correlation Sampling for Deep Visual SLAM
Authors: Yucheng Huang, Luping Ji, Hudong Liu, Mao Ye
Abstract: Deep visual Simultaneous Localization and Mapping (SLAM) techniques, e.g., DROID, have made significant advancements by leveraging deep visual odometry on dense flow fields. In general, they heavily rely on global visual similarity matching. However, the ambiguous similarity interference in uncertain regions can often lead to excessive noise in correspondences, ultimately misleading SLAM in geometric modeling. To address this issue, we propose Learnable Gaussian Uncertainty (LGU) matching, which focuses on precise correspondence construction. In our scheme, a learnable 2D Gaussian uncertainty model is designed to associate matching-frame pairs. It can generate input-dependent Gaussian distributions for each correspondence map. Additionally, a multi-scale deformable correlation sampling strategy is devised to adaptively fine-tune the sampling of each direction by a priori look-up ranges, enabling reliable correlation construction. Furthermore, a KAN-bias GRU component is adopted to improve temporal iterative enhancement, enabling sophisticated spatio-temporal modeling with limited parameters. Extensive experiments on real-world and synthetic datasets validate the effectiveness and superiority of our method.
Submitted 30 October, 2024; originally announced October 2024.

arXiv:2410.23191 [cs.CV] https://arxiv.org/abs/2410.23191
Title: Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI Segmentation
Authors: Meng Ye, Bingyu Xin, Leon Axel, Dimitris Metaxas
Abstract: Current cardiac cine magnetic resonance image (cMR) studies focus on the end diastole (ED) and end systole (ES) phases, while ignoring the abundant temporal information in the whole image sequence. This is because whole-sequence segmentation is currently tedious and inaccurate. Conventional whole-sequence segmentation approaches first estimate the motion field between frames, which is then used to propagate the mask along the temporal axis. However, the mask propagation results can be prone to error, especially for the basal and apex slices, where through-plane motion leads to significant morphology and structural change during the cardiac cycle. Inspired by recent advances in video object segmentation (VOS), based on spatio-temporal memory (STM) networks, we propose a continuous STM (CSTM) network for semi-supervised whole heart and whole sequence cMR segmentation. Our CSTM network takes full advantage of the spatial, scale, temporal and through-plane continuity prior of the underlying heart anatomy structures, to achieve accurate and fast 4D segmentation. Results of extensive experiments across multiple cMR datasets show that our method can improve the 4D cMR segmentation performance, especially for the hard-to-segment regions.
Submitted 31 October, 2024; v1 submitted 30 October, 2024; originally announced October 2024.
Comments: Accepted to WACV 2025
arXiv:2410.21996 [cs.SI] https://arxiv.org/abs/2410.21996
Title: Multi-layer network analysis of deliberation in an online discussion platform: the case of Reddit
Authors: Tianshu Gao, Mengbin Ye, Robert Ackland
Abstract: This paper uses a multi-layer network model to study deliberation in online discussion platforms, focusing on the Reddit platform. The model comprises two layers: a discussion layer, which represents the comment-to-comment replies as a hierarchical tree, and an actor layer, which represents the actor-to-actor reply interactions. The interlayer links represent user-comment ownership. We further propose several different network metrics to characterise the level of deliberation in discussion threads, and apply the model and metrics to a large Reddit dataset containing posts from 72 subreddits focused on different topics. We compare the level of deliberation that occurs on different subreddits, finding that subreddits that are based on geographical regions or focus on sports have the highest levels of deliberation. Analysis of the actor layer reveals several features consistent across all subreddits, such as small-world characteristics and similar numbers of highly active users.
Submitted 29 October, 2024; originally announced October 2024.
Comments: Preprint of journal paper submission
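To make the two-layer structure above concrete, the following sketch builds a discussion layer, an actor layer, and the interlayer ownership links with networkx; the record fields and toy thread are hypothetical, and this is generic illustration code rather than the authors' analysis pipeline.

```python
import networkx as nx

def build_layers(comments):
    """Build the two layers described above from a list of comment records.
    Each record is assumed (hypothetically) to look like
    {"id": "c1", "parent": "c0", "author": "alice"}; a None parent marks the post root."""
    discussion = nx.DiGraph()   # comment-to-comment reply tree
    actor = nx.DiGraph()        # actor-to-actor reply interactions
    ownership = []              # interlayer links: (author, comment id)

    by_id = {c["id"]: c for c in comments}
    for c in comments:
        discussion.add_node(c["id"])
        ownership.append((c["author"], c["id"]))
        parent = c["parent"]
        if parent is not None and parent in by_id:
            discussion.add_edge(c["id"], parent)  # reply points to the comment it answers
            actor.add_edge(c["author"], by_id[parent]["author"])
    return discussion, actor, ownership

# Toy thread: alice posts, bob and carol reply, alice answers bob.
toy = [
    {"id": "c0", "parent": None, "author": "alice"},
    {"id": "c1", "parent": "c0", "author": "bob"},
    {"id": "c2", "parent": "c0", "author": "carol"},
    {"id": "c3", "parent": "c1", "author": "alice"},
]
disc, act, own = build_layers(toy)
print(nx.dag_longest_path_length(disc), act.number_of_edges())  # reply-tree depth, actor links
```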
arXiv:2410.20105 [cs.LG, cs.CR] https://arxiv.org/abs/2410.20105
Title: FedSSP: Federated Graph Learning with Spectral Knowledge and Personalized Preference
Authors: Zihan Tan, Guancheng Wan, Wenke Huang, Mang Ye
Abstract: Personalized Federated Graph Learning (pFGL) facilitates the decentralized training of Graph Neural Networks (GNNs) without compromising privacy while accommodating personalized requirements for non-IID participants. In cross-domain scenarios, structural heterogeneity poses significant challenges for pFGL. Nevertheless, previous pFGL methods incorrectly share non-generic knowledge globally and fail to tailor personalized solutions locally under domain structural shift. We reveal that the spectral nature of graphs can reflect inherent domain structural shifts; correspondingly, our method addresses these shifts by sharing generic spectral knowledge. Moreover, we identify the bias in message-passing schemes for graph structures and propose a personalized preference module. Combining both strategies, we propose our pFGL framework FedSSP, which Shares generic Spectral knowledge while satisfying graph Preferences. Furthermore, we perform extensive experiments on cross-dataset and cross-domain settings to demonstrate the superiority of our framework. The code is available at https://github.com/OakleyTan/FedSSP.
Submitted 26 October, 2024; originally announced October 2024.

arXiv:2410.10573 [cs.CV] https://arxiv.org/abs/2410.10573
Title: Queryable Prototype Multiple Instance Learning with Vision-Language Models for Incremental Whole Slide Image Classification
Authors: Jiaxiang Gou, Luping Ji, Pei Liu, Mao Ye
Abstract: Whole Slide Image (WSI) classification has significant applications in clinical pathology, e.g., tumor identification and cancer diagnosis. Currently, most research attention is focused on Multiple Instance Learning (MIL) using static datasets. One of the most obvious weaknesses of these methods is that they cannot efficiently preserve and utilize previously learned knowledge. Whenever new data arrives, classification models must be re-trained on both the previous and the new data. To overcome this shortcoming and move beyond the traditional vision-only modality, this paper proposes the first Vision-Language-based framework with Queryable Prototype Multiple Instance Learning (QPMIL-VL), specially designed for incremental WSI classification. This framework mainly consists of two information processing branches: one generates bag-level features by prototype-guided aggregation of instance features, while the other enhances class features through a combination of class ensemble, tunable vector and class similarity loss. Experiments on four public WSI datasets demonstrate that our QPMIL-VL framework is effective for incremental WSI classification and often significantly outperforms other compared methods, achieving state-of-the-art (SOTA) performance. Our source code is publicly available at https://github.com/can-can-ya/QPMIL-VL.
Submitted 25 December, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
Comments: Accepted by AAAI 2025

arXiv:2410.08871 [cs.CE] https://arxiv.org/abs/2410.08871
Title: Adaptive optimization of wave energy conversion in oscillatory wave surge converters via SPH simulation and deep reinforcement learning
Authors: Mai Ye, Chi Zhang, Yaru Ren, Ziyuan Liu, Oskar J. Haidn, Xiangyu Hu
Abstract: The nonlinear damping characteristics of the oscillating wave surge converter (OWSC) significantly impact the performance of the power take-off system. This study presents a framework that integrates deep reinforcement learning (DRL) with numerical simulations of OWSCs to identify optimal adaptive damping policies under varying wave conditions, thereby enhancing wave energy harvesting efficiency. Firstly, the open-source multiphysics libraries SPHinXsys and Simbody are employed to establish the numerical environment for wave interaction with OWSCs. Subsequently, a comparative analysis of three DRL algorithms, namely proximal policy optimization (PPO), twin delayed deep deterministic policy gradient (TD3), and soft actor-critic (SAC), is conducted using a two-dimensional (2D) numerical study of an OWSC interacting with regular waves. The results reveal that artificial neural networks capture the nonlinear characteristics of wave-structure interactions and provide efficient PTO policies. Notably, the SAC algorithm demonstrates exceptional robustness and accuracy, achieving a 10.61% improvement in wave energy harvesting. Furthermore, policies trained in the 2D environment are successfully applied to the three-dimensional (3D) study, with an improvement of 22.54% in energy harvesting. Additionally, the study shows that energy harvesting is improved by 6.42% for complex irregular waves. However, for the complex dual OWSC system, optimizing the damping characteristics alone is insufficient to enhance energy harvesting.
Submitted 11 October, 2024; originally announced October 2024.
Comments: 67 pages and 25 figures

arXiv:2410.06977 [cs.CV, cs.AI] https://arxiv.org/abs/2410.06977
Title: Adaptive High-Frequency Transformer for Diverse Wildlife Re-Identification
Authors: Chenyue Li, Shuoyi Chen, Mang Ye
Abstract: Wildlife ReID involves utilizing visual technology to identify specific individuals of wild animals in different scenarios, holding significant importance for wildlife conservation, ecological research, and environmental monitoring. Existing wildlife ReID methods are predominantly tailored to specific species, exhibiting limited applicability. Although some approaches leverage extensively studied person ReID techniques, they struggle to address the unique challenges posed by wildlife. Therefore, in this paper, we present a unified, multi-species general framework for wildlife ReID. Given that high-frequency information is a consistent representation of unique features in various species, significantly aiding in identifying contours and details such as fur textures, we propose the Adaptive High-Frequency Transformer model with the goal of enhancing high-frequency information learning. To mitigate the inevitable high-frequency interference in the wilderness environment, we introduce an object-aware high-frequency selection strategy to adaptively capture more valuable high-frequency components. Notably, we unify the experimental settings of multiple wildlife datasets for ReID, achieving superior performance over state-of-the-art ReID methods. In domain generalization scenarios, our approach demonstrates robust generalization to unknown species.
Submitted 25 October, 2024; v1 submitted 9 October, 2024; originally announced October 2024.
Comments: Accepted by European Conference on Computer Vision (ECCV) 2024
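As a generic illustration of what "high-frequency information" means for an image (and not the paper's adaptive selection strategy), the sketch below isolates high-frequency components with an FFT high-pass mask; the cutoff radius is an arbitrary assumption.

```python
import numpy as np

def high_frequency_component(image, cutoff=8):
    """Return the high-frequency part of a 2D grayscale image by zeroing the
    low-frequency square of half-width `cutoff` around the spectrum centre.
    Purely illustrative of 'high-frequency information'; not the AHFT model."""
    spectrum = np.fft.fftshift(np.fft.fft2(image))
    h, w = image.shape
    cy, cx = h // 2, w // 2
    spectrum[cy - cutoff:cy + cutoff, cx - cutoff:cx + cutoff] = 0  # drop low frequencies
    return np.real(np.fft.ifft2(np.fft.ifftshift(spectrum)))

# Example: a random "texture" image; the result keeps edge- and texture-like detail.
img = np.random.rand(64, 64)
hf = high_frequency_component(img)
```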
arXiv:2410.06663 [cs.SI, eess.SY, math.DS, physics.soc-ph] https://arxiv.org/abs/2410.06663
Title: Data-informed modeling of the formation, persistence, and evolution of social norms and conventions
Authors: Mengbin Ye, Lorenzo Zino
Abstract: Social norms and conventions are commonly accepted and adopted behaviors and practices within a social group that guide interactions -- e.g., how to spell a word or how to greet people -- and are central to a group's culture and identity. Understanding the key mechanisms that govern the formation, persistence, and evolution of social norms and conventions in social communities is a problem of paramount importance for a broad range of real-world applications, spanning from preparedness for future emergencies to promotion of sustainable practices. In the past decades, mathematical modeling has emerged as a powerful tool to reproduce and study the complex dynamics of norm and convention change, gaining insights into their mechanisms, and ultimately deriving tools to predict their evolution. The first goal of this chapter is to introduce some of the main mathematical approaches for modeling social norms and conventions, including population models and agent-based models relying on the theories of dynamical systems, evolutionary dynamics, and game theory. The second goal of the chapter is to illustrate how quantitative observations and empirical data can be incorporated into these mathematical models in a systematic manner, establishing a data-based approach to mathematical modeling of formation, persistence, and evolution of social norms and conventions. Finally, current challenges and future opportunities in this growing field of research are discussed.
Submitted 20 October, 2024; v1 submitted 9 October, 2024; originally announced October 2024.
Comments: This is an author's (preprint) version of a book chapter that is part of the Handbook of Visual, Experimental and Computational Mathematics - Bridges through Data
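A minimal example of the agent-based, game-theoretic modeling style surveyed in the chapter above is best-response dynamics for a two-convention coordination game on a network; the ring network, update rule, and tie-breaking below are illustrative assumptions, not a model taken from the chapter.

```python
import random

def simulate_convention(neighbors, steps=10_000, seed=0):
    """Best-response dynamics for a two-convention coordination game:
    at each step a randomly chosen agent adopts the convention (0 or 1) held by
    the majority of its neighbors (ties keep the current choice)."""
    rng = random.Random(seed)
    state = {agent: rng.randint(0, 1) for agent in neighbors}
    for _ in range(steps):
        agent = rng.choice(list(neighbors))
        counts = [0, 0]
        for nbr in neighbors[agent]:
            counts[state[nbr]] += 1
        if counts[0] != counts[1]:
            state[agent] = 0 if counts[0] > counts[1] else 1
    return state

# Toy ring network of 6 agents; the dynamics typically settle on a shared convention
# or on coexisting local conventions, the kind of outcome such models are used to study.
ring = {i: [(i - 1) % 6, (i + 1) % 6] for i in range(6)}
print(simulate_convention(ring))
```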
To achieve this, we project the original features into a shared space using a mapping network, thereby reducing the bias between the weak and strong features. Meanwhile, weak-features-guided contrastive learning is performed in an alternating weak-to-strong manner. Specifically, we first conduct adaptation-aware prototype-guided clustering on the weak features to generate pseudo labels for the corresponding strong features matched through proposals. Subsequently, we identify positive-negative samples based on the pseudo labels and perform cross-category contrastive learning on the strong features, where an uncertainty estimator encourages adaptive background contrast. Extensive experiments demonstrate that WSCoL yields new state-of-the-art performance, offering a built-in mechanism for mitigating crucial semantics loss in the traditional Mean Teacher framework. The code and data will be released soon.</p>
<p class="is-size-7">Submitted 7 October, 2024; originally announced October 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2410.02220">arXiv:2410.02220</a> [<a href="https://arxiv.org/pdf/2410.02220">pdf</a>, <a href="https://arxiv.org/format/2410.02220">other</a>]</p>
<div class="tags"><span class="tag">cs.CR</span> <span class="tag">cs.AI</span></div>
<p class="title is-5 mathjax">Data to Defense: The Role of Curation in Customizing LLMs Against Jailbreaking Attacks</p>
<p class="authors">Authors: Xiaoqun Liu, Jiacheng Liang, Luoxi Tang, Muchao Ye, Weicheng Ma, Zhaohan Xi</p>
<p class="abstract mathjax">Abstract: Large language models (LLMs) are widely adapted for downstream applications through fine-tuning, a process named customization. However, recent studies have identified a vulnerability during this process, where malicious samples can compromise the robustness of LLMs and amplify harmful behaviors, an attack commonly referred to as jailbreaking. To address this challenge, we propose an adaptive data curation approach allowing any text to be curated to enhance its effectiveness in counteracting harmful samples during customization. To avoid the need for additional defensive modules, we further introduce a comprehensive mitigation framework spanning the lifecycle of the customization process: before customization to immunize LLMs against future jailbreak attempts, during customization to neutralize risks, and after customization to restore compromised models. Experimental results demonstrate a significant reduction in jailbreaking effects, achieving up to a 100% success rate in generating safe responses. By combining adaptive data curation with lifecycle-based mitigation strategies, this work represents a solid step forward in mitigating jailbreaking risks and ensuring the secure adaptation of LLMs.</p>
<p class="is-size-7">Submitted 18 February, 2025; v1 submitted 3 October, 2024; originally announced October 2024.</p>
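<p>For intuition only, the "before customization" stage described above can be pictured as blending curated safety demonstrations into the fine-tuning corpus at a chosen ratio. The sketch below is a generic illustration under assumed file formats, paths, and ratios, not the paper's curation method:</p>
<pre><code class="language-python">
import json
import random


def load_jsonl(path):
    """Read one JSON object (e.g., a prompt/response pair) per line."""
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh]


def build_customization_set(task_path, curated_safety_path, safety_ratio=0.2, seed=0):
    """Mix curated safety examples into the task data at a fixed ratio, so the
    fine-tuned model sees counteracting demonstrations alongside potentially
    untrusted task samples. Paths and the ratio are illustrative assumptions."""
    task = load_jsonl(task_path)              # assumed user-supplied fine-tuning data
    safety = load_jsonl(curated_safety_path)  # assumed curated safe demonstrations
    n_safety = max(1, int(safety_ratio * len(task)))
    rng = random.Random(seed)
    mixed = task + rng.sample(safety, min(n_safety, len(safety)))
    rng.shuffle(mixed)
    return mixed
</code></pre>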
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2410.01144">arXiv:2410.01144</a> [<a href="https://arxiv.org/pdf/2410.01144">pdf</a>, <a href="https://arxiv.org/format/2410.01144">other</a>]</p>
<div class="tags"><span class="tag">cs.CV</span></div>
<p class="title is-5 mathjax">Uncertainty-Guided Enhancement on Driving Perception System via Foundation Models</p>
<p class="authors">Authors: Yunhao Yang, Yuxin Hu, Mao Ye, Zaiwei Zhang, Zhichao Lu, Yi Xu, Ufuk Topcu, Ben Snyder</p>
<p class="abstract mathjax">Abstract: Multimodal foundation models offer promising advancements for enhancing driving perception systems, but their high computational and financial costs pose challenges. We develop a method that leverages foundation models to refine predictions from existing driving perception models -- such as enhancing object classification accuracy -- while minimizing the frequency of using these resource-intensive models. The method quantitatively characterizes uncertainties in the perception model's predictions and engages the foundation model only when these uncertainties exceed a pre-specified threshold. Specifically, it characterizes uncertainty by calibrating the perception model's confidence scores into theoretical lower bounds on the probability of correct predictions using conformal prediction. Then, it sends images to the foundation model and queries for refined predictions only if the theoretical bound on the perception model's outcome is below the threshold. Additionally, we propose a temporal inference mechanism that enhances prediction accuracy by integrating historical predictions, leading to tighter theoretical bounds. The method demonstrates a 10 to 15 percent improvement in prediction accuracy and reduces the number of queries to the foundation model by 50 percent, based on quantitative evaluations on driving datasets.</p>
<p class="is-size-7">Submitted 1 October, 2024; originally announced October 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2410.00868">arXiv:2410.00868</a> [<a href="https://arxiv.org/pdf/2410.00868">pdf</a>, <a href="https://arxiv.org/format/2410.00868">other</a>]</p>
<div class="tags"><span class="tag">cs.LG</span></div>
<p class="title is-5 mathjax">Fine-Grained Gradient Restriction: A Simple Approach for Mitigating Catastrophic Forgetting</p>
<p class="authors">Authors: Bo Liu, Mao Ye, Peter Stone, Qiang Liu</p>
<p class="abstract mathjax">Abstract: A fundamental challenge in continual learning is to balance the trade-off between learning new tasks and remembering previously acquired knowledge. Gradient Episodic Memory (GEM) achieves this balance by utilizing a subset of past training samples to restrict the update direction of the model parameters. In this work, we start by analyzing an often overlooked hyper-parameter in GEM, the memory strength, which boosts the empirical performance by further constraining the update direction. We show that memory strength is effective mainly because it improves GEM's generalization ability and therefore leads to a more favorable trade-off. Motivated by this finding, we propose two approaches that more flexibly constrain the update direction. Our methods are able to achieve uniformly better Pareto frontiers of remembering old and learning new knowledge than using memory strength.
We further propose a computationally efficient method to approximately solve the optimization problem with more constraints.</p>
<p class="is-size-7">Submitted 1 October, 2024; originally announced October 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.20134">arXiv:2409.20134</a> [<a href="https://arxiv.org/pdf/2409.20134">pdf</a>, <a href="https://arxiv.org/format/2409.20134">other</a>]</p>
<div class="tags"><span class="tag">cs.CE</span></div>
<p class="title is-5 mathjax">DRLinSPH: An open-source platform using deep reinforcement learning and SPHinXsys for fluid-structure-interaction problems</p>
<p class="authors">Authors: Mai Ye, Hao Ma, Yaru Ren, Chi Zhang, Oskar J. Haidn, Xiangyu Hu</p>
<p class="abstract mathjax">Abstract: Fluid-structure interaction (FSI) problems are characterized by strong nonlinearities arising from complex interactions between fluids and structures. These pose significant challenges for traditional control strategies in optimizing structural motion, often leading to suboptimal performance. In contrast, deep reinforcement learning (DRL), through agent interactions within numerical simulation environments and the approximation of control policies using deep neural networks (DNNs), has shown considerable promise in addressing high-dimensional FSI problems. Additionally, smoothed particle hydrodynamics (SPH) offers a flexible and efficient computational approach for modeling large deformations, fractures, and complex interface movements inherent in FSI, outperforming traditional grid-based methods. In this work, we present DRLinSPH, an open-source Python platform that integrates the SPH-based numerical environment provided by the open-source software SPHinXsys with the mature DRL platform Tianshou to enable parallel training for FSI problems. DRLinSPH has been successfully applied to four FSI scenarios: sloshing suppression using rigid and elastic baffles, optimization of wave energy capture through an oscillating wave surge converter (OWSC), and muscle-driven fish swimming in vortices. The results demonstrate the platform's accuracy, stability, and scalability, highlighting its potential to advance industrial solutions for complex FSI challenges.</p>
<p class="is-size-7">Submitted 30 September, 2024; originally announced September 2024.</p>
<p class="comments is-size-7">Comments: 68 pages, 31 figures</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.18468">arXiv:2409.18468</a> [<a href="https://arxiv.org/pdf/2409.18468">pdf</a>, <a href="https://arxiv.org/format/2409.18468">other</a>]</p>
<div class="tags"><span class="tag">cs.SE</span></div>
<p class="title is-5 mathjax">SmartReco: Detecting Read-Only Reentrancy via Fine-Grained Cross-DApp Analysis</p>
<p class="authors">Authors: Jingwen Zhang, Zibin Zheng, Yuhong Nan, Mingxi Ye, Kaiwen Ning, Yu Zhang, Weizhe Zhang</p>
<p class="abstract mathjax">Abstract: Despite the increasing popularity of Decentralized Applications (DApps), they suffer from various vulnerabilities that can be exploited by adversaries for profit. Among such vulnerabilities, Read-Only Reentrancy (called ROR in this paper) is an emerging type of vulnerability that arises from the complex interactions between DApps. In the past three years, attack incidents of ROR have already caused around 30M USD in losses to the DApp ecosystem. Existing techniques for vulnerability detection in smart contracts can hardly detect Read-Only Reentrancy attacks, due to the lack of tracking and analysis of the complex interactions between multiple DApps. In this paper, we propose SmartReco, a new framework for detecting Read-Only Reentrancy vulnerabilities in DApps through a novel combination of static and dynamic analysis (i.e., fuzzing) over smart contracts. The key design behind SmartReco is threefold: (1) SmartReco identifies the boundary between different DApps from the heavily coupled cross-contract interactions. (2) SmartReco performs fine-grained static analysis to locate points of interest (i.e., entry functions) that may lead to ROR. (3) SmartReco utilizes on-chain transaction data and performs multi-function fuzzing (i.e., over the entry function and victim function) across different DApps to verify the existence of ROR. Our evaluation on a manually labeled dataset with 45 RORs shows that SmartReco achieves a precision of 88.63% and a recall of 86.36%. In addition, SmartReco successfully detects 43 new RORs from 123 popular DApps. The total assets affected by such RORs reach around 520,000 USD.</p>
<p class="is-size-7">Submitted 9 December, 2024; v1 submitted 27 September, 2024; originally announced September 2024.</p>
<p class="comments is-size-7">Comments: Accepted by ICSE 2025</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.18147">arXiv:2409.18147</a> [<a href="https://arxiv.org/pdf/2409.18147">pdf</a>, <a href="https://arxiv.org/ps/2409.18147">ps</a>, <a href="https://arxiv.org/format/2409.18147">other</a>]</p>
<div class="tags"><span class="tag">cs.CV</span></div>
<p class="title is-5 mathjax">SSP-RACL: Classification of Noisy Fundus Images with Self-Supervised Pretraining and Robust Adaptive Credal Loss</p>
<p class="authors">Authors: Mengwen Ye, Yingzi Huangfu, You Li, Zekuan Yu</p>
<p class="abstract mathjax">Abstract: Fundus image classification is crucial in computer-aided diagnosis tasks, but label noise significantly impairs the performance of deep neural networks. To address this challenge, we propose a robust framework, Self-Supervised Pre-training with Robust Adaptive Credal Loss (SSP-RACL), for handling label noise in fundus image datasets. First, we use Masked Autoencoders (MAE) for pre-training to extract features, a step that is unaffected by label noise. Subsequently, RACL employs a superset learning framework, setting confidence thresholds and an adaptive label relaxation parameter to construct possibility distributions and provide more reliable ground-truth estimates, thus effectively suppressing the memorization effect. Additionally, we introduce clinical knowledge-based asymmetric noise generation to simulate real-world noisy fundus image datasets. Experimental results demonstrate that our proposed method outperforms existing approaches in handling label noise, showing superior performance.</p>
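<p>To unpack the superset-learning idea referenced in this abstract (thresholded predictions relax a possibly noisy label into a set of plausible candidates, and the loss only asks the network to fit some label in that set), here is a generic PyTorch-style sketch; the thresholding rule, the min-over-candidates surrogate, and all names are illustrative assumptions rather than the RACL loss itself:</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F


def candidate_label_sets(probs, observed, threshold=0.3):
    """Relax each (possibly noisy) observed label into a candidate set:
    the observed class plus every class whose predicted probability
    exceeds the confidence threshold."""
    candidates = probs.ge(threshold)
    candidates[torch.arange(len(observed)), observed] = True
    return candidates  # bool tensor of shape (batch, num_classes)


def superset_loss(logits, observed, threshold=0.3):
    """Min-over-candidates cross-entropy: a common superset-learning
    surrogate that avoids memorizing a single (possibly wrong) label."""
    probs = logits.softmax(dim=1).detach()
    candidates = candidate_label_sets(probs, observed, threshold)
    neg_log = -F.log_softmax(logits, dim=1)
    # Exclude non-candidates from the minimum by making them infinitely costly.
    neg_log = neg_log.masked_fill(~candidates, float("inf"))
    return neg_log.min(dim=1).values.mean()


if __name__ == "__main__":
    logits = torch.randn(8, 5, requires_grad=True)
    noisy_labels = torch.randint(0, 5, (8,))
    loss = superset_loss(logits, noisy_labels)
    loss.backward()
    print(float(loss))
</code></pre>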
<p class="is-size-7">Submitted 24 October, 2024; v1 submitted 24 September, 2024; originally announced September 2024.</p>
<p class="comments is-size-7">Comments: IEEE BioCAS 2024</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.15486">arXiv:2409.15486</a> [<a href="https://arxiv.org/pdf/2409.15486">pdf</a>, <a href="https://arxiv.org/format/2409.15486">other</a>]</p>
<div class="tags"><span class="tag">cs.CV</span> <span class="tag">cs.AI</span></div>
<p class="title is-5 mathjax">VLMine: Long-Tail Data Mining with Vision Language Models</p>
<p class="authors">Authors: Mao Ye, Gregory P. Meyer, Zaiwei Zhang, Dennis Park, Siva Karthik Mustikovela, Yuning Chai, Eric M Wolff</p>
<p class="abstract mathjax">Abstract: Ensuring robust performance on long-tail examples is an important problem for many real-world applications of machine learning, such as autonomous driving. This work focuses on the problem of identifying rare examples within a corpus of unlabeled data. We propose a simple and scalable data mining approach that leverages the knowledge contained within a large vision language model (VLM). Our approach utilizes a VLM to summarize the content of an image into a set of keywords, and we identify rare examples based on keyword frequency. We find that the VLM offers a distinct signal for identifying long-tail examples when compared to conventional methods based on model uncertainty. Therefore, we propose a simple and general approach for integrating signals from multiple mining algorithms. We evaluate the proposed method on two diverse tasks: 2D image classification, in which inter-class variation is the primary source of data diversity, and 3D object detection, where intra-class variation is the main concern. Furthermore, through the detection task, we demonstrate that the knowledge extracted from 2D images is transferable to the 3D domain. Our experiments consistently show large improvements (between 10% and 50%) over the baseline techniques on several representative benchmarks: ImageNet-LT, Places-LT, and the Waymo Open Dataset.</p>
<p class="is-size-7">Submitted 23 September, 2024; originally announced September 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.09369">arXiv:2409.09369</a> [<a href="https://arxiv.org/pdf/2409.09369">pdf</a>, <a href="https://arxiv.org/format/2409.09369">other</a>]</p>
<div class="tags"><span class="tag">cs.CV</span></div>
<p class="title is-5 mathjax">Interpretable Vision-Language Survival Analysis with Ordinal Inductive Bias for Computational Pathology</p>
<p class="authors">Authors: Pei Liu, Luping Ji, Jiaxiang Gou, Bo Fu, Mao Ye</p>
<p class="abstract mathjax">Abstract: Histopathology Whole-Slide Images (WSIs) provide an important tool to assess cancer prognosis in computational pathology (CPATH). While existing survival analysis (SA) approaches have made exciting progress, they are generally limited to adopting highly-expressive network architectures and only coarse-grained patient-level labels to learn visual prognostic representations from gigapixel WSIs. Such a learning paradigm suffers from critical performance bottlenecks when facing the scarce training data and the standard multi-instance learning (MIL) framework currently present in CPATH. To overcome this, this paper proposes, for the first time, a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA is driven by pathology VL foundation models; it no longer relies on high-capability networks and shows the advantage of data efficiency. (2) On the vision end, VLSA encodes a textual prognostic prior and then employs it as an auxiliary signal to guide the aggregation of visual prognostic features at the instance level, thereby compensating for the weak supervision in MIL. Moreover, given the characteristics of SA, we propose i) ordinal survival prompt learning to transform continuous survival labels into textual prompts; and ii) the ordinal incidence function as the prediction target to make SA compatible with VL-based prediction. Notably, VLSA's predictions can be interpreted intuitively by our Shapley values-based method. Extensive experiments on five datasets confirm the effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH by offering weakly-supervised MIL an effective means to learn valuable prognostic clues from gigapixel WSIs. Our source code is available at https://github.com/liupei101/VLSA.</p>
<p class="is-size-7">Submitted 11 February, 2025; v1 submitted 14 September, 2024; originally announced September 2024.</p>
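<p>Because the abstract above frames prediction around an incidence function over ordered time intervals, a standard discrete-time survival likelihood helps clarify what such a target looks like. The sketch below is generic survival-analysis boilerplate under assumed tensor names and binning, not VLSA's objective:</p>
<pre><code class="language-python">
import torch


def discrete_survival_nll(hazard_logits, event_bin, observed):
    """Negative log-likelihood for discrete-time survival analysis.

    hazard_logits: (batch, num_bins) logits of the conditional hazards h_k
    event_bin:     (batch,) long, bin index of the event or censoring time
    observed:      (batch,) float, 1.0 if the event was observed, 0.0 if censored
    """
    hazards = torch.sigmoid(hazard_logits)          # h_k = P(event in bin k | alive at k)
    log_surv = torch.log1p(-hazards).cumsum(dim=1)  # log S_k = sum over j ≤ k of log(1 - h_j)
    idx = event_bin.unsqueeze(1)

    log_s_k = log_surv.gather(1, idx).squeeze(1)                             # log S_k
    log_h_k = torch.log(hazards.gather(1, idx).squeeze(1))                   # log h_k
    log_s_prev = log_s_k - torch.log1p(-hazards.gather(1, idx).squeeze(1))   # log S_{k-1}

    # Event observed in bin k: incidence S_{k-1} * h_k; censored in bin k: S_k.
    log_lik = observed * (log_s_prev + log_h_k) + (1.0 - observed) * log_s_k
    return -log_lik.mean()


if __name__ == "__main__":
    logits = torch.randn(4, 10, requires_grad=True)
    bins = torch.tensor([2, 7, 0, 9])
    events = torch.tensor([1.0, 0.0, 1.0, 1.0])
    discrete_survival_nll(logits, bins, events).backward()
</code></pre>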
<p class="comments is-size-7">Comments: Accepted to ICLR 2025</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.08583">arXiv:2409.08583</a> [<a href="https://arxiv.org/pdf/2409.08583">pdf</a>, <a href="https://arxiv.org/format/2409.08583">other</a>]</p>
<div class="tags"><span class="tag">cs.SD</span> <span class="tag">cs.AI</span> <span class="tag">eess.AS</span></div>
<p class="title is-5 mathjax">LHQ-SVC: Lightweight and High Quality Singing Voice Conversion Modeling</p>
<p class="authors">Authors: Yubo Huang, Xin Lai, Muyang Ye, Anran Zhu, Zixi Wang, Jingzehua Xu, Shuai Zhang, Zhiyuan Zhou, Weijie Niu</p>
<p class="abstract mathjax">Abstract: Singing Voice Conversion (SVC) has emerged as a significant subfield of Voice Conversion (VC), enabling the transformation of one singer's voice into another while preserving musical elements such as melody, rhythm, and timbre. Traditional SVC methods have limitations in terms of audio quality, data requirements, and computational complexity. In this paper, we propose LHQ-SVC, a lightweight, CPU-compatible model based on the SVC framework and a diffusion model, designed to reduce model size and computational demand without sacrificing performance. We incorporate features to improve inference quality, and optimize for CPU execution by using performance tuning tools and parallel computing frameworks. Our experiments demonstrate that LHQ-SVC maintains competitive performance, with significant improvements in processing speed and efficiency across different devices. The results suggest that LHQ-SVC can meet …</p>
<p class="is-size-7">Submitted 17 January, 2025; v1 submitted 13 September, 2024; originally announced September 2024.</p>
<p class="comments is-size-7">Comments: Accepted by ICASSP 2025</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.05888">arXiv:2409.05888</a> [<a href="https://arxiv.org/pdf/2409.05888">pdf</a>]</p>
<div class="tags"><span class="tag">cs.NI</span> <span class="tag">cs.AI</span></div>
<p class="title is-5 mathjax">MA-CDMR: An Intelligent Cross-domain Multicast Routing Method based on Multiagent Deep Reinforcement Learning in Multi-domain SDWN</p>
<p class="authors">Authors: Miao Ye, Hongwen Hu, Xiaoli Wang, Yuping Wang, Yong Wang, Wen Peng, Jihao Zheng</p>
<p class="abstract mathjax">Abstract: The cross-domain multicast routing problem in a software-defined wireless network (SDWN) with multiple controllers is a classic NP-hard optimization problem. As the network size increases, designing and implementing cross-domain multicast routing paths requires not only efficient solution algorithms to obtain the optimal cross-domain multicast tree but also the timely and flexible acquisition and maintenance of global network state information. However, existing solutions have a limited ability to sense the network traffic state, affecting the quality of service of multicast services. In addition, these methods have difficulty adapting to highly dynamic network states and have slow convergence speeds. To this end, this paper aims to design and implement a multiagent deep reinforcement learning-based cross-domain multicast routing method for SDWN with multiple controller domains. First, a multicontroller communication mechanism and a multicast group management module are designed to transfer and synchronize network information between different control domains of the SDWN, thus effectively managing the joining and classification of members in the cross-domain multicast group. Second, a theoretical analysis and proof show that the optimal cross-domain multicast tree includes an interdomain multicast tree and an intradomain multicast tree. An agent is established for each controller, and a cooperation mechanism between multiple agents is designed to effectively optimize cross-domain multicast routing and ensure consistency and validity in the representation of network state information for cross-domain multicast routing decisions. Third, a multiagent reinforcement learning-based method that combines online and offline training is designed to reduce the dependence on the real-time environment and increase the convergence speed of multiple agents.</p>
<p class="is-size-7">Submitted 11 September, 2024; v1 submitted 27 August, 2024; originally announced September 2024.</p>
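<p>For background on the multicast-tree subproblem mentioned above (and independent of the paper's multiagent DRL solution), a classical baseline is to approximate the tree by merging shortest paths from the source to each group member; the optimal tree is a Steiner tree, which is NP-hard to compute exactly. The topology, edge weights, and function names below are illustrative assumptions:</p>
<pre><code class="language-python">
import networkx as nx


def shortest_path_multicast_tree(graph, source, members, weight="delay"):
    """Approximate a multicast tree by merging the shortest paths from the
    source to each group member (a standard baseline heuristic)."""
    tree = nx.Graph()
    for member in members:
        path = nx.shortest_path(graph, source, member, weight=weight)
        nx.add_path(tree, path)
    # Carry the original edge weights over onto the tree edges.
    for u, v in tree.edges:
        tree[u][v][weight] = graph[u][v][weight]
    return tree


if __name__ == "__main__":
    g = nx.Graph()
    g.add_weighted_edges_from(
        [(0, 1, 1.0), (1, 2, 1.0), (1, 3, 2.0), (2, 4, 1.0), (3, 4, 1.0)],
        weight="delay",
    )
    t = shortest_path_multicast_tree(g, source=0, members=[3, 4])
    print(sorted(t.edges))
</code></pre>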
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2409.01710">arXiv:2409.01710</a> [<a href="https://arxiv.org/pdf/2409.01710">pdf</a>, <a href="https://arxiv.org/format/2409.01710">other</a>]</p>
<div class="tags"><span class="tag">cs.MM</span></div>
<p class="title is-5 mathjax">Privacy-Preserving Multimedia Mobile Cloud Computing Using Protective Perturbation</p>
<p class="authors">Authors: Zhongze Tang, Mengmei Ye, Yao Liu, Sheng Wei</p>
<p class="abstract mathjax">Abstract: Mobile cloud computing has been adopted in many multimedia applications, where the resource-constrained mobile device sends multimedia data (e.g., images) to remote cloud servers to request computation-intensive multimedia services (e.g., image recognition). While significantly improving the performance of the mobile applications, the cloud-based mechanism often causes privacy concerns as the multimedia data and services are offloaded from the trusted user device to untrusted cloud servers. Several recent studies have proposed perturbation-based privacy-preserving mechanisms, which obfuscate the offloaded multimedia data to eliminate privacy exposures without affecting the functionality of the remote multimedia services. However, the existing privacy protection approaches require the deployment of computation-intensive perturbation generation on the resource-constrained mobile devices. Also, the obfuscated images are typically not compliant with standard image compression algorithms and suffer from significant bandwidth consumption. In this paper, we develop a novel privacy-preserving multimedia mobile cloud computing framework, namely $PMC^2$, to address the resource and bandwidth challenges. $PMC^2$ employs secure confidential computing in the cloud to deploy the perturbation generator, which addresses the resource challenge while maintaining privacy. Furthermore, we develop a neural compressor specifically trained to compress the perturbed images in order to address the bandwidth challenge. We implement $PMC^2$ in an end-to-end mobile cloud computing system, based on which our evaluations demonstrate the superior latency, power efficiency, and bandwidth consumption achieved by $PMC^2$ while maintaining high accuracy in the target multimedia service.</p>
<p class="is-size-7">Submitted 3 September, 2024; originally announced September 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2408.17073">arXiv:2408.17073</a> [<a href="https://arxiv.org/pdf/2408.17073">pdf</a>, <a href="https://arxiv.org/format/2408.17073">other</a>]</p>
<div class="tags"><span class="tag">eess.IV</span> <span class="tag">cs.CV</span></div>
<p class="title is-5 mathjax">Approximately Invertible Neural Network for Learned Image Compression</p>
<p class="authors">Authors: Yanbo Gao, Meng Fu, Shuai Li, Chong Lv, Xun Cai, Hui Yuan, Mao Ye</p>
<p class="abstract mathjax">Abstract: Learned image compression has attracted considerable interest in recent years. It typically comprises an analysis transform, a synthesis transform, quantization and an entropy coding model. The analysis transform and synthesis transform are used to encode an image into a latent feature and decode the quantized feature to reconstruct the image, and can be regarded as coupled transforms. However, the analysis transform and synthesis transform are designed independently in the existing methods, making them unreliable in high-quality image compression.
Inspired by the invertible neural networks in generative modeling, invertible modules are used to construct the coupled analysis and synthesis transforms. Considering that the noise introduced by feature quantization invalidates the invertible process, this paper proposes an Approximately Invertible Neural Network (A-INN) framework for learned image compression. It formulates the rate-distortion optimization in lossy image compression when using an INN with quantization, which differentiates it from using INNs for generative modelling. Generally speaking, A-INN can be used as the theoretical foundation for any INN-based lossy compression method. Based on this formulation, A-INN with a progressive denoising module (PDM) is developed to effectively reduce the quantization noise in decoding. Moreover, a Cascaded Feature Recovery Module (CFRM) is designed to learn high-dimensional feature recovery from low-dimensional ones to further reduce the noise in feature channel compression. In addition, a Frequency-enhanced Decomposition and Synthesis Module (FDSM) is developed by explicitly enhancing the high-frequency components in an image to address the loss of high-frequency information inherent in neural network-based image compression. Extensive experiments demonstrate that the proposed A-INN outperforms existing learned image compression methods.</p>
<p class="is-size-7">Submitted 30 August, 2024; originally announced August 2024.</p> </li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2408.12795">arXiv:2408.12795</a> [<a href="https://arxiv.org/pdf/2408.12795">pdf</a>, <a href="https://arxiv.org/format/2408.12795">other</a>]</p>
<div class="tags"><span class="tag">cs.SI</span> <span class="tag">cs.MA</span> <span class="tag">physics.soc-ph</span></div>
<p class="title is-5 mathjax">From Mobilisation to Radicalisation: Probing the Persistence and Radicalisation of Social Movements Using an Agent-Based Model</p>
<p class="authors">Authors: Emma F. Thomas, Mengbin Ye, Simon D. Angus, Tony J. Mathew, Winnifred Louis, Liam Walsh, Silas Ellery, Morgana Lizzio-Wilson, Craig McGarty</p>
<p class="abstract mathjax">Abstract: We are living in an age of protest. Although we have an excellent understanding of the factors that predict participation in protest, we understand little about the conditions that foster a sustained (versus transient) movement. How do interactions between supporters and authorities combine to influence whether and how people engage (i.e., using conventional or radical tactics)? This paper introduces a novel, theoretically founded and empirically informed agent-based model (DIMESim) to address these questions. We model the complex interactions between the psychological attributes of the protesters (agents), the authority at whom the protests are targeted, and the environment that allows protesters to coordinate with each other -- over time, and at a population scale. Where an authority is responsive and failure is contested, a modest-sized conventional movement endured. Where authorities repeatedly and incontrovertibly fail the movement, the population disengaged from action but evidenced an ongoing commitment to radicalism (latent radicalism).</p>
<p class="is-size-7">Submitted 22 August, 2024; originally announced August 2024.</p>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Initial submission version of journal paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12127">arXiv:2408.12127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12127">pdf</a>, <a href="https://arxiv.org/ps/2408.12127">ps</a>, <a href="https://arxiv.org/format/2408.12127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.chaos.2024.115935">10.1016/j.chaos.2024.115935 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An evidence-accumulating drift-diffusion model of competing information spread on networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Corsin%2C+J">Julien Corsin</a>, <a href="/search/cs?searchtype=author&amp;query=Zino%2C+L">Lorenzo Zino</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+M">Mengbin Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12127v1-abstract-short" style="display: inline;"> In this paper, we propose an agent-based model of information spread, grounded on psychological insights on the formation and spread of beliefs. In our model, we consider a network of individuals who share two opposing types of information on a specific topic (e.g., pro- vs. anti-vaccine stances), and the accumulation of evidence supporting either type of information is modelled by means of a drif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12127v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12127v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12127v1-abstract-full" style="display: none;"> In this paper, we propose an agent-based model of information spread, grounded on psychological insights on the formation and spread of beliefs. In our model, we consider a network of individuals who share two opposing types of information on a specific topic (e.g., pro- vs. anti-vaccine stances), and the accumulation of evidence supporting either type of information is modelled by means of a drift-diffusion process. After formalising the model, we put forward a campaign of Monte Carlo simulations to identify population-wide behaviours emerging from agents&#39; exposure to different sources of information, investigating the impact of the number and persistence of such sources, and the role of the network structure through which the individuals interact. We find similar emergent behaviours for all network structures considered. When there is a single type of information, the main observed emergent behaviour is consensus. 
arXiv:2408.12127  [physics.soc-ph (Physics and Society), cs.SI (Social and Information Networks)]  https://arxiv.org/abs/2408.12127
DOI: 10.1016/j.chaos.2024.115935
Title: An evidence-accumulating drift-diffusion model of competing information spread on networks
Authors: Julien Corsin, Lorenzo Zino, Mengbin Ye
Abstract: In this paper, we propose an agent-based model of information spread, grounded in psychological insights into the formation and spread of beliefs. In our model, we consider a network of individuals who share two opposing types of information on a specific topic (e.g., pro- vs. anti-vaccine stances), and the accumulation of evidence supporting either type of information is modelled by means of a drift-diffusion process. After formalising the model, we put forward a campaign of Monte Carlo simulations to identify population-wide behaviours emerging from agents' exposure to different sources of information, investigating the impact of the number and persistence of such sources, and the role of the network structure through which the individuals interact. We find similar emergent behaviours for all network structures considered. When there is a single type of information, the main observed emergent behaviour is consensus. When there are opposing information sources, either consensus or polarisation can result; the latter occurs if the number and persistence of the sources exceed some threshold values. Importantly, we find the emergent behaviour is influenced mainly by how long the information sources are present, as opposed to how many sources there are.
Submitted 22 August, 2024; originally announced August 2024.
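The core mechanism of the abstract above, evidence accumulation via a drift-diffusion process on a network, can be illustrated with a short simulation. The coupling rule (drift toward the mean stance of neighbours plus persistent external sources) and all parameter values are assumptions made for this sketch rather than the paper's exact formalisation.

    # Hypothetical sketch of a drift-diffusion evidence-accumulation update on a
    # network; the coupling rule and parameters are illustrative assumptions, not
    # the exact formalisation of arXiv:2408.12127.
    import numpy as np

    def simulate(adj, sources, steps=2000, dt=0.01, drift=0.5, noise=0.8, seed=0):
        """adj: (n, n) 0/1 adjacency matrix; sources: dict {node: +1 or -1 stance}."""
        rng = np.random.default_rng(seed)
        n = adj.shape[0]
        x = np.zeros(n)                            # accumulated evidence per individual
        deg = np.maximum(adj.sum(axis=1), 1)
        for _ in range(steps):
            neighbour_mean = adj @ np.sign(x) / deg        # mean stance of neighbours
            for node, stance in sources.items():           # persistent external sources
                neighbour_mean[node] += stance
            x += drift * neighbour_mean * dt + noise * np.sqrt(dt) * rng.standard_normal(n)
        return np.sign(x)                          # +1 / -1 belief, 0 if undecided

    # Toy run on a ring of 10 individuals with one source of each type.
    ring = np.zeros((10, 10), int)
    for i in range(10):
        ring[i, (i + 1) % 10] = ring[(i + 1) % 10, i] = 1
    print(simulate(ring, sources={0: +1, 5: -1}))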
arXiv:2408.06567  [cs.CL (Computation and Language), cs.AI (Artificial Intelligence)]  https://arxiv.org/abs/2408.06567
Title: AquilaMoE: Efficient Training for MoE Models with Scale-Up and Scale-Out Strategies
Authors: Bo-Wen Zhang, Liangdong Wang, Ye Yuan, Jijie Li, Shuhao Gu, Mengdi Zhao, Xinya Wu, Guang Liu, Chengwei Wu, Hanyu Zhao, Li Du, Yiming Ju, Quanyue Ma, Yulong Ao, Yingli Zhao, Songhe Zhu, Zhou Cao, Dong Liang, Yonghua Lin, Ming Zhang, Shunfei Wang, Yanxin Zhou, Min Ye, Xuekai Chen, Xinyang Yu, et al. (2 additional authors not shown)
Abstract: In recent years, with the rapid application of large language models across various fields, the scale of these models has gradually increased, and the resources required for their pre-training have grown exponentially. Training an LLM from scratch costs substantial computational resources, whereas scaling up from a smaller model is a more efficient approach and has thus attracted significant attention. In this paper, we present AquilaMoE, a cutting-edge bilingual 8*16B Mixture of Experts (MoE) language model that has 8 experts with 16 billion parameters each and is developed using an innovative training methodology called EfficientScale. This approach optimizes performance while minimizing data requirements through a two-stage process. The first stage, termed Scale-Up, initializes the larger model with weights from a pre-trained smaller model, enabling substantial knowledge transfer and continuous pretraining with significantly less data. The second stage, Scale-Out, uses a pre-trained dense model to initialize the MoE experts, further enhancing knowledge transfer and performance. Extensive validation experiments on 1.8B and 7B models compared various initialization schemes, yielding models that maintain low loss and continue to reduce it during continuous pretraining. Utilizing the optimal scheme, we successfully trained a 16B model and subsequently the 8*16B AquilaMoE model, demonstrating significant improvements in performance and training efficiency.
Submitted 12 August, 2024; originally announced August 2024.
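The Scale-Out idea described above, initializing MoE experts from a pre-trained dense model, can be sketched in a few lines of PyTorch. The FFN layout, the small perturbation added to each cloned expert, and the freshly initialized router are illustrative assumptions, not the actual AquilaMoE/EfficientScale recipe.

    # Minimal sketch of the "dense -> MoE experts" initialization idea (Scale-Out).
    # Module layout and the perturbation are illustrative assumptions only.
    import copy
    import torch
    import torch.nn as nn

    class FFN(nn.Module):
        def __init__(self, d_model=512, d_ff=2048):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(),
                                     nn.Linear(d_ff, d_model))

        def forward(self, x):
            return self.net(x)

    def scale_out(dense_ffn, num_experts=8, noise_std=1e-3, d_model=512):
        """Create MoE experts by cloning a pre-trained dense FFN, with tiny noise
        so the experts can diverge during continued pretraining."""
        experts = nn.ModuleList()
        for _ in range(num_experts):
            expert = copy.deepcopy(dense_ffn)
            with torch.no_grad():
                for p in expert.parameters():
                    p.add_(noise_std * torch.randn_like(p))
            experts.append(expert)
        router = nn.Linear(d_model, num_experts)   # gating network, trained from scratch
        return experts, router

    experts, router = scale_out(FFN())
    print(len(experts))  # 8 experts initialized from one dense FFN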
arXiv:2407.15354  [cs.CV (Computer Vision and Pattern Recognition), cs.RO (Robotics)]  https://arxiv.org/abs/2407.15354
Title: Learning High-resolution Vector Representation from Multi-Camera Images for 3D Object Detection
Authors: Zhili Chen, Shuangjie Xu, Maosheng Ye, Zian Qian, Xiaoyi Zou, Dit-Yan Yeung, Qifeng Chen
Abstract: The Bird's-Eye-View (BEV) representation is a critical factor that directly impacts the 3D object detection performance, but the traditional BEV grid representation induces quadratic computational cost as the spatial resolution grows. To address this limitation, we present a new camera-based 3D object detector with a high-resolution vector representation: VectorFormer. The high-resolution vector representation is combined with the lower-resolution BEV representation to efficiently exploit 3D geometry from multi-camera images at a high resolution through our two novel modules: vector scattering and gathering. In this way, the learned vector representation, with richer scene contexts, can serve as the decoding query for final predictions. We conduct extensive experiments on the nuScenes dataset and demonstrate state-of-the-art performance in NDS and inference time. Furthermore, we investigate query-BEV-based methods incorporating our proposed vector representation and observe a consistent performance improvement.
Submitted 21 July, 2024; originally announced July 2024.
Comments: Accepted to ECCV 2024. Project page: https://github.com/zlichen/VectorFormer

arXiv:2407.11794  [cs.SI (Social and Information Networks)]  https://arxiv.org/abs/2407.11794
Title: What's in a Niche? Migration Patterns in Online Communities
Authors: Katherine Van Koevering, Meryl Ye, Jon Kleinberg
Abstract: Broad topics in online platforms represent a type of meso-scale between individual user-defined communities and the whole platform; they typically consist of related communities that address different facets of a shared topic. Users often engage with the topic by moving among the communities within a single category. We find that there are strong regularities in the aggregate pattern of user migration, in that the communities comprising a topic can be arranged in a partial order such that there is more migration in the direction defined by the partial order than against it. Ordered along this overall direction, communities in aggregate become smaller, less toxic, and more linguistically distinctive, suggesting a picture consistent with specialization. We study directions defined not just by the movement of users but also by the movement of URLs and by the direction of mentions from one community to another; each of these produces a consistent direction, but the directions all differ from each other. We show how, collectively, these distinct trends help organize the structure of large online topics, and we compare our findings across both Reddit and Wikipedia and in simulations.
Submitted 16 July, 2024; originally announced July 2024.
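A toy version of the ordering idea in this abstract: given migration counts between communities, arrange the communities so that as much migration as possible flows in the forward direction, then measure how much of the total flow the order explains. The greedy net-flow heuristic and the toy data below are assumptions for illustration, not the procedure used in the paper.

    # Toy illustration of ordering communities so most migration flows "forward".
    # The net-flow heuristic is an illustrative assumption, not arXiv:2407.11794's method.
    def forward_fraction(flow, order):
        idx = {c: i for i, c in enumerate(order)}
        forward = sum(flow[a][b] for a in flow for b in flow[a] if idx[a] < idx[b])
        total = sum(flow[a][b] for a in flow for b in flow[a])
        return forward / total if total else 0.0

    def net_flow_order(flow):
        communities = sorted(set(flow) | {b for a in flow for b in flow[a]})
        # net outflow minus inflow; communities sending more than they receive come first
        net = {c: 0 for c in communities}
        for a in flow:
            for b, w in flow[a].items():
                net[a] += w
                net[b] -= w
        return sorted(communities, key=lambda c: -net[c])

    # flow[a][b] = number of users migrating from community a to community b (toy data)
    flow = {"general": {"intermediate": 120, "niche": 30},
            "intermediate": {"niche": 80, "general": 40},
            "niche": {"general": 10}}
    order = net_flow_order(flow)
    print(order, round(forward_fraction(flow, order), 2))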
arXiv:2407.07351  [cs.CV (Computer Vision and Pattern Recognition)]  https://arxiv.org/abs/2407.07351
Title: Unity in Diversity: Multi-expert Knowledge Confrontation and Collaboration for Generalizable Vehicle Re-identification
Authors: Zhenyu Kuang, Hongyang Zhang, Mang Ye, Bin Yang, Yinhao Liu, Yue Huang, Xinghao Ding, Huafeng Li
Abstract: Generalizable vehicle re-identification (ReID) seeks to develop models that can adapt to unknown target domains without the need for additional fine-tuning or retraining. Previous works have mainly focused on extracting domain-invariant features by aligning data distributions between source domains. However, because of the inherent domain-related redundancy in the source images, relying solely on common features is insufficient for accurately capturing complementary features with lower occurrence probability and smaller energy. To solve this problem, we propose a two-stage Multi-expert Knowledge Confrontation and Collaboration (MiKeCoCo) method, which fully leverages the high-level semantics of Contrastive Language-Image Pretraining (CLIP) to obtain a diversified prompt set and achieve complementary feature representations. Specifically, we first design a Spectrum-based Transformation for Redundancy Elimination and Augmentation Module (STREAM) that uses simple image preprocessing to obtain two types of image inputs for the training process. Because STREAM eliminates domain-related redundancy in source images, it enables the model to pay closer attention to the detailed prompt set that is crucial for distinguishing fine-grained vehicles. This learned prompt set, related to vehicle identity, is then used to guide the comprehensive representation learning of complementary features for final knowledge fusion and identity recognition. Inspired by the unity principle, MiKeCoCo integrates the experts' diverse evaluation perspectives to ensure the accuracy and consistency of ReID. Extensive experimental results demonstrate that our method achieves state-of-the-art performance.
Submitted 4 February, 2025; v1 submitted 10 July, 2024; originally announced July 2024.

arXiv:2407.07289  [cs.CV (Computer Vision and Pattern Recognition)]  https://arxiv.org/abs/2407.07289
Title: Deformable Feature Alignment and Refinement for Moving Infrared Dim-small Target Detection
Authors: Dengyan Luo, Yanping Xiang, Hu Wang, Luping Ji, Shuai Li, Mao Ye
Abstract: The detection of moving infrared dim-small targets has been a challenging and prevalent research topic. The current state-of-the-art methods are mainly based on ConvLSTM to aggregate information from adjacent frames to facilitate the detection of the current frame. However, these methods implicitly utilize motion information only in the training stage and fail to explicitly explore motion compensation, resulting in poor performance on video sequences with large motion. In this paper, we propose a Deformable Feature Alignment and Refinement (DFAR) method based on deformable convolution to explicitly use motion context in both the training and inference stages. Specifically, a Temporal Deformable Alignment (TDA) module based on the designed Dilated Convolution Attention Fusion (DCAF) block is developed to explicitly align the adjacent frames with the current frame at the feature level. Then, the feature refinement module adaptively fuses the aligned features and further aggregates useful spatio-temporal information by means of the proposed Attention-guided Deformable Fusion (AGDF) block. In addition, to improve the alignment of adjacent frames with the current frame, we extend the traditional loss function by introducing a new motion compensation loss. Extensive experimental results demonstrate that the proposed DFAR method achieves state-of-the-art performance on two benchmark datasets, DAUB and IRDST.
Submitted 9 July, 2024; originally announced July 2024.
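The feature-level alignment step described above can be illustrated with a generic deformable-convolution block: offsets are predicted from the concatenated current and neighbouring frame features and used to warp the neighbour's features toward the current frame. The offset-prediction design below is an assumption in the spirit of TDA, not the actual DCAF/TDA architecture.

    # Generic sketch of deformable-convolution feature alignment of a neighbouring
    # frame to the current frame; an illustrative assumption, not arXiv:2407.07289's design.
    import torch
    import torch.nn as nn
    from torchvision.ops import DeformConv2d

    class TemporalAlign(nn.Module):
        def __init__(self, channels=64, kernel_size=3):
            super().__init__()
            # Offsets are predicted from the concatenated [current, neighbour] features.
            self.offset_pred = nn.Conv2d(2 * channels, 2 * kernel_size * kernel_size,
                                         kernel_size=3, padding=1)
            self.deform = DeformConv2d(channels, channels, kernel_size,
                                       padding=kernel_size // 2)

        def forward(self, feat_cur, feat_nbr):
            offset = self.offset_pred(torch.cat([feat_cur, feat_nbr], dim=1))
            return self.deform(feat_nbr, offset)   # neighbour feature warped toward current frame

    align = TemporalAlign()
    cur, nbr = torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32)
    print(align(cur, nbr).shape)  # torch.Size([1, 64, 32, 32])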
arXiv:2407.06040  [cs.CR (Cryptography and Security)]  https://arxiv.org/abs/2407.06040
Title: Enabling Performant and Secure EDA as a Service in Public Clouds Using Confidential Containers
Authors: Mengmei Ye, Derren Dunn, Daniele Buono, Angelo Ruocco, Claudio Carvalho, Tobin Feldman-fitzthum, Hubertus Franke, James Bottomley
Abstract: Increasingly, business opportunities available to fabless design teams in the semiconductor industry far exceed those addressable with on-prem compute resources. An attractive option for capturing these electronic design automation (EDA) design opportunities is public cloud bursting. However, security concerns with public cloud bursting arise from having to protect process design kits, third-party intellectual property, and new design data for semiconductor devices and chips. One way to address these security concerns is to leverage confidential containers for EDA workloads. Confidential containers add zero-trust computing elements to significantly reduce the probability of intellectual property escapes. A key concern that often follows security discussions is whether EDA workload performance will suffer with confidential computing. In this work we demonstrate a full set of EDA confidential containers and their deployment, and we characterize the performance impacts of the confidential elements of the flow, including storage and networking. A complete end-to-end confidential container-based EDA workload exhibits 7.13% and 2.05% performance overheads over bare-metal container and VM-based solutions, respectively.
Submitted 8 July, 2024; originally announced July 2024.

arXiv:2406.18937  [cs.LG (Machine Learning), cs.AI (Artificial Intelligence)]  https://arxiv.org/abs/2406.18937
Title: Federated Graph Semantic and Structural Learning
Authors: Wenke Huang, Guancheng Wan, Mang Ye, Bo Du
Abstract: Federated graph learning collaboratively learns a global graph neural network with distributed graphs, where the non-independent and identically distributed property is one of the major challenges. Most related works focus on traditional distributed tasks such as images and voice and are not applicable to graph structures. This paper first reveals that local client distortion is caused by both node-level semantics and graph-level structure. First, for node-level semantics, we find that contrasting nodes from distinct classes helps provide well-performing discrimination. We pull the local node towards the global node of the same class and push it away from the global nodes of different classes. Second, we postulate that a well-structured graph neural network exhibits similarity among neighbours due to the inherent adjacency relationships. However, aligning each node with its adjacent nodes hinders discrimination due to potential class inconsistency. We transform the adjacency relationships into a similarity distribution and leverage the global model to distill the relation knowledge into the local model, which preserves the structural information and discriminability of the local model. Empirical results on three graph datasets demonstrate the superiority of the proposed method over its counterparts.
Submitted 29 June, 2024; v1 submitted 27 June, 2024; originally announced June 2024.
Journal ref: International Joint Conference on Artificial Intelligence (IJCAI), 2023
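The two components described in this abstract lend themselves to a compact sketch: a node-level contrastive term that pulls a local node embedding toward the global embedding of its class and away from other classes, and a graph-level term that distills the global model's neighbour-similarity distribution into the local model. The loss forms, temperatures, and tensor names below are illustrative assumptions, not the paper's exact objective.

    # Illustrative sketch of the two losses described above; forms and
    # temperatures are assumptions, not the exact objective of arXiv:2406.18937.
    import torch
    import torch.nn.functional as F

    def node_contrastive_loss(local_emb, labels, global_class_emb, tau=0.5):
        """local_emb: (N, d); labels: (N,); global_class_emb: (C, d) per-class global anchors."""
        logits = F.cosine_similarity(local_emb.unsqueeze(1),
                                     global_class_emb.unsqueeze(0), dim=-1) / tau
        return F.cross_entropy(logits, labels)   # same-class anchor up, other classes down

    def structure_distill_loss(local_emb, global_emb, edge_index, tau=0.5):
        """edge_index: (2, E) adjacency pairs; match local neighbour-similarity to global."""
        src, dst = edge_index
        local_sim = F.cosine_similarity(local_emb[src], local_emb[dst], dim=-1) / tau
        global_sim = F.cosine_similarity(global_emb[src], global_emb[dst], dim=-1) / tau
        return F.kl_div(F.log_softmax(local_sim, dim=0), F.softmax(global_sim, dim=0),
                        reduction="batchmean")

    # Toy tensors standing in for GNN outputs on one client.
    local_emb = torch.randn(6, 16, requires_grad=True)
    global_emb = torch.randn(6, 16)
    labels, class_anchors = torch.tensor([0, 1, 2, 0, 1, 2]), torch.randn(3, 16)
    edges = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]])
    loss = node_contrastive_loss(local_emb, labels, class_anchors) + \
           structure_distill_loss(local_emb, global_emb, edges)
    loss.backward()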
arXiv:2406.18074  [cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)]  https://arxiv.org/abs/2406.18074
Title: Few-Shot Medical Image Segmentation with High-Fidelity Prototypes
Authors: Song Tang, Shaxu Yan, Xiaozhi Qi, Jianxin Gao, Mao Ye, Jianwei Zhang, Xiatian Zhu
Abstract: Few-shot Semantic Segmentation (FSS) aims to adapt a pretrained model to new classes with as few as a single labelled training sample per class. Although prototype-based approaches have achieved substantial success, existing models are limited to imaging scenarios with clearly distinct objects and relatively uncomplicated backgrounds, e.g., natural images. This makes such models suboptimal for medical imaging, where neither condition holds. To address this problem, we propose a novel Detail Self-refined Prototype Network (DSPNet) that constructs high-fidelity prototypes representing the object foreground and the background more comprehensively. Specifically, to construct global semantics while maintaining the captured detail semantics, we learn the foreground prototypes by modelling the multi-modal structures with clustering and then fusing them in a channel-wise manner. Considering that the background often has no apparent semantic relation across the spatial dimensions, we integrate channel-specific structural information under sparse channel-aware regulation. Extensive experiments on three challenging medical image benchmarks show the superiority of DSPNet over previous state-of-the-art methods.
Submitted 26 June, 2024; originally announced June 2024.
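As background for the prototype-based pipeline this abstract builds on, the sketch below shows the standard baseline: masked average pooling of support features into foreground and background prototypes, followed by cosine-similarity matching on the query. DSPNet's clustering-based multi-prototype construction and detail self-refinement would replace the single masked average; the version here is only the generic starting point.

    # Standard prototype baseline for few-shot segmentation (masked average pooling
    # + cosine matching). Not DSPNet itself; a generic reference point only.
    import torch
    import torch.nn.functional as F

    def masked_avg_prototype(feat, mask):
        """feat: (C, H, W) support features; mask: (H, W) binary foreground mask."""
        mask = mask.float()
        return (feat * mask).sum(dim=(1, 2)) / mask.sum().clamp(min=1.0)   # (C,)

    def segment_query(query_feat, fg_proto, bg_proto, temperature=20.0):
        """query_feat: (C, H, W); returns (2, H, W) softmax scores for bg/fg."""
        protos = torch.stack([bg_proto, fg_proto])                          # (2, C)
        sim = F.cosine_similarity(query_feat.unsqueeze(0),
                                  protos[:, :, None, None], dim=1)          # (2, H, W)
        return (temperature * sim).softmax(dim=0)

    # Toy run with random "backbone" features.
    support, query = torch.randn(64, 32, 32), torch.randn(64, 32, 32)
    support_mask = (torch.rand(32, 32) > 0.7)
    fg = masked_avg_prototype(support, support_mask)
    bg = masked_avg_prototype(support, ~support_mask)
    print(segment_query(query, fg, bg).argmax(dim=0).shape)   # (32, 32) predicted mask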
arXiv:2406.17963  [cs.LG (Machine Learning), cs.HC (Human-Computer Interaction), cs.SI (Social and Information Networks)]  https://arxiv.org/abs/2406.17963
Title: Empowering Interdisciplinary Insights with Dynamic Graph Embedding Trajectories
Authors: Yiqiao Jin, Andrew Zhao, Yeon-Chang Lee, Meng Ye, Ajay Divakaran, Srijan Kumar
Abstract: We developed DyGETViz, a novel framework for effectively visualizing dynamic graphs (DGs), which are ubiquitous across diverse real-world systems. The framework leverages recent advances in discrete-time dynamic graph (DTDG) models to handle the temporal dynamics inherent in dynamic graphs. DyGETViz captures both micro- and macro-level structural shifts within these graphs, offering a robust method for representing complex and massive dynamic graphs. Its applications extend to a diverse array of domains, including ethology, epidemiology, finance, genetics, linguistics, communication studies, social studies, and international relations. Through its implementation, DyGETViz has revealed or confirmed various critical insights, including the diversity of content-sharing patterns and the degree of specialization within online communities, the chronological evolution of lexicons across decades, and the distinct trajectories exhibited by aging-related and non-related genes. Importantly, DyGETViz enhances the accessibility of scientific findings to non-domain experts by simplifying the complexities of dynamic graphs. Our framework is released as an open-source Python package for use across diverse disciplines. Our work not only addresses the ongoing challenges in visualizing and analyzing DTDG models but also establishes a foundational framework for future investigations into dynamic graph representation and analysis across disciplines.
Submitted 28 June, 2024; v1 submitted 25 June, 2024; originally announced June 2024.
Comments: 27 pages, 11 figures
arXiv:2406.16442  [cs.CV (Computer Vision and Pattern Recognition)]  https://arxiv.org/abs/2406.16442
Title: EmoLLM: Multimodal Emotional Understanding Meets Large Language Models
Authors: Qu Yang, Mang Ye, Bo Du
Abstract: Multi-modal large language models (MLLMs) have achieved remarkable performance on objective multimodal perception tasks, but their ability to interpret subjective, emotionally nuanced multimodal content remains largely unexplored. This gap impedes their ability to effectively understand and react to the intricate emotions expressed by humans through multimodal media. To bridge this gap, we introduce EmoBench, the first comprehensive benchmark designed specifically to evaluate the emotional capabilities of MLLMs across five popular emotional tasks, using a diverse dataset of 287k images and videos paired with corresponding textual instructions. Meanwhile, we propose EmoLLM, a novel model for multimodal emotional understanding that incorporates two core techniques: (1) Multi-perspective Visual Projection, which captures diverse emotional cues from visual data from multiple perspectives, and (2) EmoPrompt, which guides MLLMs to reason about emotions in the correct direction. Experimental results demonstrate that EmoLLM significantly improves multimodal emotional understanding performance, with an average improvement of 12.1% across multiple foundation models on EmoBench. Our work contributes to the advancement of MLLMs by facilitating a deeper and more nuanced comprehension of intricate human emotions, paving the way for artificial emotional intelligence capabilities with wide-ranging applications in areas such as human-computer interaction, mental health support, and empathetic AI systems. Code, data, and model will be released.
Submitted 29 June, 2024; v1 submitted 24 June, 2024; originally announced June 2024.
Comments: 9 pages
