CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,603 results for author: <span class="mathjax">Wang, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wang%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wang, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18169">arXiv:2411.18169</a> <span> [<a href="https://arxiv.org/pdf/2411.18169">pdf</a>, <a href="https://arxiv.org/format/2411.18169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PDZSeg: Adapting the Foundation Model for Dissection Zone Segmentation with Visual Prompts in Robot-assisted Endoscopic Submucosal Dissection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengya Xu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+W">Wenjin Mo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guankun Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Huxin Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">An Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hongliang Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18169v1-abstract-short" style="display: inline;"> Purpose: Endoscopic surgical environments present challenges for dissection zone segmentation due to unclear boundaries between tissue types, leading to segmentation errors where models misidentify or overlook edges. This study aims to provide precise dissection zone suggestions during endoscopic submucosal dissection (ESD) procedures, enhancing ESD safety. Methods: We propose the Prompted-based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18169v1-abstract-full" style="display: none;"> Purpose: Endoscopic surgical environments present challenges for dissection zone segmentation due to unclear boundaries between tissue types, leading to segmentation errors where models misidentify or overlook edges. This study aims to provide precise dissection zone suggestions during endoscopic submucosal dissection (ESD) procedures, enhancing ESD safety. Methods: We propose the Prompted-based Dissection Zone Segmentation (PDZSeg) model, designed to leverage diverse visual prompts such as scribbles and bounding boxes. By overlaying these prompts onto images and fine-tuning a foundational model on a specialized dataset, our approach improves segmentation performance and user experience through flexible input methods. Results: The PDZSeg model was validated using three experimental setups: in-domain evaluation, variability in visual prompt availability, and robustness assessment. Using the ESD-DZSeg dataset, results show that our method outperforms state-of-the-art segmentation approaches. This is the first study to integrate visual prompt design into dissection zone segmentation. Conclusion: The PDZSeg model effectively utilizes visual prompts to enhance segmentation performance and user experience, supported by the novel ESD-DZSeg dataset as a benchmark for dissection zone segmentation in ESD. Our work establishes a foundation for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18169v1-abstract-full').style.display = 'none'; document.getElementById('2411.18169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17795">arXiv:2411.17795</a> <span> [<a href="https://arxiv.org/pdf/2411.17795">pdf</a>, <a href="https://arxiv.org/format/2411.17795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pan-protein Design Learning Enables Task-adaptive Generalization for Low-resource Enzyme Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiangbin Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S+Z">Stan Z. Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17795v1-abstract-short" style="display: inline;"> Computational protein design (CPD) offers transformative potential for bioengineering, but current deep CPD models, focused on universal domains, struggle with function-specific designs. This work introduces a novel CPD paradigm tailored for functional design tasks, particularly for enzymes-a key protein class often lacking specific application efficiency. To address structural data scarcity, we p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17795v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17795v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17795v1-abstract-full" style="display: none;"> Computational protein design (CPD) offers transformative potential for bioengineering, but current deep CPD models, focused on universal domains, struggle with function-specific designs. This work introduces a novel CPD paradigm tailored for functional design tasks, particularly for enzymes-a key protein class often lacking specific application efficiency. To address structural data scarcity, we present CrossDesign, a domain-adaptive framework that leverages pretrained protein language models (PPLMs). By aligning protein structures with sequences, CrossDesign transfers pretrained knowledge to structure models, overcoming the limitations of limited structural data. The framework combines autoregressive (AR) and non-autoregressive (NAR) states in its encoder-decoder architecture, applying it to enzyme datasets and pan-proteins. Experimental results highlight CrossDesign's superior performance and robustness, especially with out-of-domain enzymes. Additionally, the model excels in fitness prediction when tested on large-scale mutation data, showcasing its stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17795v1-abstract-full').style.display = 'none'; document.getElementById('2411.17795v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16721">arXiv:2411.16721</a> <span> [<a href="https://arxiv.org/pdf/2411.16721">pdf</a>, <a href="https://arxiv.org/format/2411.16721">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Steering Away from Harm: An Adaptive Approach to Defending Vision Language Model Against Jailbreaks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16721v1-abstract-short" style="display: inline;"> Vision Language Models (VLMs) can produce unintended and harmful content when exposed to adversarial attacks, particularly because their vision capabilities create new vulnerabilities. Existing defenses, such as input preprocessing, adversarial training, and response evaluation-based methods, are often impractical for real-world deployment due to their high costs. To address this challenge, we pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16721v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16721v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16721v1-abstract-full" style="display: none;"> Vision Language Models (VLMs) can produce unintended and harmful content when exposed to adversarial attacks, particularly because their vision capabilities create new vulnerabilities. Existing defenses, such as input preprocessing, adversarial training, and response evaluation-based methods, are often impractical for real-world deployment due to their high costs. To address this challenge, we propose ASTRA, an efficient and effective defense by adaptively steering models away from adversarial feature directions to resist VLM attacks. Our key procedures involve finding transferable steering vectors representing the direction of harmful response and applying adaptive activation steering to remove these directions at inference time. To create effective steering vectors, we randomly ablate the visual tokens from the adversarial images and identify those most strongly associated with jailbreaks. These tokens are then used to construct steering vectors. During inference, we perform the adaptive steering method that involves the projection between the steering vectors and calibrated activation, resulting in little performance drops on benign inputs while strongly avoiding harmful outputs under adversarial inputs. Extensive experiments across multiple models and baselines demonstrate our state-of-the-art performance and high efficiency in mitigating jailbreak risks. Additionally, ASTRA exhibits good transferability, defending against both unseen attacks at design time (i.e., structured-based attacks) and adversarial images from diverse distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16721v1-abstract-full').style.display = 'none'; document.getElementById('2411.16721v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16420">arXiv:2411.16420</a> <span> [<a href="https://arxiv.org/pdf/2411.16420">pdf</a>, <a href="https://arxiv.org/ps/2411.16420">ps</a>, <a href="https://arxiv.org/format/2411.16420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Structured Tensor Decomposition Based Channel Estimation and Double Refinements for Active RIS Empowered Broadband Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yirun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqing Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yuyao Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gongpu Wang</a>, <a href="/search/cs?searchtype=author&query=Tellambura%2C+C">Chintha Tellambura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16420v1-abstract-short" style="display: inline;"> Channel parameter estimation is crucial for optimal designs of next-generation reconfigurable intelligent surface (RIS)-empowered communications and sensing. Tensor-based mechanisms are particularly effective, capturing the multi-dimensional nature of wireless channels, especially in scenarios where RIS integrates with multiple-antenna devices. However, existing studies assume either a line-of-sig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16420v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16420v1-abstract-full" style="display: none;"> Channel parameter estimation is crucial for optimal designs of next-generation reconfigurable intelligent surface (RIS)-empowered communications and sensing. Tensor-based mechanisms are particularly effective, capturing the multi-dimensional nature of wireless channels, especially in scenarios where RIS integrates with multiple-antenna devices. However, existing studies assume either a line-of-sight (LOS) scenario or a blocked condition for non-RIS channel. This paper solves a novel problem: tensor-based channel parameter recovery for active RIS-aided multiple-antenna wideband connections in a multipath environment with non-RIS paths. System settings are customized to construct the received signals as a fifth-order canonical polyadic (CP) tensor. Four of the five-factor matrices unfortunately contain redundant columns, and the remaining one is a Vandermonde matrix, which fails to satisfy the Kruskal condition for tensor decomposition uniqueness. To address this issue, spatial smoothing and Vandermonde structured CP decomposition (VSCPD) are applied, making the tensor factorization problem solvable and providing a relaxed general uniqueness condition. A sequential triple-stage channel estimation framework is proposed based on the factor estimates. The first stage enables multipath identification and algebraic coarse estimation, while the following two stages offer optional successive refinements at the cost of increased complexity. The closed-form Cramer-Rao lower bound (CRLB) is derived to assess the estimation performance. Herein, the noise covariance matrix depends on multipath parameters in our active-RIS scenario. Finally, numerical results are provided to verify the effectiveness of proposed algorithms under various evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16420v1-abstract-full').style.display = 'none'; document.getElementById('2411.16420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16380">arXiv:2411.16380</a> <span> [<a href="https://arxiv.org/pdf/2411.16380">pdf</a>, <a href="https://arxiv.org/format/2411.16380">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Privacy-Preserving Federated Foundation Model for Generalist Ultrasound Artificial Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuncheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chun-Mei Feng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jinke Ren</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jun Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixun Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwen Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunbi Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Rui Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xuemei Tang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Juan Du</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+X">Xiang Wan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yong Xu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xin Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shaohua Zhou</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+S">Shuguang Cui</a>, <a href="/search/cs?searchtype=author&query=Goh%2C+R+S+M">Rick Siow Mong Goh</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16380v1-abstract-short" style="display: inline;"> Ultrasound imaging is widely used in clinical diagnosis due to its non-invasive nature and real-time capabilities. However, conventional ultrasound diagnostics face several limitations, including high dependence on physician expertise and suboptimal image quality, which complicates interpretation and increases the likelihood of diagnostic errors. Artificial intelligence (AI) has emerged as a promi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16380v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16380v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16380v1-abstract-full" style="display: none;"> Ultrasound imaging is widely used in clinical diagnosis due to its non-invasive nature and real-time capabilities. However, conventional ultrasound diagnostics face several limitations, including high dependence on physician expertise and suboptimal image quality, which complicates interpretation and increases the likelihood of diagnostic errors. Artificial intelligence (AI) has emerged as a promising solution to enhance clinical diagnosis, particularly in detecting abnormalities across various biomedical imaging modalities. Nonetheless, current AI models for ultrasound imaging face critical challenges. First, these models often require large volumes of labeled medical data, raising concerns over patient privacy breaches. Second, most existing models are task-specific, which restricts their broader clinical utility. To overcome these challenges, we present UltraFedFM, an innovative privacy-preserving ultrasound foundation model. UltraFedFM is collaboratively pre-trained using federated learning across 16 distributed medical institutions in 9 countries, leveraging a dataset of over 1 million ultrasound images covering 19 organs and 10 ultrasound modalities. This extensive and diverse data, combined with a secure training framework, enables UltraFedFM to exhibit strong generalization and diagnostic capabilities. It achieves an average area under the receiver operating characteristic curve of 0.927 for disease diagnosis and a dice similarity coefficient of 0.878 for lesion segmentation. Notably, UltraFedFM surpasses the diagnostic accuracy of mid-level ultrasonographers and matches the performance of expert-level sonographers in the joint diagnosis of 8 common systemic diseases. These findings indicate that UltraFedFM can significantly enhance clinical diagnostics while safeguarding patient privacy, marking an advancement in AI-driven ultrasound imaging for future clinical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16380v1-abstract-full').style.display = 'none'; document.getElementById('2411.16380v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16106">arXiv:2411.16106</a> <span> [<a href="https://arxiv.org/pdf/2411.16106">pdf</a>, <a href="https://arxiv.org/format/2411.16106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UNOPose: Unseen Object Pose Estimation with an Unposed RGB-D Reference Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xingyu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruida Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyangguang Zhang</a>, <a href="/search/cs?searchtype=author&query=Tombari%2C+F">Federico Tombari</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16106v1-abstract-short" style="display: inline;"> Unseen object pose estimation methods often rely on CAD models or multiple reference views, making the onboarding stage costly. To simplify reference acquisition, we aim to estimate the unseen object's pose through a single unposed RGB-D reference image. While previous works leverage reference images as pose anchors to limit the range of relative pose, our scenario presents significant challenges… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16106v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16106v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16106v1-abstract-full" style="display: none;"> Unseen object pose estimation methods often rely on CAD models or multiple reference views, making the onboarding stage costly. To simplify reference acquisition, we aim to estimate the unseen object's pose through a single unposed RGB-D reference image. While previous works leverage reference images as pose anchors to limit the range of relative pose, our scenario presents significant challenges since the relative transformation could vary across the entire SE(3) space. Moreover, factors like occlusion, sensor noise, and extreme geometry could result in low viewpoint overlap. To address these challenges, we present a novel approach and benchmark, termed UNOPose, for unseen one-reference-based object pose estimation. Building upon a coarse-to-fine paradigm, UNOPose constructs an SE(3)-invariant reference frame to standardize object representation despite pose and size variations. To alleviate small overlap across viewpoints, we recalibrate the weight of each correspondence based on its predicted likelihood of being within the overlapping region. Evaluated on our proposed benchmark based on the BOP Challenge, UNOPose demonstrates superior performance, significantly outperforming traditional and learning-based methods in the one-reference setting and remaining competitive with CAD-model-based methods. The code and dataset will be available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16106v1-abstract-full').style.display = 'none'; document.getElementById('2411.16106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15998">arXiv:2411.15998</a> <span> [<a href="https://arxiv.org/pdf/2411.15998">pdf</a>, <a href="https://arxiv.org/format/2411.15998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> PIANIST: Learning Partially Observable World Models with LLMs for Multi-Agent Decision Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Light%2C+J">Jonathan Light</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+S">Sixue Xing</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuanzhe Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weiqin Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+M">Min Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiusi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yisong Yue</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Ziniu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15998v1-abstract-short" style="display: inline;"> Effective extraction of the world knowledge in LLMs for complex decision-making tasks remains a challenge. We propose a framework PIANIST for decomposing the world model into seven intuitive components conducive to zero-shot LLM generation. Given only the natural language description of the game and how input observations are formatted, our method can generate a working world model for fast and ef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15998v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15998v1-abstract-full" style="display: none;"> Effective extraction of the world knowledge in LLMs for complex decision-making tasks remains a challenge. We propose a framework PIANIST for decomposing the world model into seven intuitive components conducive to zero-shot LLM generation. Given only the natural language description of the game and how input observations are formatted, our method can generate a working world model for fast and efficient MCTS simulation. We show that our method works well on two different games that challenge the planning and decision making skills of the agent for both language and non-language based action taking, without any training on domain-specific training data or explicitly defined world model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15998v1-abstract-full').style.display = 'none'; document.getElementById('2411.15998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at Language Gamification Workshop 2024 @ NeurIPS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14684">arXiv:2411.14684</a> <span> [<a href="https://arxiv.org/pdf/2411.14684">pdf</a>, <a href="https://arxiv.org/format/2411.14684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross Group Attention and Group-wise Rolling for Multimodal Medical Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+T">Tao Song</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yicheng Wu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Minhao Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiangde Luo</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+L">Linda Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guotai Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yi Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Feng Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14684v1-abstract-short" style="display: inline;"> Multimodal MR image synthesis aims to generate missing modality image by fusing and mapping a few available MRI data. Most existing approaches typically adopt an image-to-image translation scheme. However, these methods often suffer from sub-optimal performance due to the spatial misalignment between different modalities while they are typically treated as input channels. Therefore, in this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14684v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14684v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14684v1-abstract-full" style="display: none;"> Multimodal MR image synthesis aims to generate missing modality image by fusing and mapping a few available MRI data. Most existing approaches typically adopt an image-to-image translation scheme. However, these methods often suffer from sub-optimal performance due to the spatial misalignment between different modalities while they are typically treated as input channels. Therefore, in this paper, we propose an Adaptive Group-wise Interaction Network (AGI-Net) that explores both inter-modality and intra-modality relationships for multimodal MR image synthesis. Specifically, groups are first pre-defined along the channel dimension and then we perform an adaptive rolling for the standard convolutional kernel to capture inter-modality spatial correspondences. At the same time, a cross-group attention module is introduced to fuse information across different channel groups, leading to better feature representation. We evaluated the effectiveness of our model on the publicly available IXI and BraTS2023 datasets, where the AGI-Net achieved state-of-the-art performance for multimodal MR image synthesis. Code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14684v1-abstract-full').style.display = 'none'; document.getElementById('2411.14684v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14522">arXiv:2411.14522</a> <span> [<a href="https://arxiv.org/pdf/2411.14522">pdf</a>, <a href="https://arxiv.org/format/2411.14522">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GMAI-VL & GMAI-VL-5.5M: A Large Vision-Language Model and A Comprehensive Multimodal Dataset Towards General Medical AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziyan Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenglong Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Ming Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanjun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pengcheng Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaowei Hu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhongying Deng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14522v1-abstract-short" style="display: inline;"> Despite significant advancements in general artificial intelligence, such as GPT-4, their effectiveness in the medical domain (general medical AI, GMAI) remains constrained due to the absence of specialized medical knowledge. To address this challenge, we present GMAI-VL-5.5M, a comprehensive multimodal medical dataset created by converting hundreds of specialized medical datasets into meticulousl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14522v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14522v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14522v1-abstract-full" style="display: none;"> Despite significant advancements in general artificial intelligence, such as GPT-4, their effectiveness in the medical domain (general medical AI, GMAI) remains constrained due to the absence of specialized medical knowledge. To address this challenge, we present GMAI-VL-5.5M, a comprehensive multimodal medical dataset created by converting hundreds of specialized medical datasets into meticulously constructed image-text pairs. This dataset features comprehensive task coverage, diverse modalities, and high-quality image-text data. Building upon this multimodal dataset, we propose GMAI-VL, a general medical vision-language model with a progressively three-stage training strategy. This approach significantly enhances the model's ability by integrating visual and textual information, thereby improving its ability to process multimodal data and support accurate diagnosis and clinical decision-making. Experimental evaluations demonstrate that GMAI-VL achieves state-of-the-art results across a wide range of multimodal medical tasks, such as visual question answering and medical image diagnosis. Our contributions include the development of the GMAI-VL-5.5M dataset, the introduction of the GMAI-VL model, and the establishment of new benchmarks in multiple medical domains. Code and dataset will be released at https://github.com/uni-medical/GMAI-VL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14522v1-abstract-full').style.display = 'none'; document.getElementById('2411.14522v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13867">arXiv:2411.13867</a> <span> [<a href="https://arxiv.org/pdf/2411.13867">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Fuzzy System for Sequence Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hailong Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhaohong Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhuangzhuang Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanjin Wang</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+K">Kup-sze Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13867v1-abstract-short" style="display: inline;"> Generative Models (GMs), particularly Large Language Models (LLMs), have garnered significant attention in machine learning and artificial intelligence for their ability to generate new data by learning the statistical properties of training data and creating data that resemble the original. This capability offers a wide range of applications across various domains. However, the complex structures… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13867v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13867v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13867v1-abstract-full" style="display: none;"> Generative Models (GMs), particularly Large Language Models (LLMs), have garnered significant attention in machine learning and artificial intelligence for their ability to generate new data by learning the statistical properties of training data and creating data that resemble the original. This capability offers a wide range of applications across various domains. However, the complex structures and numerous model parameters of GMs make the input-output processes opaque, complicating the understanding and control of outputs. Moreover, the purely data-driven learning mechanism limits GM's ability to acquire broader knowledge. There remains substantial potential for enhancing the robustness and generalization capabilities of GMs. In this work, we introduce the fuzzy system, a classical modeling method that combines data and knowledge-driven mechanisms, to generative tasks. We propose a novel Generative Fuzzy System framework, named GenFS, which integrates the deep learning capabilities of GM with the interpretability and dual-driven mechanisms of fuzzy systems. Specifically, we propose an end-to-end GenFS-based model for sequence generation, called FuzzyS2S. A series of experimental studies were conducted on 12 datasets, covering three distinct categories of generative tasks: machine translation, code generation, and summary generation. The results demonstrate that FuzzyS2S outperforms the Transformer in terms of accuracy and fluency. Furthermore, it exhibits better performance on some datasets compared to state-of-the-art models T5 and CodeT5. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13867v1-abstract-full').style.display = 'none'; document.getElementById('2411.13867v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12814">arXiv:2411.12814</a> <span> [<a href="https://arxiv.org/pdf/2411.12814">pdf</a>, <a href="https://arxiv.org/format/2411.12814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junlong Cheng</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">He Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junren Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingwen Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12814v2-abstract-short" style="display: inline;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12814v2-abstract-full" style="display: none;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground truth masks from multiple data sources. Then, leveraging the strong object recognition capabilities of a vision foundational model, we automatically generated dense interactive masks for each image and ensured their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks-an average of 56 masks per image. Finally, we developed an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundational models in medical computer vision, we release the IMed-361M and model at https://github.com/uni-medical/IMIS-Bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v2-abstract-full').style.display = 'none'; document.getElementById('2411.12814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12181">arXiv:2411.12181</a> <span> [<a href="https://arxiv.org/pdf/2411.12181">pdf</a>, <a href="https://arxiv.org/format/2411.12181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low Dose Computed Tomography Images Using Consistency Training Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gokmen%2C+M+S">Mahmut S. Gokmen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jin Chen</a>, <a href="/search/cs?searchtype=author&query=Bumgardner%2C+C">Cody Bumgardner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12181v1-abstract-short" style="display: inline;"> Diffusion models have significant impact on wide range of generative tasks, especially on image inpainting and restoration. Although the improvements on aiming for decreasing number of function evaluations (NFE), the iterative results are still computationally expensive. Consistency models are as a new family of generative models, enable single-step sampling of high quality data without the need f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12181v1-abstract-full" style="display: none;"> Diffusion models have significant impact on wide range of generative tasks, especially on image inpainting and restoration. Although the improvements on aiming for decreasing number of function evaluations (NFE), the iterative results are still computationally expensive. Consistency models are as a new family of generative models, enable single-step sampling of high quality data without the need for adversarial training. In this paper, we introduce the beta noise distribution, which provides flexibility in adjusting noise levels. This is combined with a sinusoidal curriculum that enhances the learning of the trajectory between the noise distribution and the posterior distribution of interest, allowing High Noise Improved Consistency Training (HN-iCT) to be trained in a supervised fashion. Additionally, High Noise Improved Consistency Training with Image Condition (HN-iCT-CN) architecture is introduced, enables to take Low Dose images as a condition for extracting significant features by Weighted Attention Gates (WAG).Our results indicate that unconditional image generation using HN-iCT significantly outperforms basic CT and iCT training techniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our image-conditioned model demonstrates exceptional performance in enhancing low-dose (LD) CT scans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'none'; document.getElementById('2411.12181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10280">arXiv:2411.10280</a> <span> [<a href="https://arxiv.org/pdf/2411.10280">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> From Score-Driven to Value-Sharing: Understanding Chinese Family Use of AI to Support Decision Making of College Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Si Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jingyi Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Haocong Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yun Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10280v1-abstract-short" style="display: inline;"> This study investigates how 18-year-old students, parents, and experts in China utilize artificial intelligence (AI) tools to support decision-making in college applications during college entrance exam -- a highly competitive, score-driven, annual national exam. Through 32 interviews, we examine the use of Quark GaoKao, an AI tool that generates college application lists and acceptance probabilit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10280v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10280v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10280v1-abstract-full" style="display: none;"> This study investigates how 18-year-old students, parents, and experts in China utilize artificial intelligence (AI) tools to support decision-making in college applications during college entrance exam -- a highly competitive, score-driven, annual national exam. Through 32 interviews, we examine the use of Quark GaoKao, an AI tool that generates college application lists and acceptance probabilities based on exam scores, historical data, preferred locations, etc. Our findings show that AI tools are predominantly used by parents with limited involvement from students, and often focus on immediate exam results, failing to address long-term career goals. We also identify challenges such as misleading AI recommendations, and irresponsible use of AI by third-party consultant agencies. Finally, we offer design insights to better support multi-stakeholders' decision-making in families, especially in the Chinese context, and discuss how emerging AI tools create barriers for families with fewer resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10280v1-abstract-full').style.display = 'none'; document.getElementById('2411.10280v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10036">arXiv:2411.10036</a> <span> [<a href="https://arxiv.org/pdf/2411.10036">pdf</a>, <a href="https://arxiv.org/format/2411.10036">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Normalization Strategies and Convolutional Kernels for Multimodal Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+D">Dan He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guofen Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weisheng Li</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Y">Yucheng Shu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lijian Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuping Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Feiyan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10036v1-abstract-short" style="display: inline;"> Multimodal image fusion (MMIF) aims to integrate information from different modalities to obtain a comprehensive image, aiding downstream tasks. However, existing methods tend to prioritize natural image fusion and focus on information complementary and network training strategies. They ignore the essential distinction between natural and medical image fusion and the influence of underlying compon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10036v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10036v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10036v1-abstract-full" style="display: none;"> Multimodal image fusion (MMIF) aims to integrate information from different modalities to obtain a comprehensive image, aiding downstream tasks. However, existing methods tend to prioritize natural image fusion and focus on information complementary and network training strategies. They ignore the essential distinction between natural and medical image fusion and the influence of underlying components. This paper dissects the significant differences between the two tasks regarding fusion goals, statistical properties, and data distribution. Based on this, we rethink the suitability of the normalization strategy and convolutional kernels for end-to-end MMIF.Specifically, this paper proposes a mixture of instance normalization and group normalization to preserve sample independence and reinforce intrinsic feature correlation.This strategy promotes the potential of enriching feature maps, thus boosting fusion performance. To this end, we further introduce the large kernel convolution, effectively expanding receptive fields and enhancing the preservation of image detail. Moreover, the proposed multipath adaptive fusion module recalibrates the decoder input with features of various scales and receptive fields, ensuring the transmission of crucial information. Extensive experiments demonstrate that our method exhibits state-of-the-art performance in multiple fusion tasks and significantly improves downstream applications. The code is available at https://github.com/HeDan-11/LKC-FUNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10036v1-abstract-full').style.display = 'none'; document.getElementById('2411.10036v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09209">arXiv:2411.09209</a> <span> [<a href="https://arxiv.org/pdf/2411.09209">pdf</a>, <a href="https://arxiv.org/format/2411.09209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> JoyVASA: Portrait and Animal Image Animation with Diffusion-Based Audio-Driven Facial Dynamics and Head Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuyang Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoxin Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Sheng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yang Yao</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+J">Jintao Fei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Minyu Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09209v3-abstract-short" style="display: inline;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'inline'; document.getElementById('2411.09209v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09209v3-abstract-full" style="display: none;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics and head motion in audio-driven facial animation. Specifically, in the first stage, we introduce a decoupled facial representation framework that separates dynamic facial expressions from static 3D facial representations. This decoupling allows the system to generate longer videos by combining any static 3D facial representation with dynamic motion sequences. Then, in the second stage, a diffusion transformer is trained to generate motion sequences directly from audio cues, independent of character identity. Finally, a generator trained in the first stage uses the 3D facial representation and the generated motion sequences as inputs to render high-quality animations. With the decoupled facial representation and the identity-independent motion generation process, JoyVASA extends beyond human portraits to animate animal faces seamlessly. The model is trained on a hybrid dataset of private Chinese and public English data, enabling multilingual support. Experimental results validate the effectiveness of our approach. Future work will focus on improving real-time performance and refining expression control, further expanding the applications in portrait animation. The code is available at: https://github.com/jdh-algo/JoyVASA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'none'; document.getElementById('2411.09209v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07387">arXiv:2411.07387</a> <span> [<a href="https://arxiv.org/pdf/2411.07387">pdf</a>, <a href="https://arxiv.org/format/2411.07387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Isochrony-Controlled Speech-to-Text Translation: A study on translating from Sino-Tibetan to Indo-European Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yousefi%2C+M">Midia Yousefi</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Y">Yao Qian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junkun Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dongmei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jian Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07387v1-abstract-short" style="display: inline;"> End-to-end speech translation (ST), which translates source language speech directly into target language text, has garnered significant attention in recent years. Many ST applications require strict length control to ensure that the translation duration matches the length of the source audio, including both speech and pause segments. Previous methods often controlled the number of words or charac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07387v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07387v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07387v1-abstract-full" style="display: none;"> End-to-end speech translation (ST), which translates source language speech directly into target language text, has garnered significant attention in recent years. Many ST applications require strict length control to ensure that the translation duration matches the length of the source audio, including both speech and pause segments. Previous methods often controlled the number of words or characters generated by the Machine Translation model to approximate the source sentence's length without considering the isochrony of pauses and speech segments, as duration can vary between languages. To address this, we present improvements to the duration alignment component of our sequence-to-sequence ST model. Our method controls translation length by predicting the duration of speech and pauses in conjunction with the translation process. This is achieved by providing timing information to the decoder, ensuring it tracks the remaining duration for speech and pauses while generating the translation. The evaluation on the Zh-En test set of CoVoST 2, demonstrates that the proposed Isochrony-Controlled ST achieves 0.92 speech overlap and 8.9 BLEU, which has only a 1.4 BLEU drop compared to the ST baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07387v1-abstract-full').style.display = 'none'; document.getElementById('2411.07387v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06503">arXiv:2411.06503</a> <span> [<a href="https://arxiv.org/pdf/2411.06503">pdf</a>, <a href="https://arxiv.org/format/2411.06503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Sampling Correction via Approximately 10 Parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangyi Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+W">Wei Peng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lijiang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenyu Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuren Cai</a>, <a href="/search/cs?searchtype=author&query=Su%2C+S">Songzhi Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06503v2-abstract-short" style="display: inline;"> Diffusion Probabilistic Models (DPMs) have demonstrated exceptional performance in generative tasks, but this comes at the expense of sampling efficiency. To enhance sampling speed without sacrificing quality, various distillation-based accelerated sampling algorithms have been recently proposed. However, they typically require significant additional training costs and model parameter storage, whi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06503v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06503v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06503v2-abstract-full" style="display: none;"> Diffusion Probabilistic Models (DPMs) have demonstrated exceptional performance in generative tasks, but this comes at the expense of sampling efficiency. To enhance sampling speed without sacrificing quality, various distillation-based accelerated sampling algorithms have been recently proposed. However, they typically require significant additional training costs and model parameter storage, which limit their practical application. In this work, we propose PCA-based Adaptive Search (PAS), which optimizes existing solvers for DPMs with minimal learnable parameters and training costs. Specifically, we first employ PCA to obtain a few orthogonal unit basis vectors to span the high-dimensional sampling space, which enables us to learn just a set of coordinates to correct the sampling direction; furthermore, based on the observation that the cumulative truncation error exhibits an ``S''-shape, we design an adaptive search strategy that further enhances the sampling efficiency and reduces the number of stored parameters to approximately 10. Extensive experiments demonstrate that PAS can significantly enhance existing fast solvers in a plug-and-play manner with negligible costs. For instance, on CIFAR10, PAS requires only 12 parameters and less than 1 minute of training on a single NVIDIA A100 GPU to optimize the DDIM from 15.69 FID (NFE=10) to 4.37. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06503v2-abstract-full').style.display = 'none'; document.getElementById('2411.06503v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06316">arXiv:2411.06316</a> <span> [<a href="https://arxiv.org/pdf/2411.06316">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Prompts Matter: Comparing ML/GAI Approaches for Generating Inductive Qualitative Coding Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">John Chen</a>, <a href="/search/cs?searchtype=author&query=Lotsos%2C+A">Alexandros Lotsos</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lexie Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Grace Wang</a>, <a href="/search/cs?searchtype=author&query=Wilensky%2C+U">Uri Wilensky</a>, <a href="/search/cs?searchtype=author&query=Sherin%2C+B">Bruce Sherin</a>, <a href="/search/cs?searchtype=author&query=Horn%2C+M">Michael Horn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06316v1-abstract-short" style="display: inline;"> Inductive qualitative methods have been a mainstay of education research for decades, yet it takes much time and effort to conduct rigorously. Recent advances in artificial intelligence, particularly with generative AI (GAI), have led to initial success in generating inductive coding results. Like human coders, GAI tools rely on instructions to work, and how to instruct it may matter. To understan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06316v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06316v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06316v1-abstract-full" style="display: none;"> Inductive qualitative methods have been a mainstay of education research for decades, yet it takes much time and effort to conduct rigorously. Recent advances in artificial intelligence, particularly with generative AI (GAI), have led to initial success in generating inductive coding results. Like human coders, GAI tools rely on instructions to work, and how to instruct it may matter. To understand how ML/GAI approaches could contribute to qualitative coding processes, this study applied two known and two theory-informed novel approaches to an online community dataset and evaluated the resulting coding results. Our findings show significant discrepancies between ML/GAI approaches and demonstrate the advantage of our approaches, which introduce human coding processes into GAI prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06316v1-abstract-full').style.display = 'none'; document.getElementById('2411.06316v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AERA 2025 Annual Meeting</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05878">arXiv:2411.05878</a> <span> [<a href="https://arxiv.org/pdf/2411.05878">pdf</a>, <a href="https://arxiv.org/format/2411.05878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Joint-Optimized Unsupervised Adversarial Domain Adaptation in Remote Sensing Segmentation with Prompted Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yiwei He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangbiao Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenwei Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05878v2-abstract-short" style="display: inline;"> Unsupervised Domain Adaptation for Remote Sensing Semantic Segmentation (UDA-RSSeg) addresses the challenge of adapting a model trained on source domain data to target domain samples, thereby minimizing the need for annotated data across diverse remote sensing scenes. This task presents two principal challenges: (1) severe inconsistencies in feature representation across different remote sensing d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05878v2-abstract-full').style.display = 'inline'; document.getElementById('2411.05878v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05878v2-abstract-full" style="display: none;"> Unsupervised Domain Adaptation for Remote Sensing Semantic Segmentation (UDA-RSSeg) addresses the challenge of adapting a model trained on source domain data to target domain samples, thereby minimizing the need for annotated data across diverse remote sensing scenes. This task presents two principal challenges: (1) severe inconsistencies in feature representation across different remote sensing domains, and (2) a domain gap that emerges due to the representation bias of source domain patterns when translating features to predictive logits. To tackle these issues, we propose a joint-optimized adversarial network incorporating the "Segment Anything Model (SAM) (SAM-JOANet)" for UDA-RSSeg. Our approach integrates SAM to leverage its robust generalized representation capabilities, thereby alleviating feature inconsistencies. We introduce a finetuning decoder designed to convert SAM-Encoder features into predictive logits. Additionally, a feature-level adversarial-based prompted segmentor is employed to generate class-agnostic maps, which guide the finetuning decoder's feature representations. The network is optimized end-to-end, combining the prompted segmentor and the finetuning decoder. Extensive evaluations on benchmark datasets, including ISPRS (Potsdam/Vaihingen) and CITY-OSM (Paris/Chicago), demonstrate the effectiveness of our method. The results, supported by visualization and analysis, confirm the method's interpretability and robustness. The code of this paper is available at https://github.com/CV-ShuchangLyu/SAM-JOANet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05878v2-abstract-full').style.display = 'none'; document.getElementById('2411.05878v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages,6 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05210">arXiv:2411.05210</a> <span> [<a href="https://arxiv.org/pdf/2411.05210">pdf</a>, <a href="https://arxiv.org/format/2411.05210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Algorithmic Autonomy in Data-Driven AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Pea%2C+R">Roy Pea</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05210v1-abstract-short" style="display: inline;"> In societies increasingly entangled with algorithms, our choices are constantly influenced and shaped by automated systems. This convergence highlights significant concerns for individual autonomy in the age of data-driven AI. It leads to pressing issues such as data-driven segregation, gaps in accountability for algorithmic decisions, and the infringement on essential human rights and values. Thr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05210v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05210v1-abstract-full" style="display: none;"> In societies increasingly entangled with algorithms, our choices are constantly influenced and shaped by automated systems. This convergence highlights significant concerns for individual autonomy in the age of data-driven AI. It leads to pressing issues such as data-driven segregation, gaps in accountability for algorithmic decisions, and the infringement on essential human rights and values. Through this article, we introduce and explore the concept of algorithmic autonomy, examining what it means for individuals to have autonomy in the face of the pervasive impact of algorithms on our societies. We begin by outlining the data-driven characteristics of AI and its role in diminishing personal autonomy. We then explore the notion of algorithmic autonomy, drawing on existing research. Finally, we address important considerations, highlighting current challenges and directions for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05210v1-abstract-full').style.display = 'none'; document.getElementById('2411.05210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04826">arXiv:2411.04826</a> <span> [<a href="https://arxiv.org/pdf/2411.04826">pdf</a>, <a href="https://arxiv.org/format/2411.04826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hong Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Ying Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoquan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianbing Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04826v1-abstract-short" style="display: inline;"> Depth estimation is a crucial technology in robotics. Recently, self-supervised depth estimation methods have demonstrated great potential as they can efficiently leverage large amounts of unlabelled real-world data. However, most existing methods are designed under the assumption of static scenes, which hinders their adaptability in dynamic environments. To address this issue, we present D$^3$ept… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04826v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04826v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04826v1-abstract-full" style="display: none;"> Depth estimation is a crucial technology in robotics. Recently, self-supervised depth estimation methods have demonstrated great potential as they can efficiently leverage large amounts of unlabelled real-world data. However, most existing methods are designed under the assumption of static scenes, which hinders their adaptability in dynamic environments. To address this issue, we present D$^3$epth, a novel method for self-supervised depth estimation in dynamic scenes. It tackles the challenge of dynamic objects from two key perspectives. First, within the self-supervised framework, we design a reprojection constraint to identify regions likely to contain dynamic objects, allowing the construction of a dynamic mask that mitigates their impact at the loss level. Second, for multi-frame depth estimation, we introduce a cost volume auto-masking strategy that leverages adjacent frames to identify regions associated with dynamic objects and generate corresponding masks. This provides guidance for subsequent processes. Furthermore, we propose a spectral entropy uncertainty module that incorporates spectral entropy to guide uncertainty estimation during depth fusion, effectively addressing issues arising from cost volume computation in dynamic environments. Extensive experiments on KITTI and Cityscapes datasets demonstrate that the proposed method consistently outperforms existing self-supervised monocular depth estimation baselines. Code is available at \url{https://github.com/Csyunling/D3epth}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04826v1-abstract-full').style.display = 'none'; document.getElementById('2411.04826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Open sourced</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04595">arXiv:2411.04595</a> <span> [<a href="https://arxiv.org/pdf/2411.04595">pdf</a>, <a href="https://arxiv.org/format/2411.04595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TexLiverNet: Leveraging Medical Knowledge and Spatial-Frequency Perception for Enhanced Liver Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhi Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hailing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guozhong Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhijun Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04595v1-abstract-short" style="display: inline;"> Integrating textual data with imaging in liver tumor segmentation is essential for enhancing diagnostic accuracy. However, current multi-modal medical datasets offer only general text annotations, lacking lesion-specific details critical for extracting nuanced features, especially for fine-grained segmentation of tumor boundaries and small lesions. To address these limitations, we developed datase… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04595v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04595v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04595v1-abstract-full" style="display: none;"> Integrating textual data with imaging in liver tumor segmentation is essential for enhancing diagnostic accuracy. However, current multi-modal medical datasets offer only general text annotations, lacking lesion-specific details critical for extracting nuanced features, especially for fine-grained segmentation of tumor boundaries and small lesions. To address these limitations, we developed datasets with lesion-specific text annotations for liver tumors and introduced the TexLiverNet model. TexLiverNet employs an agent-based cross-attention module that integrates text features efficiently with visual features, significantly reducing computational costs. Additionally, enhanced spatial and adaptive frequency domain perception is proposed to precisely delineate lesion boundaries, reduce background interference, and recover fine details in small lesions. Comprehensive evaluations on public and private datasets demonstrate that TexLiverNet achieves superior performance compared to current state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04595v1-abstract-full').style.display = 'none'; document.getElementById('2411.04595v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02867">arXiv:2411.02867</a> <span> [<a href="https://arxiv.org/pdf/2411.02867">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AtlasSeg: Atlas Prior Guided Dual-U-Net for Cortical Segmentation in Fetal Brain MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haoan Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianshu Zheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinyi Xu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yao Shen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiwei Sun</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Cong Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangbin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Dan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02867v1-abstract-short" style="display: inline;"> Accurate tissue segmentation in fetal brain MRI remains challenging due to the dynamically changing anatomical anatomy and contrast during fetal development. To enhance segmentation accuracy throughout gestation, we introduced AtlasSeg, a dual-U-shape convolution network incorporating gestational age (GA) specific information as guidance. By providing a publicly available fetal brain atlas with se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02867v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02867v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02867v1-abstract-full" style="display: none;"> Accurate tissue segmentation in fetal brain MRI remains challenging due to the dynamically changing anatomical anatomy and contrast during fetal development. To enhance segmentation accuracy throughout gestation, we introduced AtlasSeg, a dual-U-shape convolution network incorporating gestational age (GA) specific information as guidance. By providing a publicly available fetal brain atlas with segmentation label at the corresponding GA, AtlasSeg effectively extracted the contextual features of age-specific patterns in atlas branch and generated tissue segmentation in segmentation branch. Multi-scale attentive atlas feature fusions were constructed in all stages during encoding and decoding, giving rise to a dual-U-shape network to assist feature flow and information interactions between two branches. AtlasSeg outperformed six well-known segmentation networks in both our internal fetal brain MRI dataset and the external FeTA dataset. Ablation experiments demonstrate the efficiency of atlas guidance and the attention mechanism. The proposed AtlasSeg demonstrated superior segmentation performance against other convolution networks with higher segmentation accuracy, and may facilitate fetal brain MRI analysis in large-scale fetal brain studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02867v1-abstract-full').style.display = 'none'; document.getElementById('2411.02867v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02093">arXiv:2411.02093</a> <span> [<a href="https://arxiv.org/pdf/2411.02093">pdf</a>, <a href="https://arxiv.org/format/2411.02093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Do Advanced Language Models Eliminate the Need for Prompt Engineering in Software Engineering? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zeyu Sun</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zhihao Gong</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Sixiang Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yizhou Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yifan Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Q">Qingyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+D">Dan Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02093v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02093v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper presents an extensive empirical study that reevaluates various prompt engineering techniques within the context of these advanced LLMs. Focusing on three representative SE tasks, i.e., code generation, code translation, and code summarization, we assess whether prompt engineering techniques still yield improvements with advanced models, the actual effectiveness of reasoning models compared to non-reasoning models, and whether the benefits of using these advanced models justify their increased costs. Our findings reveal that prompt engineering techniques developed for earlier LLMs may provide diminished benefits or even hinder performance when applied to advanced models. In reasoning LLMs, the ability of sophisticated built-in reasoning reduces the impact of complex prompts, sometimes making simple zero-shot prompting more effective. Furthermore, while reasoning models outperform non-reasoning models in tasks requiring complex reasoning, they offer minimal advantages in tasks that do not need reasoning and may incur unnecessary costs. Based on our study, we provide practical guidance for practitioners on selecting appropriate prompt engineering techniques and foundational LLMs, considering factors such as task requirements, operational costs, and environmental impact. Our work contributes to a deeper understanding of effectively harnessing advanced LLMs in SE tasks, informing future research and application development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'none'; document.getElementById('2411.02093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01214">arXiv:2411.01214</a> <span> [<a href="https://arxiv.org/pdf/2411.01214">pdf</a>, <a href="https://arxiv.org/ps/2411.01214">ps</a>, <a href="https://arxiv.org/format/2411.01214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Multivariate Time Series Cleaning under Speed Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aoqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zexue Wu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yifeng Gong</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoren Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01214v1-abstract-short" style="display: inline;"> Errors are common in time series due to unreliable sensor measurements. Existing methods focus on univariate data but do not utilize the correlation between dimensions. Cleaning each dimension separately may lead to a less accurate result, as some errors can only be identified in the multivariate case. We also point out that the widely used minimum change principle is not always the best choice. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01214v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01214v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01214v1-abstract-full" style="display: none;"> Errors are common in time series due to unreliable sensor measurements. Existing methods focus on univariate data but do not utilize the correlation between dimensions. Cleaning each dimension separately may lead to a less accurate result, as some errors can only be identified in the multivariate case. We also point out that the widely used minimum change principle is not always the best choice. Instead, we try to change the smallest number of data to avoid a significant change in the data distribution. In this paper, we propose MTCSC, the constraint-based method for cleaning multivariate time series. We formalize the repair problem, propose a linear-time method to employ online computing, and improve it by exploiting data trends. We also support adaptive speed constraint capturing. We analyze the properties of our proposals and compare them with SOTA methods in terms of effectiveness, efficiency versus error rates, data sizes, and applications such as classification. Experiments on real datasets show that MTCSC can have higher repair accuracy with less time consumption. Interestingly, it can be effective even when there are only weak or no correlations between the dimensions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01214v1-abstract-full').style.display = 'none'; document.getElementById('2411.01214v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 16 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00192">arXiv:2411.00192</a> <span> [<a href="https://arxiv.org/pdf/2411.00192">pdf</a>, <a href="https://arxiv.org/format/2411.00192">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Optical Lens Attack on Monocular Depth Estimation for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Ce Zhou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Qiben Yan</a>, <a href="/search/cs?searchtype=author&query=Kent%2C+D">Daniel Kent</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangjing Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Weikang Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Radha%2C+H">Hayder Radha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00192v1-abstract-short" style="display: inline;"> Monocular Depth Estimation (MDE) is a pivotal component of vision-based Autonomous Driving (AD) systems, enabling vehicles to estimate the depth of surrounding objects using a single camera image. This estimation guides essential driving decisions, such as braking before an obstacle or changing lanes to avoid collisions. In this paper, we explore vulnerabilities of MDE algorithms in AD systems, pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00192v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00192v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00192v1-abstract-full" style="display: none;"> Monocular Depth Estimation (MDE) is a pivotal component of vision-based Autonomous Driving (AD) systems, enabling vehicles to estimate the depth of surrounding objects using a single camera image. This estimation guides essential driving decisions, such as braking before an obstacle or changing lanes to avoid collisions. In this paper, we explore vulnerabilities of MDE algorithms in AD systems, presenting LensAttack, a novel physical attack that strategically places optical lenses on the camera of an autonomous vehicle to manipulate the perceived object depths. LensAttack encompasses two attack formats: concave lens attack and convex lens attack, each utilizing different optical lenses to induce false depth perception. We first develop a mathematical model that outlines the parameters of the attack, followed by simulations and real-world evaluations to assess its efficacy on state-of-the-art MDE models. Additionally, we adopt an attack optimization method to further enhance the attack success rate by optimizing the attack focal length. To better evaluate the implications of LensAttack on AD, we conduct comprehensive end-to-end system simulations using the CARLA platform. The results reveal that LensAttack can significantly disrupt the depth estimation processes in AD systems, posing a serious threat to their reliability and safety. Finally, we discuss some potential defense methods to mitigate the effects of the proposed attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00192v1-abstract-full').style.display = 'none'; document.getElementById('2411.00192v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages. arXiv admin note: substantial text overlap with arXiv:2409.17376</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22980">arXiv:2410.22980</a> <span> [<a href="https://arxiv.org/pdf/2410.22980">pdf</a>, <a href="https://arxiv.org/format/2410.22980">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Efficient End-to-End 6-Dof Grasp Detection Framework for Edge Devices with Hierarchical Heatmaps and Feature Propagation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaiqin Yang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yixiang Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guijin Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22980v2-abstract-short" style="display: inline;"> 6-DoF grasp detection is critically important for the advancement of intelligent embodied systems, as it provides feasible robot poses for object grasping. Various methods have been proposed to detect 6-DoF grasps through the extraction of 3D geometric features from RGBD or point cloud data. However, most of these approaches encounter challenges during real robot deployment due to their significan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22980v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22980v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22980v2-abstract-full" style="display: none;"> 6-DoF grasp detection is critically important for the advancement of intelligent embodied systems, as it provides feasible robot poses for object grasping. Various methods have been proposed to detect 6-DoF grasps through the extraction of 3D geometric features from RGBD or point cloud data. However, most of these approaches encounter challenges during real robot deployment due to their significant computational demands, which can be particularly problematic for mobile robot platforms, especially those reliant on edge computing devices. This paper presents an Efficient End-to-End Grasp Detection Network (E3GNet) for 6-DoF grasp detection utilizing hierarchical heatmap representations. E3GNet effectively identifies high-quality and diverse grasps in cluttered real-world environments. Benefiting from our end-to-end methodology and efficient network design, our approach surpasses previous methods in model inference efficiency and achieves real-time 6-Dof grasp detection on edge devices. Furthermore, real-world experiments validate the effectiveness of our method, achieving a satisfactory 94% object grasping success rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22980v2-abstract-full').style.display = 'none'; document.getElementById('2410.22980v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22089">arXiv:2410.22089</a> <span> [<a href="https://arxiv.org/pdf/2410.22089">pdf</a>, <a href="https://arxiv.org/format/2410.22089">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InLINE: Inner-Layer Information Exchange for Multi-task Learning on Heterogeneous Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xinyue Feng</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+J">Jinquan Hang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuequn Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haotian Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Desheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22089v1-abstract-short" style="display: inline;"> Heterogeneous graph is an important structure for modeling complex relational data in real-world scenarios and usually involves various node prediction tasks within a single graph. Training these tasks separately may neglect beneficial information sharing, hence a preferred way is to learn several tasks in a same model by Multi-Task Learning (MTL). However, MTL introduces the issue of negative tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22089v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22089v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22089v1-abstract-full" style="display: none;"> Heterogeneous graph is an important structure for modeling complex relational data in real-world scenarios and usually involves various node prediction tasks within a single graph. Training these tasks separately may neglect beneficial information sharing, hence a preferred way is to learn several tasks in a same model by Multi-Task Learning (MTL). However, MTL introduces the issue of negative transfer, where the training of different tasks interferes with each other as they may focus on different information from the data, resulting in suboptimal performance. To solve the issue, existing MTL methods use separate backbones for each task, then selectively exchange beneficial features through interactions among the output embeddings from each layer of different backbones, which we refer to as outer-layer exchange. However, the negative transfer in heterogeneous graphs arises not simply from the varying importance of an individual node feature across tasks, but also from the varying importance of inter-relation between two nodes across tasks. These inter-relations are entangled in the output embedding, making it difficult for existing methods to discriminate beneficial information from the embedding. To address this challenge, we propose the Inner-Layer Information Exchange (InLINE) model that facilitate fine-grained information exchanges within each graph layer rather than through output embeddings. Specifically, InLINE consists of (1) Structure Disentangled Experts for layer-wise structure disentanglement, (2) Structure Disentangled Gates for assigning disentangled information to different tasks. Evaluations on two public datasets and a large industry dataset show that our model effectively alleviates the significant performance drop on specific tasks caused by negative transfer, improving Macro F1 by 6.3% on DBLP dataset and AUC by 3.6% on the industry dataset compared to SoA methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22089v1-abstract-full').style.display = 'none'; document.getElementById('2410.22089v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21111">arXiv:2410.21111</a> <span> [<a href="https://arxiv.org/pdf/2410.21111">pdf</a>, <a href="https://arxiv.org/format/2410.21111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> LAMA: Stable Dual-Domain Deep Reconstruction For Sparse-View CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+C">Chi Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingchao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xiaojing Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunmei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21111v1-abstract-short" style="display: inline;"> Inverse problems arise in many applications, especially tomographic imaging. We develop a Learned Alternating Minimization Algorithm (LAMA) to solve such problems via two-block optimization by synergizing data-driven and classical techniques with proven convergence. LAMA is naturally induced by a variational model with learnable regularizers in both data and image domains, parameterized as composi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21111v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21111v1-abstract-full" style="display: none;"> Inverse problems arise in many applications, especially tomographic imaging. We develop a Learned Alternating Minimization Algorithm (LAMA) to solve such problems via two-block optimization by synergizing data-driven and classical techniques with proven convergence. LAMA is naturally induced by a variational model with learnable regularizers in both data and image domains, parameterized as composite functions of neural networks trained with domain-specific data. We allow these regularizers to be nonconvex and nonsmooth to extract features from data effectively. We minimize the overall objective function using Nesterov's smoothing technique and residual learning architecture. It is demonstrated that LAMA reduces network complexity, improves memory efficiency, and enhances reconstruction accuracy, stability, and interpretability. Extensive experiments show that LAMA significantly outperforms state-of-the-art methods on popular benchmark datasets for Computed Tomography. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21111v1-abstract-full').style.display = 'none'; document.getElementById('2410.21111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Journal version for LAMA (Learned Alternating Minimization Algorithm)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20119">arXiv:2410.20119</a> <span> [<a href="https://arxiv.org/pdf/2410.20119">pdf</a>, <a href="https://arxiv.org/format/2410.20119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Multi-Stage Loss Dynamics in Neural Networks: Mechanisms of Plateau and Descent Stages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zheng-An Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+T">Tao Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">GuiHong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20119v2-abstract-short" style="display: inline;"> The multi-stage phenomenon in the training loss curves of neural networks has been widely observed, reflecting the non-linearity and complexity inherent in the training process. In this work, we investigate the training dynamics of neural networks (NNs), with particular emphasis on the small initialization regime, identifying three distinct stages observed in the loss curve during training: the in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20119v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20119v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20119v2-abstract-full" style="display: none;"> The multi-stage phenomenon in the training loss curves of neural networks has been widely observed, reflecting the non-linearity and complexity inherent in the training process. In this work, we investigate the training dynamics of neural networks (NNs), with particular emphasis on the small initialization regime, identifying three distinct stages observed in the loss curve during training: the initial plateau stage, the initial descent stage, and the secondary plateau stage. Through rigorous analysis, we reveal the underlying challenges contributing to slow training during the plateau stages. While the proof and estimate for the emergence of the initial plateau were established in our previous work, the behaviors of the initial descent and secondary plateau stages had not been explored before. Here, we provide a more detailed proof for the initial plateau, followed by a comprehensive analysis of the initial descent stage dynamics. Furthermore, we examine the factors facilitating the network's ability to overcome the prolonged secondary plateau, supported by both experimental evidence and heuristic reasoning. Finally, to clarify the link between global training trends and local parameter adjustments, we use the Wasserstein distance to track the fine-scale evolution of weight amplitude distribution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20119v2-abstract-full').style.display = 'none'; document.getElementById('2410.20119v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18921">arXiv:2410.18921</a> <span> [<a href="https://arxiv.org/pdf/2410.18921">pdf</a>, <a href="https://arxiv.org/format/2410.18921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> From Blind Solvers to Logical Thinkers: Benchmarking LLMs' Logical Integrity on Faulty Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rahman%2C+A+M+M">A M Muntasir Rahman</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junyi Ye</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wenpeng Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guiling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18921v1-abstract-short" style="display: inline;"> Consider the math problem: "Lily received 3 cookies from her best friend yesterday and ate 5 for breakfast. Today, her friend gave her 3 more cookies. How many cookies does Lily have now?" Many large language models (LLMs) in previous research approach this problem by calculating the answer "1" using the equation "3 - 5 + 3." However, from a human perspective, we recognize the inherent flaw in thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18921v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18921v1-abstract-full" style="display: none;"> Consider the math problem: "Lily received 3 cookies from her best friend yesterday and ate 5 for breakfast. Today, her friend gave her 3 more cookies. How many cookies does Lily have now?" Many large language models (LLMs) in previous research approach this problem by calculating the answer "1" using the equation "3 - 5 + 3." However, from a human perspective, we recognize the inherent flaw in this problem: Lily cannot eat 5 cookies if she initially only had 3. This discrepancy prompts a key question: Are current LLMs merely Blind Solver that apply mathematical operations without deeper reasoning, or can they function as Logical Thinker capable of identifying logical inconsistencies? To explore this question, we propose a benchmark dataset, FaultyMath, which includes faulty math problems of rich diversity: i) multiple mathematical categories, e.g., algebra, geometry, number theory, etc., ii) varying levels of difficulty, and iii) different origins of faultiness -- ranging from violations of common sense and ambiguous statements to mathematical contradictions and more. We evaluate a broad spectrum of LLMs, including open-source, closed-source, and math-specialized models, using FaultyMath across three dimensions: (i) How accurately can the models detect faulty math problems without being explicitly prompted to do so? (ii) When provided with hints -- either correct or misleading -- about the validity of the problems, to what extent do LLMs adapt to become reliable Logical Thinker? (iii) How trustworthy are the explanations generated by LLMs when they recognize a math problem as flawed? Through extensive experimentation and detailed analysis, our results demonstrate that existing LLMs largely function as Blind Solver and fall short of the reasoning capabilities required to perform as Logical Thinker. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18921v1-abstract-full').style.display = 'none'; document.getElementById('2410.18921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18336">arXiv:2410.18336</a> <span> [<a href="https://arxiv.org/pdf/2410.18336">pdf</a>, <a href="https://arxiv.org/format/2410.18336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Assessing the Creativity of LLMs in Proposing Novel Solutions to Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junyi Ye</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jingyi Gu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinyun Zhao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wenpeng Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guiling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18336v1-abstract-short" style="display: inline;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18336v1-abstract-full" style="display: none;"> The mathematical capabilities of AI systems are complex and multifaceted. Most existing research has predominantly focused on the correctness of AI-generated solutions to mathematical problems. In this work, we argue that beyond producing correct answers, AI systems should also be capable of, or assist humans in, developing novel solutions to mathematical challenges. This study explores the creative potential of Large Language Models (LLMs) in mathematical reasoning, an aspect that has received limited attention in prior research. We introduce a novel framework and benchmark, CreativeMath, which encompasses problems ranging from middle school curricula to Olympic-level competitions, designed to assess LLMs' ability to propose innovative solutions after some known solutions have been provided. Our experiments demonstrate that, while LLMs perform well on standard mathematical tasks, their capacity for creative problem-solving varies considerably. Notably, the Gemini-1.5-Pro model outperformed other LLMs in generating novel solutions. This research opens a new frontier in evaluating AI creativity, shedding light on both the strengths and limitations of LLMs in fostering mathematical innovation, and setting the stage for future developments in AI-assisted mathematical discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18336v1-abstract-full').style.display = 'none'; document.getElementById('2410.18336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16481">arXiv:2410.16481</a> <span> [<a href="https://arxiv.org/pdf/2410.16481">pdf</a>, <a href="https://arxiv.org/format/2410.16481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Caging in Time: A Framework for Robust Object Manipulation under Uncertainties and Limited Robot Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gaotian Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+K">Kejia Ren</a>, <a href="/search/cs?searchtype=author&query=Morgan%2C+A+S">Andrew S. Morgan</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+K">Kaiyu Hang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16481v3-abstract-short" style="display: inline;"> Real-world object manipulation has been commonly challenged by physical uncertainties and perception limitations. Being an effective strategy, while caging configuration-based manipulation frameworks have successfully provided robust solutions, they are not broadly applicable due to their strict requirements on the availability of multiple robots, widely distributed contacts, or specific geometrie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16481v3-abstract-full').style.display = 'inline'; document.getElementById('2410.16481v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16481v3-abstract-full" style="display: none;"> Real-world object manipulation has been commonly challenged by physical uncertainties and perception limitations. Being an effective strategy, while caging configuration-based manipulation frameworks have successfully provided robust solutions, they are not broadly applicable due to their strict requirements on the availability of multiple robots, widely distributed contacts, or specific geometries of the robots or the objects. To this end, this work proposes a novel concept, termed Caging in Time, to allow caging configurations to be formed even if there is just one robot engaged in a task. This novel concept can be explained by an insight that even if a caging configuration is needed to constrain the motion of an object, only a small portion of the cage is actively manipulating at a time. As such, we can switch the configuration of the robot strategically so that by collapsing its configuration in time, we will see a cage formed and its necessary portion active whenever needed. We instantiate our Caging in Time theory on challenging quasistatic and dynamic manipulation tasks, showing that Caging in Time can be achieved in general state spaces including geometry-based and energy-based spaces. With extensive experiments, we show robust and accurate manipulation, in an open-loop manner, without requiring detailed knowledge of the object geometry or physical properties, nor realtime accurate feedback on the manipulation states. In addition to being an effective and robust open-loop manipulation solution, the proposed theory can be a supplementary strategy to other manipulation systems affected by uncertain or limited robot perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16481v3-abstract-full').style.display = 'none'; document.getElementById('2410.16481v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 25 figures, video available at: www.youtube.com/watch?v=Ag_jTzazuSM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15762">arXiv:2410.15762</a> <span> [<a href="https://arxiv.org/pdf/2410.15762">pdf</a>, <a href="https://arxiv.org/format/2410.15762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Solving Sparse \& High-Dimensional-Output Regression via Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Renyuan Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhehui Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15762v1-abstract-short" style="display: inline;"> Multi-Output Regression (MOR) has been widely used in scientific data analysis for decision-making. Unlike traditional regression models, MOR aims to simultaneously predict multiple real-valued outputs given an input. However, the increasing dimensionality of the outputs poses significant challenges regarding interpretability and computational scalability for modern MOR applications. As a first st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15762v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15762v1-abstract-full" style="display: none;"> Multi-Output Regression (MOR) has been widely used in scientific data analysis for decision-making. Unlike traditional regression models, MOR aims to simultaneously predict multiple real-valued outputs given an input. However, the increasing dimensionality of the outputs poses significant challenges regarding interpretability and computational scalability for modern MOR applications. As a first step to address these challenges, this paper proposes a Sparse \& High-dimensional-Output REgression (SHORE) model by incorporating additional sparsity requirements to resolve the output interpretability, and then designs a computationally efficient two-stage optimization framework capable of solving SHORE with provable accuracy via compression on outputs. Theoretically, we show that the proposed framework is computationally scalable while maintaining the same order of training loss and prediction loss before-and-after compression under arbitrary or relatively weak sample set conditions. Empirically, numerical results further validate the theoretical findings, showcasing the efficiency and accuracy of the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15762v1-abstract-full').style.display = 'none'; document.getElementById('2410.15762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Admitted in Neurips 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15115">arXiv:2410.15115</a> <span> [<a href="https://arxiv.org/pdf/2410.15115">pdf</a>, <a href="https://arxiv.org/format/2410.15115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> On Designing Effective RL Reward at Training Time for LLM Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiaxuan Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shusheng Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenjie Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weilin Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Chuyi He</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+W">Wei Fu</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+Z">Zhiyu Mei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangju Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15115v3-abstract-short" style="display: inline;"> Reward models have been increasingly critical for improving the reasoning capability of LLMs. Existing research has shown that a well-trained reward model can substantially improve model performances at inference time via search. However, the potential of reward models during RL training time still remains largely under-explored. It is currently unclear whether these reward models can provide addi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15115v3-abstract-full').style.display = 'inline'; document.getElementById('2410.15115v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15115v3-abstract-full" style="display: none;"> Reward models have been increasingly critical for improving the reasoning capability of LLMs. Existing research has shown that a well-trained reward model can substantially improve model performances at inference time via search. However, the potential of reward models during RL training time still remains largely under-explored. It is currently unclear whether these reward models can provide additional training signals to enhance the reasoning capabilities of LLMs in RL training that uses sparse success rewards, which verify the correctness of solutions. In this work, we evaluate popular reward models for RL training, including the Outcome-supervised Reward Model (ORM) and the Process-supervised Reward Model (PRM), and train a collection of LLMs for math problems using RL by combining these learned rewards with success rewards. Surprisingly, even though these learned reward models have strong inference-time performances, they may NOT help or even hurt RL training, producing worse performances than LLMs trained with the success reward only. Our analysis reveals that an LLM can receive high rewards from some of these reward models by repeating correct but unnecessary reasoning steps, leading to a severe reward hacking issue. Therefore, we introduce two novel reward refinement techniques, including Clipping and Delta. The key idea is to ensure the accumulative reward of any reasoning trajectory is upper-bounded to keep a learned reward model effective without being exploited. We evaluate our techniques with multiple reward models over a set of 1.5B and 7B LLMs on MATH and GSM8K benchmarks and demonstrate that with a carefully designed reward function, RL training without any additional supervised tuning can improve all the evaluated LLMs, including the state-of-the-art 7B LLM Qwen2.5-Math-7B-Instruct on MATH and GSM8K benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15115v3-abstract-full').style.display = 'none'; document.getElementById('2410.15115v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15074">arXiv:2410.15074</a> <span> [<a href="https://arxiv.org/pdf/2410.15074">pdf</a>, <a href="https://arxiv.org/format/2410.15074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLaVA-Ultra: Large Chinese Language and Vision Assistant for Ultrasound </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xuechen Guo</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shi-Yan Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gaoang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15074v1-abstract-short" style="display: inline;"> Multimodal Large Language Model (MLLM) has recently garnered attention as a prominent research focus. By harnessing powerful LLM, it facilitates a transition of conversational generative AI from unimodal text to performing multimodal tasks. This boom begins to significantly impact medical field. However, general visual language model (VLM) lacks sophisticated comprehension for medical visual quest… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15074v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15074v1-abstract-full" style="display: none;"> Multimodal Large Language Model (MLLM) has recently garnered attention as a prominent research focus. By harnessing powerful LLM, it facilitates a transition of conversational generative AI from unimodal text to performing multimodal tasks. This boom begins to significantly impact medical field. However, general visual language model (VLM) lacks sophisticated comprehension for medical visual question answering (Med-VQA). Even models specifically tailored for medical domain tend to produce vague answers with weak visual relevance. In this paper, we propose a fine-grained adaptive VLM architecture for Chinese medical visual conversations through parameter-efficient tuning. Specifically, we devise a fusion module with fine-grained vision encoders to achieve enhancement for subtle medical visual semantics. Then we note data redundancy common to medical scenes is ignored in most prior works. In cases of a single text paired with multiple figures, we utilize weighted scoring with knowledge distillation to adaptively screen valid images mirroring text descriptions. For execution, we leverage a large-scale multimodal Chinese ultrasound dataset obtained from the hospital. We create instruction-following data based on text from professional doctors, which ensures effective tuning. With enhanced model and quality data, our Large Chinese Language and Vision Assistant for Ultrasound (LLaVA-Ultra) shows strong capability and robustness to medical scenarios. On three Med-VQA datasets, LLaVA-Ultra surpasses previous state-of-the-art models on various metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15074v1-abstract-full').style.display = 'none'; document.getElementById('2410.15074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14874">arXiv:2410.14874</a> <span> [<a href="https://arxiv.org/pdf/2410.14874">pdf</a>, <a href="https://arxiv.org/format/2410.14874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Vision Transformers by Overlapping Heads in Multi-Head Self-Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianxiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bo Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanghui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14874v1-abstract-short" style="display: inline;"> Vision Transformers have made remarkable progress in recent years, achieving state-of-the-art performance in most vision tasks. A key component of this success is due to the introduction of the Multi-Head Self-Attention (MHSA) module, which enables each head to learn different representations by applying the attention mechanism independently. In this paper, we empirically demonstrate that Vision T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14874v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14874v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14874v1-abstract-full" style="display: none;"> Vision Transformers have made remarkable progress in recent years, achieving state-of-the-art performance in most vision tasks. A key component of this success is due to the introduction of the Multi-Head Self-Attention (MHSA) module, which enables each head to learn different representations by applying the attention mechanism independently. In this paper, we empirically demonstrate that Vision Transformers can be further enhanced by overlapping the heads in MHSA. We introduce Multi-Overlapped-Head Self-Attention (MOHSA), where heads are overlapped with their two adjacent heads for queries, keys, and values, while zero-padding is employed for the first and last heads, which have only one neighboring head. Various paradigms for overlapping ratios are proposed to fully investigate the optimal performance of our approach. The proposed approach is evaluated using five Transformer models on four benchmark datasets and yields a significant performance boost. The source code will be made publicly available upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14874v1-abstract-full').style.display = 'none'; document.getElementById('2410.14874v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14390">arXiv:2410.14390</a> <span> [<a href="https://arxiv.org/pdf/2410.14390">pdf</a>, <a href="https://arxiv.org/format/2410.14390">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Personalizing Low-Rank Bayesian Neural Networks Via Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boning Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongzhu Liu</a>, <a href="/search/cs?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanchu Wang</a>, <a href="/search/cs?searchtype=author&query=Pezaros%2C+D">Dimitrios Pezaros</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14390v1-abstract-short" style="display: inline;"> To support real-world decision-making, it is crucial for models to be well-calibrated, i.e., to assign reliable confidence estimates to their predictions. Uncertainty quantification is particularly important in personalized federated learning (PFL), as participating clients typically have small local datasets, making it difficult to unambiguously determine optimal model parameters. Bayesian PFL (B… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14390v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14390v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14390v1-abstract-full" style="display: none;"> To support real-world decision-making, it is crucial for models to be well-calibrated, i.e., to assign reliable confidence estimates to their predictions. Uncertainty quantification is particularly important in personalized federated learning (PFL), as participating clients typically have small local datasets, making it difficult to unambiguously determine optimal model parameters. Bayesian PFL (BPFL) methods can potentially enhance calibration, but they often come with considerable computational and memory requirements due to the need to track the variances of all the individual model parameters. Furthermore, different clients may exhibit heterogeneous uncertainty levels owing to varying local dataset sizes and distributions. To address these challenges, we propose LR-BPFL, a novel BPFL method that learns a global deterministic model along with personalized low-rank Bayesian corrections. To tailor the local model to each client's inherent uncertainty level, LR-BPFL incorporates an adaptive rank selection mechanism. We evaluate LR-BPFL across a variety of datasets, demonstrating its advantages in terms of calibration, accuracy, as well as computational and memory requirements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14390v1-abstract-full').style.display = 'none'; document.getElementById('2410.14390v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13917">arXiv:2410.13917</a> <span> [<a href="https://arxiv.org/pdf/2410.13917">pdf</a>, <a href="https://arxiv.org/format/2410.13917">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GBCT: An Efficient and Adaptive Granular-Ball Clustering Algorithm for Complex Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shuyin Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Bolun Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jiang Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoyin Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13917v1-abstract-short" style="display: inline;"> Traditional clustering algorithms often focus on the most fine-grained information and achieve clustering by calculating the distance between each pair of data points or implementing other calculations based on points. This way is not inconsistent with the cognitive mechanism of "global precedence" in human brain, resulting in those methods' bad performance in efficiency, generalization ability an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13917v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13917v1-abstract-full" style="display: none;"> Traditional clustering algorithms often focus on the most fine-grained information and achieve clustering by calculating the distance between each pair of data points or implementing other calculations based on points. This way is not inconsistent with the cognitive mechanism of "global precedence" in human brain, resulting in those methods' bad performance in efficiency, generalization ability and robustness. To address this problem, we propose a new clustering algorithm called granular-ball clustering (GBCT) via granular-ball computing. Firstly, GBCT generates a smaller number of granular-balls to represent the original data, and forms clusters according to the relationship between granular-balls, instead of the traditional point relationship. At the same time, its coarse-grained characteristics are not susceptible to noise, and the algorithm is efficient and robust; besides, as granular-balls can fit various complex data, GBCT performs much better in non-spherical data sets than other traditional clustering methods. The completely new coarse granularity representation method of GBCT and cluster formation mode can also used to improve other traditional methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13917v1-abstract-full').style.display = 'none'; document.getElementById('2410.13917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13187">arXiv:2410.13187</a> <span> [<a href="https://arxiv.org/pdf/2410.13187">pdf</a>, <a href="https://arxiv.org/format/2410.13187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> aiXcoder-7B: A Lightweight and Effective Large Language Model for Code Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Siyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Zong%2C+H">He Zong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huanyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shukai Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erlu Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jiazheng Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yu Han</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+W">Wei Ning</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gen Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kechi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13187v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. However, larger LLMs will increase the response time of code completion and decrease the developers' productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B ach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13187v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13187v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. However, larger LLMs will increase the response time of code completion and decrease the developers' productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B achieves higher code completion accuracy while having smaller scales (i.e., 7 billion parameters). We attribute the superiority of aiXcoder-7B to three key factors: (1) Multi-objective training. We employ three training objectives, one of which is our proposed Structured Fill-In-the-Middle (SFIM). SFIM considers the syntax structures in code and effectively improves the performance of LLMs for code. (2) Diverse data sampling strategies. They consider inter-file relationships and enhance the capability of LLMs in understanding cross-file contexts. (3) Extensive high-quality data. We establish a rigorous data collection pipeline and consume a total of 1.2 trillion unique tokens for training aiXcoder-7B. This vast volume of data enables aiXcoder-7B to learn a broad distribution of code. We evaluate aiXcoder-7B in five popular code completion benchmarks and a new benchmark collected by this paper. The results show that aiXcoder-7B outperforms the latest six LLMs with similar sizes and even surpasses four larger LLMs (e.g., StarCoder2-15B and CodeLlama-34B), positioning aiXcoder-7B as a lightweight and effective LLM for academia and industry. Finally, we summarize three valuable insights for helping practitioners train the next generations of LLMs for code. aiXcoder-7B has been open-souced and gained significant attention. As of the submission date, aiXcoder-7B has received 2,193 GitHub Stars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'none'; document.getElementById('2410.13187v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">aiXcoder-7B is available at https://github.com/aixcoder-plugin/aiXcoder-7B</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12811">arXiv:2410.12811</a> <span> [<a href="https://arxiv.org/pdf/2410.12811">pdf</a>, <a href="https://arxiv.org/format/2410.12811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Decoding Emotions: Unveiling Facial Expressions through Acoustic Sensing with Contrastive Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangjing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Juexing Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Ce Zhou</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Weikang Ding</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+H">Huacheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianxing Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Qiben Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12811v1-abstract-short" style="display: inline;"> Expression recognition holds great promise for applications such as content recommendation and mental healthcare by accurately detecting users' emotional states. Traditional methods often rely on cameras or wearable sensors, which raise privacy concerns and add extra device burdens. In addition, existing acoustic-based methods struggle to maintain satisfactory performance when there is a distribut… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12811v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12811v1-abstract-full" style="display: none;"> Expression recognition holds great promise for applications such as content recommendation and mental healthcare by accurately detecting users' emotional states. Traditional methods often rely on cameras or wearable sensors, which raise privacy concerns and add extra device burdens. In addition, existing acoustic-based methods struggle to maintain satisfactory performance when there is a distribution shift between the training dataset and the inference dataset. In this paper, we introduce FacER+, an active acoustic facial expression recognition system, which eliminates the requirement for external microphone arrays. FacER+ extracts facial expression features by analyzing the echoes of near-ultrasound signals emitted between the 3D facial contour and the earpiece speaker on a smartphone. This approach not only reduces background noise but also enables the identification of different expressions from various users with minimal training data. We develop a contrastive external attention-based model to consistently learn expression features across different users, reducing the distribution differences. Extensive experiments involving 20 volunteers, both with and without masks, demonstrate that FacER+ can accurately recognize six common facial expressions with over 90% accuracy in diverse, user-independent real-life scenarios, surpassing the performance of the leading acoustic sensing methods by 10%. FacER+ offers a robust and practical solution for facial expression recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12811v1-abstract-full').style.display = 'none'; document.getElementById('2410.12811v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The extended version of the 2023 IEEE INFOCOM conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12262">arXiv:2410.12262</a> <span> [<a href="https://arxiv.org/pdf/2410.12262">pdf</a>, <a href="https://arxiv.org/format/2410.12262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> 3D Gaussian Splatting in Robotics: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Siting Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangming Wang</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+D">Dezhi Kong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hesheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12262v1-abstract-short" style="display: inline;"> Dense 3D representations of the environment have been a long-term goal in the robotics field. While previous Neural Radiance Fields (NeRF) representation have been prevalent for its implicit, coordinate-based model, the recent emergence of 3D Gaussian Splatting (3DGS) has demonstrated remarkable potential in its explicit radiance field representation. By leveraging 3D Gaussian primitives for expli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12262v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12262v1-abstract-full" style="display: none;"> Dense 3D representations of the environment have been a long-term goal in the robotics field. While previous Neural Radiance Fields (NeRF) representation have been prevalent for its implicit, coordinate-based model, the recent emergence of 3D Gaussian Splatting (3DGS) has demonstrated remarkable potential in its explicit radiance field representation. By leveraging 3D Gaussian primitives for explicit scene representation and enabling differentiable rendering, 3DGS has shown significant advantages over other radiance fields in real-time rendering and photo-realistic performance, which is beneficial for robotic applications. In this survey, we provide a comprehensive understanding of 3DGS in the field of robotics. We divide our discussion of the related works into two main categories: the application of 3DGS and the advancements in 3DGS techniques. In the application section, we explore how 3DGS has been utilized in various robotics tasks from scene understanding and interaction perspectives. The advance of 3DGS section focuses on the improvements of 3DGS own properties in its adaptability and efficiency, aiming to enhance its performance in robotics. We then summarize the most commonly used datasets and evaluation metrics in robotics. Finally, we identify the challenges and limitations of current 3DGS methods and discuss the future development of 3DGS in robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12262v1-abstract-full').style.display = 'none'; document.getElementById('2410.12262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12206">arXiv:2410.12206</a> <span> [<a href="https://arxiv.org/pdf/2410.12206">pdf</a>, <a href="https://arxiv.org/format/2410.12206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Abnormality Forecasting: Time Series Anomaly Prediction via Future Context Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sinong Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenrui Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongzuo Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhaoyang Yu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+x">xiaoguang Liu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+G">Guansong Pang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12206v1-abstract-short" style="display: inline;"> Identifying anomalies from time series data plays an important role in various fields such as infrastructure security, intelligent operation and maintenance, and space exploration. Current research focuses on detecting the anomalies after they occur, which can lead to significant financial/reputation loss or infrastructure damage. In this work we instead study a more practical yet very challenging… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12206v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12206v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12206v1-abstract-full" style="display: none;"> Identifying anomalies from time series data plays an important role in various fields such as infrastructure security, intelligent operation and maintenance, and space exploration. Current research focuses on detecting the anomalies after they occur, which can lead to significant financial/reputation loss or infrastructure damage. In this work we instead study a more practical yet very challenging problem, time series anomaly prediction, aiming at providing early warnings for abnormal events before their occurrence. To tackle this problem, we introduce a novel principled approach, namely future context modeling (FCM). Its key insight is that the future abnormal events in a target window can be accurately predicted if their preceding observation window exhibits any subtle difference to normal data. To effectively capture such differences, FCM first leverages long-term forecasting models to generate a discriminative future context based on the observation data, aiming to amplify those subtle but unusual difference. It then models a normality correlation of the observation data with the forecasting future context to complement the normality modeling of the observation data in foreseeing possible abnormality in the target window. A joint variate-time attention learning is also introduced in FCM to leverage both temporal signals and features of the time series data for more discriminative normality modeling in the aforementioned two views. Comprehensive experiments on five datasets demonstrate that FCM gains good recall rate (70\%+) on multiple datasets and significantly outperforms all baselines in F1 score. Code is available at https://github.com/mala-lab/FCM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12206v1-abstract-full').style.display = 'none'; document.getElementById('2410.12206v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures, submitted to KDD conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11761">arXiv:2410.11761</a> <span> [<a href="https://arxiv.org/pdf/2410.11761">pdf</a>, <a href="https://arxiv.org/format/2410.11761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SlideChat: A Large Vision-Language Assistant for Whole-Slide Pathology Image Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanjun Li</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bin Zhang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+N">Nana Pei</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+R">Rongshan Yu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11761v2-abstract-short" style="display: inline;"> Despite the progress made by multimodal large language models (MLLMs) in computational pathology, they remain limited by a predominant focus on patch-level analysis, missing essential contextual information at the whole-slide level. The lack of large-scale instruction datasets and the gigapixel scale of whole slide images (WSIs) pose significant developmental challenges. In this paper, we present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11761v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11761v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11761v2-abstract-full" style="display: none;"> Despite the progress made by multimodal large language models (MLLMs) in computational pathology, they remain limited by a predominant focus on patch-level analysis, missing essential contextual information at the whole-slide level. The lack of large-scale instruction datasets and the gigapixel scale of whole slide images (WSIs) pose significant developmental challenges. In this paper, we present SlideChat, the first vision-language assistant capable of understanding gigapixel whole-slide images, exhibiting excellent multimodal conversational capability and response complex instruction across diverse pathology scenarios. To support its development, we created SlideInstruction, the largest instruction-following dataset for WSIs consisting of 4.2K WSI captions and 176K VQA pairs with multiple categories. Furthermore, we propose SlideBench, a multimodal benchmark that incorporates captioning and VQA tasks to assess SlideChat's capabilities in varied clinical settings such as microscopy, diagnosis. Compared to both general and specialized MLLMs, SlideChat exhibits exceptional capabilities achieving state-of-the-art performance on 18 of 22 tasks. For example, it achieved an overall accuracy of 81.17% on SlideBench-VQA (TCGA), and 54.15% on SlideBench-VQA (BCNB). We will fully release SlideChat, SlideInstruction and SlideBench as open-source resources to facilitate research and development in computational pathology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11761v2-abstract-full').style.display = 'none'; document.getElementById('2410.11761v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11488">arXiv:2410.11488</a> <span> [<a href="https://arxiv.org/pdf/2410.11488">pdf</a>, <a href="https://arxiv.org/format/2410.11488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Advancing Training Efficiency of Deep Spiking Neural Networks through Rate-based Backpropagation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chengting Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gaoang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erping Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Aili Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11488v2-abstract-short" style="display: inline;"> Recent insights have revealed that rate-coding is a primary form of information representation captured by surrogate-gradient-based Backpropagation Through Time (BPTT) in training deep Spiking Neural Networks (SNNs). Motivated by these findings, we propose rate-based backpropagation, a training strategy specifically designed to exploit rate-based representations to reduce the complexity of BPTT. O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11488v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11488v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11488v2-abstract-full" style="display: none;"> Recent insights have revealed that rate-coding is a primary form of information representation captured by surrogate-gradient-based Backpropagation Through Time (BPTT) in training deep Spiking Neural Networks (SNNs). Motivated by these findings, we propose rate-based backpropagation, a training strategy specifically designed to exploit rate-based representations to reduce the complexity of BPTT. Our method minimizes reliance on detailed temporal derivatives by focusing on averaged dynamics, streamlining the computational graph to reduce memory and computational demands of SNNs training. We substantiate the rationality of the gradient approximation between BPTT and the proposed method through both theoretical analysis and empirical observations. Comprehensive experiments on CIFAR-10, CIFAR-100, ImageNet, and CIFAR10-DVS validate that our method achieves comparable performance to BPTT counterparts, and surpasses state-of-the-art efficient training techniques. By leveraging the inherent benefits of rate-coding, this work sets the stage for more scalable and efficient SNNs training within resource-constrained environments. Our code is available at https://github.com/Tab-ct/rate-based-backpropagation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11488v2-abstract-full').style.display = 'none'; document.getElementById('2410.11488v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11323">arXiv:2410.11323</a> <span> [<a href="https://arxiv.org/pdf/2410.11323">pdf</a>, <a href="https://arxiv.org/format/2410.11323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> KA-GNN: Kolmogorov-Arnold Graph Neural Networks for Molecular Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Longlong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanghui Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+K">Kelin Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11323v1-abstract-short" style="display: inline;"> Molecular property prediction is a crucial task in the process of Artificial Intelligence-Driven Drug Discovery (AIDD). The challenge of developing models that surpass traditional non-neural network methods continues to be a vibrant area of research. This paper presents a novel graph neural network model-the Kolmogorov-Arnold Network (KAN)-based Graph Neural Network (KA-GNN), which incorporates Fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11323v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11323v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11323v1-abstract-full" style="display: none;"> Molecular property prediction is a crucial task in the process of Artificial Intelligence-Driven Drug Discovery (AIDD). The challenge of developing models that surpass traditional non-neural network methods continues to be a vibrant area of research. This paper presents a novel graph neural network model-the Kolmogorov-Arnold Network (KAN)-based Graph Neural Network (KA-GNN), which incorporates Fourier series, specifically designed for molecular property prediction. This model maintains the high interpretability characteristic of KAN methods while being extremely efficient in computational resource usage, making it an ideal choice for deployment in resource-constrained environments. Tested and validated on seven public datasets, KA-GNN has shown significant improvements in property predictions over the existing state-of-the-art (SOTA) benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11323v1-abstract-full').style.display = 'none'; document.getElementById('2410.11323v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10320">arXiv:2410.10320</a> <span> [<a href="https://arxiv.org/pdf/2410.10320">pdf</a>, <a href="https://arxiv.org/format/2410.10320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DiRW: Path-Aware Digraph Learning for Heterophily </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+D">Daohan Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xunkai Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenjun Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yinping Liao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rong-Hua Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoren Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10320v1-abstract-short" style="display: inline;"> Recently, graph neural network (GNN) has emerged as a powerful representation learning tool for graph-structured data. However, most approaches are tailored for undirected graphs, neglecting the abundant information embedded in the edges of directed graphs (digraphs). In fact, digraphs are widely applied in the real world (e.g., social networks and recommendations) and are also confirmed to offer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10320v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10320v1-abstract-full" style="display: none;"> Recently, graph neural network (GNN) has emerged as a powerful representation learning tool for graph-structured data. However, most approaches are tailored for undirected graphs, neglecting the abundant information embedded in the edges of directed graphs (digraphs). In fact, digraphs are widely applied in the real world (e.g., social networks and recommendations) and are also confirmed to offer a new perspective for addressing topological heterophily challenges (i.e., connected nodes have complex patterns of feature distribution or labels). Despite recent significant advancements in DiGNNs, existing spatial- and spectral-based methods have inherent limitations due to the complex learning mechanisms and reliance on high-quality topology, leading to low efficiency and unstable performance. To address these issues, we propose Directed Random Walk (DiRW), which can be viewed as a plug-and-play strategy or an innovative neural architecture that provides a guidance or new learning paradigm for most spatial-based methods or digraphs. Specifically, DiRW incorporates a direction-aware path sampler optimized from the perspectives of walk probability, length, and number in a weight-free manner by considering node profiles and topological structure. Building upon this, DiRW utilizes a node-wise learnable path aggregator for generalized messages obtained by our proposed adaptive walkers to represent the current node. Extensive experiments on 9 datasets demonstrate that DiRW: (1) enhances most spatial-based methods as a plug-and-play strategy; (2) achieves SOTA performance as a new digraph learning paradigm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10320v1-abstract-full').style.display = 'none'; document.getElementById('2410.10320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09885">arXiv:2410.09885</a> <span> [<a href="https://arxiv.org/pdf/2410.09885">pdf</a>, <a href="https://arxiv.org/format/2410.09885">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Occluded Human Pose Estimation based on Limb Joint Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+G">Gangtao Han</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Chunxiao Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+E">Enqing Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanghui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09885v1-abstract-short" style="display: inline;"> Human pose estimation aims at locating the specific joints of humans from the images or videos. While existing deep learning-based methods have achieved high positioning accuracy, they often struggle with generalization in occlusion scenarios. In this paper, we propose an occluded human pose estimation framework based on limb joint augmentation to enhance the generalization ability of the pose est… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09885v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09885v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09885v1-abstract-full" style="display: none;"> Human pose estimation aims at locating the specific joints of humans from the images or videos. While existing deep learning-based methods have achieved high positioning accuracy, they often struggle with generalization in occlusion scenarios. In this paper, we propose an occluded human pose estimation framework based on limb joint augmentation to enhance the generalization ability of the pose estimation model on the occluded human bodies. Specifically, the occlusion blocks are at first employed to randomly cover the limb joints of the human bodies from the training images, imitating the scene where the objects or other people partially occlude the human body. Trained by the augmented samples, the pose estimation model is encouraged to accurately locate the occluded keypoints based on the visible ones. To further enhance the localization ability of the model, this paper constructs a dynamic structure loss function based on limb graphs to explore the distribution of occluded joints by evaluating the dependence between adjacent joints. Extensive experimental evaluations on two occluded datasets, OCHuman and CrowdPose, demonstrate significant performance improvements without additional computation cost during inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09885v1-abstract-full').style.display = 'none'; document.getElementById('2410.09885v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by NCAA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09691">arXiv:2410.09691</a> <span> [<a href="https://arxiv.org/pdf/2410.09691">pdf</a>, <a href="https://arxiv.org/format/2410.09691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robust 3D Point Clouds Classification based on Declarative Defenders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaidong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianxiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+C">Cuncong Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziming Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanghui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09691v2-abstract-short" style="display: inline;"> 3D point cloud classification requires distinct models from 2D image classification due to the divergent characteristics of the respective input data. While 3D point clouds are unstructured and sparse, 2D images are structured and dense. Bridging the domain gap between these two data types is a non-trivial challenge to enable model interchangeability. Recent research using Lattice Point Classifier… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09691v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09691v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09691v2-abstract-full" style="display: none;"> 3D point cloud classification requires distinct models from 2D image classification due to the divergent characteristics of the respective input data. While 3D point clouds are unstructured and sparse, 2D images are structured and dense. Bridging the domain gap between these two data types is a non-trivial challenge to enable model interchangeability. Recent research using Lattice Point Classifier (LPC) highlights the feasibility of cross-domain applicability. However, the lattice projection operation in LPC generates 2D images with disconnected projected pixels. In this paper, we explore three distinct algorithms for mapping 3D point clouds into 2D images. Through extensive experiments, we thoroughly examine and analyze their performance and defense mechanisms. Leveraging current large foundation models, we scrutinize the feature disparities between regular 2D images and projected 2D images. The proposed approaches demonstrate superior accuracy and robustness against adversarial attacks. The generative model-based mapping algorithms yield regular 2D images, further minimizing the domain gap from regular 2D classification tasks. The source code is available at https://github.com/KaidongLi/pytorch-LatticePointClassifier.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09691v2-abstract-full').style.display = 'none'; document.getElementById('2410.09691v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08567">arXiv:2410.08567</a> <span> [<a href="https://arxiv.org/pdf/2410.08567">pdf</a>, <a href="https://arxiv.org/format/2410.08567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2024.3434740">10.1109/TCSVT.2024.3434740 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Diffusion-Based Depth Inpainting for Transparent and Reflective Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tianyu Sun</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+D">Dingchang Hu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yixiang Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guijin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08567v1-abstract-short" style="display: inline;"> Transparent and reflective objects, which are common in our everyday lives, present a significant challenge to 3D imaging techniques due to their unique visual and optical properties. Faced with these types of objects, RGB-D cameras fail to capture the real depth value with their accurate spatial information. To address this issue, we propose DITR, a diffusion-based Depth Inpainting framework spec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08567v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08567v1-abstract-full" style="display: none;"> Transparent and reflective objects, which are common in our everyday lives, present a significant challenge to 3D imaging techniques due to their unique visual and optical properties. Faced with these types of objects, RGB-D cameras fail to capture the real depth value with their accurate spatial information. To address this issue, we propose DITR, a diffusion-based Depth Inpainting framework specifically designed for Transparent and Reflective objects. This network consists of two stages, including a Region Proposal stage and a Depth Inpainting stage. DITR dynamically analyzes the optical and geometric depth loss and inpaints them automatically. Furthermore, comprehensive experimental results demonstrate that DITR is highly effective in depth inpainting tasks of transparent and reflective objects with robust adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08567v1-abstract-full').style.display = 'none'; document.getElementById('2410.08567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>