Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 131 results for author: <span class="mathjax">Shi, F</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shi%2C+F">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Shi, F"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shi%2C+F&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shi, F"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+F&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17862">arXiv:2503.17862</a> <span> [<a href="https://arxiv.org/pdf/2503.17862">pdf</a>, <a href="https://arxiv.org/format/2503.17862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Causal Adjustment Module for Debiasing Scene Graph Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shuzhou Sun</a>, <a href="/search/cs?searchtype=author&query=Zhi%2C+S">Shuaifeng Zhi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Heikkil%C3%A4%2C+J">Janne Heikkil盲</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongxiang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17862v1-abstract-short" style="display: inline;"> While recent debiasing methods for Scene Graph Generation (SGG) have shown impressive performance, these efforts often attribute model bias solely to the long-tail distribution of relationships, overlooking the more profound causes stemming from skewed object and object pair distributions. 
In this paper, we employ causal inference techniques to model the causality among these observed skewed distr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17862v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17862v1-abstract-full" style="display: none;"> While recent debiasing methods for Scene Graph Generation (SGG) have shown impressive performance, these efforts often attribute model bias solely to the long-tail distribution of relationships, overlooking the more profound causes stemming from skewed object and object pair distributions. In this paper, we employ causal inference techniques to model the causality among these observed skewed distributions. Our insight lies in the ability of causal inference to capture the unobservable causal effects between complex distributions, which is crucial for tracing the roots of model bias. Specifically, we introduce the Mediator-based Causal Chain Model (MCCM), which, in addition to modeling causality among objects, object pairs, and relationships, incorporates mediator variables, i.e., cooccurrence distribution, for complementing the causality. Following this, we propose the Causal Adjustment Module (CAModule) to estimate the modeled causal structure, using variables from MCCM as inputs to produce a set of adjustment factors aimed at correcting biased model predictions. Moreover, our method enables the composition of zero-shot relationships, thereby enhancing the model's ability to recognize such relationships. Experiments conducted across various SGG backbones and popular benchmarks demonstrate that CAModule achieves state-of-the-art mean recall rates, with significant improvements also observed on the challenging zero-shot recall rate metric. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17862v1-abstract-full').style.display = 'none'; document.getElementById('2503.17862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 tables, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15078">arXiv:2503.15078</a> <span> [<a href="https://arxiv.org/pdf/2503.15078">pdf</a>, <a href="https://arxiv.org/ps/2503.15078">ps</a>, <a href="https://arxiv.org/format/2503.15078">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Fast But Accurate: A Real-Time Hyperelastic Simulator with Robust Frictional Contact </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Ziqiu Zeng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Siyuan Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhongkai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15078v1-abstract-short" style="display: inline;"> We present a GPU-friendly framework for real-time implicit simulation of elastic material in the presence of frictional contacts. The integration of hyperelasticity, non-interpenetration contact, and friction in real-time simulations presents formidable nonlinear and non-smooth problems, which are highly challenging to solve. By incorporating nonlinear complementarity conditions within the local-g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15078v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15078v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15078v1-abstract-full" style="display: none;"> We present a GPU-friendly framework for real-time implicit simulation of elastic material in the presence of frictional contacts. The integration of hyperelasticity, non-interpenetration contact, and friction in real-time simulations presents formidable nonlinear and non-smooth problems, which are highly challenging to solve. By incorporating nonlinear complementarity conditions within the local-global framework, we achieve rapid convergence in addressing these challenges. While the structure of local-global methods is not fully GPU-friendly, our proposal of a simple yet efficient solver with sparse presentation of the system inverse enables highly parallel computing while maintaining a fast convergence rate. Moreover, our novel splitting strategy for non-smooth indicators not only amplifies overall performance but also refines the complementarity preconditioner, enhancing the accuracy of frictional behavior modeling. Through extensive experimentation, the robustness of our framework in managing real-time contact scenarios, ranging from large-scale systems and extreme deformations to non-smooth contacts and precise friction interactions, has been validated. Compatible with a wide range of hyperelastic models, our approach maintains efficiency across both low and high stiffness materials. 
Despite its remarkable efficiency, robustness, and generality, our method is elegantly simple, with its core contributions grounded solely on standard matrix operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15078v1-abstract-full').style.display = 'none'; document.getElementById('2503.15078v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08325">arXiv:2503.08325</a> <span> [<a href="https://arxiv.org/pdf/2503.08325">pdf</a>, <a href="https://arxiv.org/format/2503.08325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prototype-based Heterogeneous Federated Learning for Blade Icing Detection in Wind Turbines with Class Imbalanced Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+L">Lele Qi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengna Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xu Cheng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiufeng Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08325v1-abstract-short" style="display: inline;"> Wind farms, typically in high-latitude regions, face a high risk of blade icing. Traditional centralized training methods raise serious privacy concerns. To enhance data privacy in detecting wind turbine blade icing, traditional federated learning (FL) is employed. However, data heterogeneity, resulting from collections across wind farms in varying environmental conditions, impacts the model's opt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08325v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08325v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08325v1-abstract-full" style="display: none;"> Wind farms, typically in high-latitude regions, face a high risk of blade icing. Traditional centralized training methods raise serious privacy concerns. To enhance data privacy in detecting wind turbine blade icing, traditional federated learning (FL) is employed. However, data heterogeneity, resulting from collections across wind farms in varying environmental conditions, impacts the model's optimization capabilities. Moreover, imbalances in wind turbine data lead to models that tend to favor recognizing majority classes, thus neglecting critical icing anomalies. To tackle these challenges, we propose a federated prototype learning model for class-imbalanced data in heterogeneous environments to detect wind turbine blade icing. 
We also propose a contrastive supervised loss function to address the class imbalance problem. Experiments on real data from 20 turbines across two wind farms show our method outperforms five FL models and five class imbalance methods, with an average improvement of 19.64\% in \( mF_尾 \) and 5.73\% in \( m \)BA compared to the second-best method, BiFL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08325v1-abstract-full').style.display = 'none'; document.getElementById('2503.08325v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01113">arXiv:2503.01113</a> <span> [<a href="https://arxiv.org/pdf/2503.01113">pdf</a>, <a href="https://arxiv.org/format/2503.01113">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SCSegamba: Lightweight Structure-Aware Vision Mamba for Crack Segmentation in Structures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Chen Jia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xu Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01113v3-abstract-short" style="display: inline;"> Pixel-level segmentation of structural cracks across various scenarios remains a considerable challenge. Current methods encounter challenges in effectively modeling crack morphology and texture, facing challenges in balancing segmentation quality with low computational resource usage. To overcome these limitations, we propose a lightweight Structure-Aware Vision Mamba Network (SCSegamba), capable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01113v3-abstract-full').style.display = 'inline'; document.getElementById('2503.01113v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01113v3-abstract-full" style="display: none;"> Pixel-level segmentation of structural cracks across various scenarios remains a considerable challenge. Current methods encounter challenges in effectively modeling crack morphology and texture, facing challenges in balancing segmentation quality with low computational resource usage. To overcome these limitations, we propose a lightweight Structure-Aware Vision Mamba Network (SCSegamba), capable of generating high-quality pixel-level segmentation maps by leveraging both the morphological information and texture cues of crack pixels with minimal computational cost. 
Specifically, we developed a Structure-Aware Visual State Space module (SAVSS), which incorporates a lightweight Gated Bottleneck Convolution (GBC) and a Structure-Aware Scanning Strategy (SASS). The key insight of GBC lies in its effectiveness in modeling the morphological information of cracks, while the SASS enhances the perception of crack topology and texture by strengthening the continuity of semantic information between crack pixels. Experiments on crack benchmark datasets demonstrate that our method outperforms other state-of-the-art (SOTA) methods, achieving the highest performance with only 2.8M parameters. On the multi-scenario dataset, our method reached 0.8390 in F1 score and 0.8479 in mIoU. The code is available at https://github.com/Karl1109/SCSegamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01113v3-abstract-full').style.display = 'none'; document.getElementById('2503.01113v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00762">arXiv:2503.00762</a> <span> [<a href="https://arxiv.org/pdf/2503.00762">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MR-EIT: Multi-Resolution Reconstruction for Electrical Impedance Tomography via Data-Driven and Unsupervised Dual-Mode Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fangming Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinzhen Liu</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangqian Meng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yapeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00762v1-abstract-short" style="display: inline;"> This paper presents a multi-resolution reconstruction method for Electrical Impedance Tomography (EIT), referred to as MR-EIT, which is capable of operating in both supervised and unsupervised learning modes. MR-EIT integrates an ordered feature extraction module and an unordered coordinate feature expression module. 
The former achieves the mapping from voltage to two-dimensional conductivity feat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00762v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00762v1-abstract-full" style="display: none;"> This paper presents a multi-resolution reconstruction method for Electrical Impedance Tomography (EIT), referred to as MR-EIT, which is capable of operating in both supervised and unsupervised learning modes. MR-EIT integrates an ordered feature extraction module and an unordered coordinate feature expression module. The former achieves the mapping from voltage to two-dimensional conductivity features through pre-training, while the latter realizes multi-resolution reconstruction independent of the order and size of the input sequence by utilizing symmetric functions and local feature extraction mechanisms. In the data-driven mode, MR-EIT reconstructs high-resolution images from low-resolution data of finite element meshes through two stages of pre-training and joint training, and demonstrates excellent performance in simulation experiments. In the unsupervised learning mode, MR-EIT does not require pre-training data and performs iterative optimization solely based on measured voltages to rapidly achieve image reconstruction from low to high resolution. It shows robustness to noise and efficient super-resolution reconstruction capabilities in both simulation and real water tank experiments. Experimental results indicate that MR-EIT outperforms the comparison methods in terms of Structural Similarity (SSIM) and Relative Image Error (RIE), especially in the unsupervised learning mode, where it can significantly reduce the number of iterations and improve image reconstruction quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00762v1-abstract-full').style.display = 'none'; document.getElementById('2503.00762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11364">arXiv:2502.11364</a> <span> [<a href="https://arxiv.org/pdf/2502.11364">pdf</a>, <a href="https://arxiv.org/format/2502.11364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Blessing of Multilinguality: A Systematic Analysis of Multilingual In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tu%2C+Y">Yilei Tu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+A">Andrew Xue</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11364v2-abstract-short" style="display: inline;"> While multilingual large language models generally perform adequately, and sometimes even rival English performance on high-resource languages (HRLs), they often significantly underperform on low-resource languages (LRLs). Among several prompting strategies aiming at bridging the gap, multilingual in-context learning (ICL) has been particularly effective when demonstration in target languages is u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11364v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11364v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11364v2-abstract-full" style="display: none;"> While multilingual large language models generally perform adequately, and sometimes even rival English performance on high-resource languages (HRLs), they often significantly underperform on low-resource languages (LRLs). Among several prompting strategies aiming at bridging the gap, multilingual in-context learning (ICL) has been particularly effective when demonstration in target languages is unavailable. However, there lacks a systematic understanding of when and why it works well. In this work, we systematically analyze multilingual ICL, using demonstrations in HRLs to enhance cross-lingual transfer. We show that demonstrations in mixed HRLs consistently outperform English-only ones across the board, particularly for tasks written in LRLs. Surprisingly, our ablation study shows that the presence of irrelevant non-English sentences in the prompt yields measurable gains, suggesting the effectiveness of multilingual exposure itself. Our results highlight the potential of strategically leveraging multilingual resources to bridge the performance gap for underrepresented languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11364v2-abstract-full').style.display = 'none'; document.getElementById('2502.11364v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
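   Note: as a rough illustration of the mixed high-resource-language in-context learning setup described in this abstract, a prompt can be assembled from demonstrations drawn from several HRLs followed by the target-language query. The template, demonstrations, and function name below are hypothetical and are not taken from the paper.

      def build_multilingual_icl_prompt(demonstrations, query):
          """demonstrations: list of (input_text, label) pairs drawn from several
          high-resource languages; query: the target-language input to be labeled."""
          parts = [f"Input: {text}\nLabel: {label}\n" for text, label in demonstrations]
          parts.append(f"Input: {query}\nLabel:")
          return "\n".join(parts)

      # Example: German and Spanish demonstrations preceding a query in another language.
      prompt = build_multilingual_icl_prompt(
          [("Das Essen war ausgezeichnet.", "positive"),
           ("La película fue aburrida.", "negative")],
          "Le service était très lent.")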
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10983">arXiv:2502.10983</a> <span> [<a href="https://arxiv.org/pdf/2502.10983">pdf</a>, <a href="https://arxiv.org/format/2502.10983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning Quiet Walking for a Small Home Robot </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Watanabe%2C+R">Ryo Watanabe</a>, <a href="/search/cs?searchtype=author&query=Miki%2C+T">Takahiro Miki</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Kadokawa%2C+Y">Yuki Kadokawa</a>, <a href="/search/cs?searchtype=author&query=Bjelonic%2C+F">Filip Bjelonic</a>, <a href="/search/cs?searchtype=author&query=Kawaharazuka%2C+K">Kento Kawaharazuka</a>, <a href="/search/cs?searchtype=author&query=Cramariuc%2C+A">Andrei Cramariuc</a>, <a href="/search/cs?searchtype=author&query=Hutter%2C+M">Marco Hutter</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10983v2-abstract-short" style="display: inline;"> As home robotics gains traction, robots are increasingly integrated into households, offering companionship and assistance. Quadruped robots, particularly those resembling dogs, have emerged as popular alternatives for traditional pets. However, user feedback highlights concerns about the noise these robots generate during walking at home, particularly the loud footstep sound. To address this issu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10983v2-abstract-full').style.display = 'inline'; document.getElementById('2502.10983v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10983v2-abstract-full" style="display: none;"> As home robotics gains traction, robots are increasingly integrated into households, offering companionship and assistance. Quadruped robots, particularly those resembling dogs, have emerged as popular alternatives for traditional pets. However, user feedback highlights concerns about the noise these robots generate during walking at home, particularly the loud footstep sound. To address this issue, we propose a sim-to-real based reinforcement learning (RL) approach to minimize the foot contact velocity highly related to the footstep sound. Our framework incorporates three key elements: learning varying PD gains to actively dampen and stiffen each joint, utilizing foot contact sensors, and employing curriculum learning to gradually enforce penalties on foot contact velocity. Experiments demonstrate that our learned policy achieves superior quietness compared to a RL baseline and the carefully handcrafted Sony commercial controllers. Furthermore, the trade-off between robustness and quietness is shown. This research contributes to developing quieter and more user-friendly robotic companions in home environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10983v2-abstract-full').style.display = 'none'; document.getElementById('2502.10983v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09589">arXiv:2502.09589</a> <span> [<a href="https://arxiv.org/pdf/2502.09589">pdf</a>, <a href="https://arxiv.org/format/2502.09589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> Logical forms complement probability in understanding language model (and human) performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09589v2-abstract-short" style="display: inline;"> With the increasing interest in using large language models (LLMs) for planning in natural language, understanding their behaviors becomes an important research question. This work conducts a systematic investigation of LLMs' ability to perform logical reasoning in natural language. We introduce a controlled dataset of hypothetical and disjunctive syllogisms in propositional and modal logic and us… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09589v2-abstract-full').style.display = 'inline'; document.getElementById('2502.09589v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09589v2-abstract-full" style="display: none;"> With the increasing interest in using large language models (LLMs) for planning in natural language, understanding their behaviors becomes an important research question. This work conducts a systematic investigation of LLMs' ability to perform logical reasoning in natural language. We introduce a controlled dataset of hypothetical and disjunctive syllogisms in propositional and modal logic and use it as the testbed for understanding LLM performance. Our results lead to novel insights in predicting LLM behaviors: in addition to the probability of input (Gonen et al., 2023; McCoy et al., 2024), logical forms should be considered as important factors. In addition, we show similarities and discrepancies between the logical reasoning performances of humans and LLMs by collecting and comparing behavioral data from both. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09589v2-abstract-full').style.display = 'none'; document.getElementById('2502.09589v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16452">arXiv:2412.16452</a> <span> [<a href="https://arxiv.org/pdf/2412.16452">pdf</a>, <a href="https://arxiv.org/format/2412.16452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Econometrics">econ.EM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> Sharp Results for Hypothesis Testing with Risk-Sensitive Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F+C">Flora C. Shi</a>, <a href="/search/cs?searchtype=author&query=Bates%2C+S">Stephen Bates</a>, <a href="/search/cs?searchtype=author&query=Wainwright%2C+M+J">Martin J. Wainwright</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16452v1-abstract-short" style="display: inline;"> Statistical protocols are often used for decision-making involving multiple parties, each with their own incentives, private information, and ability to influence the distributional properties of the data. We study a game-theoretic version of hypothesis testing in which a statistician, also known as a principal, interacts with strategic agents that can generate data. The statistician seeks to desi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16452v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16452v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16452v1-abstract-full" style="display: none;"> Statistical protocols are often used for decision-making involving multiple parties, each with their own incentives, private information, and ability to influence the distributional properties of the data. We study a game-theoretic version of hypothesis testing in which a statistician, also known as a principal, interacts with strategic agents that can generate data. 
The statistician seeks to design a testing protocol with controlled error, while the data-generating agents, guided by their utility and prior information, choose whether or not to opt in based on expected utility maximization. This strategic behavior affects the data observed by the statistician and, consequently, the associated testing error. We analyze this problem for general concave and monotonic utility functions and prove an upper bound on the Bayes false discovery rate (FDR). Underlying this bound is a form of prior elicitation: we show how an agent's choice to opt in implies a certain upper bound on their prior null probability. Our FDR bound is unimprovable in a strong sense, achieving equality at a single point for an individual agent and at any countable number of points for a population of agents. We also demonstrate that our testing protocols exhibit a desirable maximin property when the principal's utility is considered. To illustrate the qualitative predictions of our theory, we examine the effects of risk aversion, reward stochasticity, and signal-to-noise ratio, as well as the implications for the Food and Drug Administration's testing protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16452v1-abstract-full').style.display = 'none'; document.getElementById('2412.16452v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02692">arXiv:2412.02692</a> <span> [<a href="https://arxiv.org/pdf/2412.02692">pdf</a>, <a href="https://arxiv.org/format/2412.02692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scalable Image Tokenization with Index Backpropagation Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fengyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhuoyan Luo</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yixiao Ge</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02692v2-abstract-short" style="display: inline;"> Existing vector quantization (VQ) methods struggle with scalability, largely attributed to the instability of the codebook that undergoes partial updates during training. The codebook is prone to collapse as utilization decreases, due to the progressively widening distribution gap between non-activated codes and visual features. 
To solve the problem, we propose Index Backpropagation Quantization (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02692v2-abstract-full').style.display = 'inline'; document.getElementById('2412.02692v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02692v2-abstract-full" style="display: none;"> Existing vector quantization (VQ) methods struggle with scalability, largely attributed to the instability of the codebook that undergoes partial updates during training. The codebook is prone to collapse as utilization decreases, due to the progressively widening distribution gap between non-activated codes and visual features. To solve the problem, we propose Index Backpropagation Quantization (IBQ), a new VQ method for the joint optimization of all codebook embeddings and the visual encoder. Applying a straight-through estimator on the one-hot categorical distribution between the encoded feature and codebook, all codes are differentiable and maintain a consistent latent space with the visual encoder. IBQ enables scalable training of visual tokenizers and, for the first time, achieves a large-scale codebook ($2^{18}$) with high dimension ($256$) and high utilization. Experiments on the standard ImageNet benchmark demonstrate the scalability and superiority of IBQ, achieving competitive results on reconstruction and the application of autoregressive visual generation. The code and models are available at https://github.com/TencentARC/SEED-Voken. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02692v2-abstract-full').style.display = 'none'; document.getElementById('2412.02692v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
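    Note: the straight-through estimator over the one-hot categorical distribution mentioned in this abstract can be sketched as follows. This is a minimal PyTorch-style reading, not the authors' implementation (see the linked repository for that), and the function name is hypothetical.

      import torch
      import torch.nn.functional as F

      def ibq_quantize(z, codebook):
          """z: (batch, dim) encoder features; codebook: (num_codes, dim) embeddings."""
          logits = z @ codebook.t()                     # similarity to every code
          soft = logits.softmax(dim=-1)                 # categorical distribution over codes
          indices = logits.argmax(dim=-1)
          hard = F.one_hot(indices, codebook.size(0)).float()
          # Straight-through: the forward pass uses the one-hot selection, the backward
          # pass flows through the soft distribution, so all codebook rows receive gradient.
          weights = soft + (hard - soft).detach()
          z_q = weights @ codebook                      # quantized feature
          return z_q, indices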
11. arXiv:2411.08724 (https://arxiv.org/abs/2411.08724) [cs.CL, cs.AI]
    Title: QCG-Rerank: Chunks Graph Rerank with Query Expansion in Retrieval-Augmented LLMs for Tourism Domain
    Authors: Qikai Wei, Mingzhi Yang, Chunlong Han, Jingfu Wei, Minghao Zhang, Feifei Shi, Huansheng Ning
    Abstract: Retrieval-Augmented Generation (RAG) mitigates the issue of hallucination in Large Language Models (LLMs) by integrating information retrieval techniques. However, in the tourism domain, since the query is usually brief and the content in the database is diverse, existing RAG may retain a significant amount of irrelevant or contradictory information after retrieval. To address this challenge, we propose the QCG-Rerank model. This model first performs an initial retrieval to obtain candidate chunks and then enhances semantics by extracting critical information to expand the original query. Next, we utilize the expanded query and candidate chunks to calculate similarity scores as the initial transition probability and construct the chunks graph. Subsequently, we iteratively compute the transition probabilities based on an initial estimate until convergence. The chunks with the highest score are selected and input into the LLMs to generate responses. We evaluate the model on the Cultour, IIRC, StrategyQA, HotpotQA, SQuAD, and MuSiQue datasets. The experimental results demonstrate the effectiveness and superiority of the QCG-Rerank method.
    Submitted 4 November, 2024; originally announced November 2024.
Despite showing some alignment with English conventions in resolving ambiguities, our experiments reveal significant shortcomings of VLMs: notably, the models (1) exhibit poor robustness and consistency, (2) lack the flexibility to accommodate multiple FoRs, and (3) fail to adhere to language-specific or culture-specific conventions in cross-lingual tests, as English tends to dominate other languages. With a growing effort to align vision-language models with human cognitive intuitions, we call for more attention to the ambiguous nature and cross-cultural diversity of spatial reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17385v1-abstract-full').style.display = 'none'; document.getElementById('2410.17385v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Pluralistic Alignment @ NeurIPS 2024 | Project page: https://spatial-comfort.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03076">arXiv:2410.03076</a> <span> [<a href="https://arxiv.org/pdf/2410.03076">pdf</a>, <a href="https://arxiv.org/format/2410.03076">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Residual Policy Learning for Perceptive Quadruped Control Using Differentiable Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J+Y">Jing Yuan Luo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yunlong Song</a>, <a href="/search/cs?searchtype=author&query=Klemm%2C+V">Victor Klemm</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Scaramuzza%2C+D">Davide Scaramuzza</a>, <a href="/search/cs?searchtype=author&query=Hutter%2C+M">Marco Hutter</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03076v1-abstract-short" style="display: inline;"> First-order Policy Gradient (FoPG) algorithms such as Backpropagation through Time and Analytical Policy Gradients leverage local simulation physics to accelerate policy search, significantly improving sample efficiency in robot control compared to standard model-free reinforcement learning. However, FoPG algorithms can exhibit poor learning dynamics in contact-rich tasks like locomotion. 
Previous… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03076v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03076v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03076v1-abstract-full" style="display: none;"> First-order Policy Gradient (FoPG) algorithms such as Backpropagation through Time and Analytical Policy Gradients leverage local simulation physics to accelerate policy search, significantly improving sample efficiency in robot control compared to standard model-free reinforcement learning. However, FoPG algorithms can exhibit poor learning dynamics in contact-rich tasks like locomotion. Previous approaches address this issue by alleviating contact dynamics via algorithmic or simulation innovations. In contrast, we propose guiding the policy search by learning a residual over a simple baseline policy. For quadruped locomotion, we find that the role of residual policy learning in FoPG-based training (FoPG RPL) is primarily to improve asymptotic rewards, compared to improving sample efficiency for model-free RL. Additionally, we provide insights on applying FoPG's to pixel-based local navigation, training a point-mass robot to convergence within seconds. Finally, we showcase the versatility of FoPG RPL by using it to train locomotion and perceptive navigation end-to-end on a quadruped in minutes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03076v1-abstract-full').style.display = 'none'; document.getElementById('2410.03076v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
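<p class="is-size-7">The residual-over-baseline idea in the abstract above can be pictured with a small sketch; the baseline controller, network sizes, and the training comment are placeholders and assumptions, not the paper's actual setup.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class ResidualPolicy(nn.Module):
    """Final action = fixed baseline action + learned correction (illustrative only)."""
    def __init__(self, obs_dim, act_dim, baseline_fn, hidden=128):
        super().__init__()
        self.baseline_fn = baseline_fn          # simple, non-learned controller (assumed given)
        self.residual = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, act_dim),
        )

    def forward(self, obs):
        with torch.no_grad():
            base = self.baseline_fn(obs)        # the baseline stays fixed
        return base + self.residual(obs)        # only the residual is optimized

# With a differentiable simulator, first-order gradients of the rollout return can
# flow directly into the residual parameters, e.g. (hypothetical step/reward API):
#   action = policy(obs); obs, reward = step(obs, action); (-reward).backward()
</code></pre>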
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09601">arXiv:2409.09601</a> <span> [<a href="https://arxiv.org/pdf/2409.09601">pdf</a>, <a href="https://arxiv.org/format/2409.09601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Foundation Models for Music Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjun Li</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Ying Cai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Ziyang Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifan Chen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+R">Rundong Qi</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Mengqi Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peigen Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiao Dong</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fenghao Shi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lei Guo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+B">Bao Ge</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+L">Lin Gan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tuo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09601v1-abstract-short" style="display: inline;"> Music is essential in daily life, fulfilling emotional and entertainment needs, and connecting us personally, socially, and culturally. A better understanding of music can enhance our emotions, cognitive skills, and cultural connections. The rapid advancement of artificial intelligence (AI) has introduced new ways to analyze music, aiming to replicate human understanding of music and provide relat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09601v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09601v1-abstract-full" style="display: none;"> Music is essential in daily life, fulfilling emotional and entertainment needs, and connecting us personally, socially, and culturally. A better understanding of music can enhance our emotions, cognitive skills, and cultural connections. The rapid advancement of artificial intelligence (AI) has introduced new ways to analyze music, aiming to replicate human understanding of music and provide related services. 
While the traditional models focused on audio features and simple tasks, the recent development of large language models (LLMs) and foundation models (FMs), which excel in various fields by integrating semantic information and demonstrating strong reasoning abilities, could capture complex musical features and patterns, integrate music with language and incorporate rich musical, emotional and psychological knowledge. Therefore, they have the potential in handling complex music understanding tasks from a semantic perspective, producing outputs closer to human perception. This work, to our best knowledge, is one of the early reviews of the intersection of AI techniques and music understanding. We investigated, analyzed, and tested recent large-scale music foundation models in respect of their music comprehension abilities. We also discussed their limitations and proposed possible future directions, offering insights for researchers in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09601v1-abstract-full').style.display = 'none'; document.getElementById('2409.09601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07146">arXiv:2409.07146</a> <span> [<a href="https://arxiv.org/pdf/2409.07146">pdf</a>, <a href="https://arxiv.org/format/2409.07146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Gated Slot Attention for Efficient Linear-Time Sequence Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Songlin Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Ruijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Leyang Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiqiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bolun Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bailin Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+W">Wei Bi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Peng Zhou</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+G">Guohong Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07146v2-abstract-short" style="display: inline;"> Linear attention Transformers and their gated variants, celebrated for enabling parallel training and efficient recurrent inference, still fall short in recall-intensive tasks compared to traditional Transformers and demand significant resources for training from 
scratch. This paper introduces Gated Slot Attention (GSA), which enhances Attention with Bounded-memory-Control (ABC) by incorporating a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07146v2-abstract-full').style.display = 'inline'; document.getElementById('2409.07146v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07146v2-abstract-full" style="display: none;"> Linear attention Transformers and their gated variants, celebrated for enabling parallel training and efficient recurrent inference, still fall short in recall-intensive tasks compared to traditional Transformers and demand significant resources for training from scratch. This paper introduces Gated Slot Attention (GSA), which enhances Attention with Bounded-memory-Control (ABC) by incorporating a gating mechanism inspired by Gated Linear Attention (GLA). Essentially, GSA comprises a two-layer GLA linked via $\operatorname{softmax}$, utilizing context-aware memory reading and adaptive forgetting to improve memory capacity while maintaining compact recurrent state size. This design greatly enhances both training and inference efficiency through GLA's hardware-efficient training algorithm and reduced state size. Additionally, retaining the $\operatorname{softmax}$ operation is particularly beneficial in "finetuning pretrained Transformers to RNNs" (T2R) settings, reducing the need for extensive training from scratch. Extensive experiments confirm GSA's superior performance in scenarios requiring in-context recall and in T2R settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07146v2-abstract-full').style.display = 'none'; document.getElementById('2409.07146v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
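<p class="is-size-7">For readers unfamiliar with the gated linear-attention recurrences that GSA builds on, one generic recurrent step is sketched below; it only illustrates adaptive forgetting and context-aware reading with a fixed-size state, and is not the exact GSA update (the two-layer structure and softmax link are omitted, and the shapes and gate form are assumptions).</p>
<pre><code class="language-python">
import torch

def gated_linear_attention_step(S, q, k, v, alpha):
    # S: (d_k, d_v) compact recurrent state; q, k, alpha: (d_k,); v: (d_v,)
    # alpha is a data-dependent forget gate in (0, 1).
    S = alpha.unsqueeze(-1) * S + torch.outer(k, v)   # adaptive forgetting + write
    o = S.t() @ q                                     # context-aware memory read
    return S, o
</code></pre>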
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04410">arXiv:2409.04410</a> <span> [<a href="https://arxiv.org/pdf/2409.04410">pdf</a>, <a href="https://arxiv.org/format/2409.04410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Open-MAGVIT2: An Open-Source Project Toward Democratizing Auto-regressive Visual Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhuoyan Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fengyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yixiao Ge</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04410v3-abstract-short" style="display: inline;"> The Open-MAGVIT2 project produces an open-source replication of Google's MAGVIT-v2 tokenizer, a tokenizer with a super-large codebook (i.e., $2^{18}$ codes), and achieves the state-of-the-art reconstruction performance on ImageNet and UCF benchmarks. We also provide a tokenizer pre-trained on large-scale data, significantly outperforming Cosmos on zero-shot benchmarks (1.93 vs. 0.78 rFID on ImageN… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04410v3-abstract-full').style.display = 'inline'; document.getElementById('2409.04410v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04410v3-abstract-full" style="display: none;"> The Open-MAGVIT2 project produces an open-source replication of Google's MAGVIT-v2 tokenizer, a tokenizer with a super-large codebook (i.e., $2^{18}$ codes), and achieves the state-of-the-art reconstruction performance on ImageNet and UCF benchmarks. We also provide a tokenizer pre-trained on large-scale data, significantly outperforming Cosmos on zero-shot benchmarks (1.93 vs. 0.78 rFID on ImageNet original resolution). Furthermore, we explore its application in plain auto-regressive models to validate scalability properties, producing a family of auto-regressive image generation models ranging from 300M to 1.5B. To assist auto-regressive models in predicting with a super-large vocabulary, we factorize it into two sub-vocabulary of different sizes by asymmetric token factorization, and further introduce ``next sub-token prediction'' to enhance sub-token interaction for better generation quality. We release all models and codes to foster innovation and creativity in the field of auto-regressive visual generation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04410v3-abstract-full').style.display = 'none'; document.getElementById('2409.04410v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03127">arXiv:2409.03127</a> <span> [<a href="https://arxiv.org/pdf/2409.03127">pdf</a>, <a href="https://arxiv.org/format/2409.03127">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> Fast algorithms to improve fair information access in networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Windham%2C+D+R">Dennis Robert Windham</a>, <a href="/search/cs?searchtype=author&query=Wendt%2C+C+J">Caroline J. Wendt</a>, <a href="/search/cs?searchtype=author&query=Crane%2C+A">Alex Crane</a>, <a href="/search/cs?searchtype=author&query=Warr%2C+M+J">Madelyn J Warr</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Friedler%2C+S+A">Sorelle A. Friedler</a>, <a href="/search/cs?searchtype=author&query=Sullivan%2C+B+D">Blair D. Sullivan</a>, <a href="/search/cs?searchtype=author&query=Clauset%2C+A">Aaron Clauset</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03127v2-abstract-short" style="display: inline;"> We consider the problem of selecting $k$ seed nodes in a network to maximize the minimum probability of activation under an independent cascade beginning at these seeds. The motivation is to promote fairness by ensuring that even the least advantaged members of the network have good access to information. Our problem can be viewed as a variant of the classic influence maximization objective, but i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03127v2-abstract-full').style.display = 'inline'; document.getElementById('2409.03127v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03127v2-abstract-full" style="display: none;"> We consider the problem of selecting $k$ seed nodes in a network to maximize the minimum probability of activation under an independent cascade beginning at these seeds. The motivation is to promote fairness by ensuring that even the least advantaged members of the network have good access to information. Our problem can be viewed as a variant of the classic influence maximization objective, but it appears somewhat more difficult to solve: only heuristics are known. 
Moreover, the scalability of these methods is sharply constrained by the need to repeatedly estimate access probabilities. We design and evaluate a suite of $10$ new scalable algorithms which crucially do not require probability estimation. To facilitate comparison with the state-of-the-art, we make three more contributions which may be of broader interest. We introduce a principled method of selecting a pairwise information transmission parameter used in experimental evaluations, as well as a new performance metric which allows for comparison of algorithms across a range of values for the parameter $k$. Finally, we provide a new benchmark corpus of $174$ networks drawn from $6$ domains. Our algorithms retain most of the performance of the state-of-the-art while reducing running time by orders of magnitude. Specifically, a meta-learner approach is on average only $20\%$ less effective than the state-of-the-art on held-out data, but about $75-130$ times faster. Further, the meta-learner's performance exceeds the state-of the-art on about $20\%$ of networks, and the magnitude of its running time advantage is maintained on much larger networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03127v2-abstract-full').style.display = 'none'; document.getElementById('2409.03127v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 9 figures, and 3 appendices containing 12 algorithms, 1 table, and 9 additional figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02070">arXiv:2409.02070</a> <span> [<a href="https://arxiv.org/pdf/2409.02070">pdf</a>, <a href="https://arxiv.org/format/2409.02070">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Explicit Differentiable Slicing and Global Deformation for Cardiac Mesh Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yihao Luo</a>, <a href="/search/cs?searchtype=author&query=Sesia%2C+D">Dario Sesia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fanwen Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yinzhe Wu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Wenhao Ding</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiahao Huang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fadong Shi</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+A">Anoop Shah</a>, <a href="/search/cs?searchtype=author&query=Kaural%2C+A">Amit Kaural</a>, <a href="/search/cs?searchtype=author&query=Mayet%2C+J">Jamil Mayet</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a 
href="/search/cs?searchtype=author&query=Yap%2C+C">ChoonHwai Yap</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02070v2-abstract-short" style="display: inline;"> Mesh reconstruction of the cardiac anatomy from medical images is useful for shape and motion measurements and biophysics simulations to facilitate the assessment of cardiac function and health. However, 3D medical images are often acquired as 2D slices that are sparsely sampled and noisy, and mesh reconstruction on such data is a challenging task. Traditional voxel-based approaches rely on pre- a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02070v2-abstract-full').style.display = 'inline'; document.getElementById('2409.02070v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02070v2-abstract-full" style="display: none;"> Mesh reconstruction of the cardiac anatomy from medical images is useful for shape and motion measurements and biophysics simulations to facilitate the assessment of cardiac function and health. However, 3D medical images are often acquired as 2D slices that are sparsely sampled and noisy, and mesh reconstruction on such data is a challenging task. Traditional voxel-based approaches rely on pre- and post-processing that compromises image fidelity, while mesh-level deep learning approaches require mesh annotations that are difficult to get. Therefore, direct cross-domain supervision from 2D images to meshes is a key technique for advancing 3D learning in medical imaging, but it has not been well-developed. While there have been attempts to approximate the optimized meshes' slicing, few existing methods directly use 2D slices to supervise mesh reconstruction in a differentiable manner. Here, we propose a novel explicit differentiable voxelization and slicing (DVS) algorithm that allows gradient backpropagation to a mesh from its slices, facilitating refined mesh optimization directly supervised by the losses defined on 2D images. Further, we propose an innovative framework for extracting patient-specific left ventricle (LV) meshes from medical images by coupling DVS with a graph harmonic deformation (GHD) mesh morphing descriptor of cardiac shape that naturally preserves mesh quality and smoothness during optimization. Experimental results demonstrate that our method achieves state-of-the-art performance in cardiac mesh reconstruction tasks from CT and MRI, with an overall Dice score of 90% on multi-datasets, outperforming existing approaches. The proposed method can further quantify clinically useful parameters such as ejection fraction and global myocardial strains, closely matching the ground truth and surpassing the traditional voxel-based approach in sparse images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02070v2-abstract-full').style.display = 'none'; document.getElementById('2409.02070v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12815">arXiv:2408.12815</a> <span> [<a href="https://arxiv.org/pdf/2408.12815">pdf</a>, <a href="https://arxiv.org/format/2408.12815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Staircase Cascaded Fusion of Lightweight Local Pattern Recognition and Long-Range Dependencies for Structural Crack Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Chen Jia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xu Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mianzhao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12815v3-abstract-short" style="display: inline;"> Detecting cracks with pixel-level precision for key structures is a significant challenge, and existing methods struggle to integrate local textures and pixel dependencies of cracks. Furthermore, these methods possess numerous parameters and substantial computational requirements, complicating deployment on edge devices. In this paper, we propose the Staircase Cascaded Fusion Crack Segmentation Networ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12815v3-abstract-full').style.display = 'inline'; document.getElementById('2408.12815v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12815v3-abstract-full" style="display: none;"> Detecting cracks with pixel-level precision for key structures is a significant challenge, and existing methods struggle to integrate local textures and pixel dependencies of cracks. Furthermore, these methods possess numerous parameters and substantial computational requirements, complicating deployment on edge devices. In this paper, we propose the Staircase Cascaded Fusion Crack Segmentation Network (CrackSCF), which generates high-quality crack segmentation maps while reducing computational overhead. We design a lightweight convolutional block that substitutes all convolution operations, reducing the model's computational demands while maintaining an effective capture of local details. Additionally, we introduce a lightweight long-range dependency extractor to better capture the long-range dependencies. Furthermore, we develop a staircase cascaded fusion module, which seamlessly integrates local patterns and long-range dependencies, resulting in high-quality segmentation maps. To comprehensively evaluate our method, we created the challenging TUT benchmark dataset and evaluated it alongside five other public datasets. The results show that our method outperforms existing methods, particularly in handling background noise and achieving detailed segmentation. 
The F1 and mIoU scores on the TUT dataset are 0.8382 and 0.8473, respectively, demonstrating state-of-the-art (SOTA) performance with low computational resources. The code and dataset is available at https://github.com/Karl1109/CrackSCF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12815v3-abstract-full').style.display = 'none'; document.getElementById('2408.12815v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10479">arXiv:2408.10479</a> <span> [<a href="https://arxiv.org/pdf/2408.10479">pdf</a>, <a href="https://arxiv.org/format/2408.10479">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An End-to-End Reinforcement Learning Based Approach for Micro-View Order-Dispatching in Ride-Hailing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xinlang Yue</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiran Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fangzhou Shi</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Sihong Luo</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+C">Chen Zhong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+M">Min Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10479v1-abstract-short" style="display: inline;"> Assigning orders to drivers under localized spatiotemporal context (micro-view order-dispatching) is a major task in Didi, as it influences ride-hailing service experience. Existing industrial solutions mainly follow a two-stage pattern that incorporate heuristic or learning-based algorithms with naive combinatorial methods, tackling the uncertainty of both sides' behaviors, including emerging tim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10479v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10479v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10479v1-abstract-full" style="display: none;"> Assigning orders to drivers under localized spatiotemporal context (micro-view order-dispatching) is a major task in Didi, as it influences ride-hailing service experience. Existing industrial solutions mainly follow a two-stage pattern that incorporate heuristic or learning-based algorithms with naive combinatorial methods, tackling the uncertainty of both sides' behaviors, including emerging timings, spatial relationships, and travel duration, etc. 
In this paper, we propose a one-stage end-to-end reinforcement learning based order-dispatching approach that solves behavior prediction and combinatorial optimization uniformly in a sequential decision-making manner. Specifically, we employ a two-layer Markov Decision Process framework to model this problem, and present the Deep Double Scalable Network (D2SN), an encoder-decoder structure network to generate order-driver assignments directly and stop assignments accordingly. Besides, by leveraging contextual dynamics, our approach can adapt to the behavioral patterns for better performance. Extensive experiments on Didi's real-world benchmarks justify that the proposed approach significantly outperforms competitive baselines in optimizing matching efficiency and user experience tasks. In addition, we evaluate the deployment outline and discuss the gains and experiences obtained during the deployment tests from the view of large-scale engineering implementation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10479v1-abstract-full').style.display = 'none'; document.getElementById('2408.10479v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08724">arXiv:2408.08724</a> <span> [<a href="https://arxiv.org/pdf/2408.08724">pdf</a>, <a href="https://arxiv.org/format/2408.08724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChatZero: Zero-shot Cross-Lingual Dialogue Generation via Pseudo-Target Language </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongkang Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feng Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Daling Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Sch%C3%BCtze%2C+H">Hinrich Schütze</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08724v1-abstract-short" style="display: inline;"> Although large language models (LLMs) show amazing capabilities, many of the exciting applications discovered for LLMs fall short in low-resource languages. Besides, most existing methods depend on large-scale dialogue corpora and thus building systems for dialogue generation in a zero-shot scenario remains a considerable challenge. 
To address this challenge, we propose a novel end-to-end z… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08724v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08724v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08724v1-abstract-full" style="display: none;"> Although large language models (LLMs) show amazing capabilities, many of the exciting applications discovered for LLMs fall short in low-resource languages. Besides, most existing methods depend on large-scale dialogue corpora and thus building systems for dialogue generation in a zero-shot scenario remains a considerable challenge. To address this challenge, we propose a novel end-to-end zero-shot dialogue generation model ChatZero based on a cross-lingual code-switching method. First, we construct code-switching language and pseudo-target language with placeholders. Then for cross-lingual semantic transfer, we employ unsupervised contrastive learning to minimize the semantic gap among the source language, code-switching language, and pseudo-target language, which are mutually positive examples in the high-dimensional semantic space. Experiments on the multilingual DailyDialog and DSTC7-AVSD datasets demonstrate that ChatZero can achieve more than 90% of the original performance under the zero-shot case compared to supervised learning, and achieve state-of-the-art performance compared with other baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08724v1-abstract-full').style.display = 'none'; document.getElementById('2408.08724v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
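<p class="is-size-7">The cross-lingual contrastive objective sketched in this abstract (source, code-switched, and pseudo-target versions of the same dialogue acting as mutual positives) can be written as a symmetric InfoNCE-style loss; the exact ChatZero objective may differ, and the shared encoder and temperature below are assumptions.</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def multiview_contrastive_loss(src, cs, pseudo, temperature=0.07):
    # src, cs, pseudo: (batch, dim) embeddings of the three views of each dialogue.
    views = [F.normalize(x, dim=-1) for x in (src, cs, pseudo)]
    loss, pairs = 0.0, 0
    for i in range(3):
        for j in range(3):
            if i == j:
                continue
            logits = views[i] @ views[j].t() / temperature   # (batch, batch) similarities
            labels = torch.arange(logits.size(0))            # positives sit on the diagonal
            loss = loss + F.cross_entropy(logits, labels)
            pairs += 1
    return loss / pairs
</code></pre>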
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECAI2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ECAI2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06150">arXiv:2408.06150</a> <span> [<a href="https://arxiv.org/pdf/2408.06150">pdf</a>, <a href="https://arxiv.org/format/2408.06150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> LipidBERT: A Lipid Language Model Pre-trained on METiS de novo Lipid Library </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tianhao Yu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+C">Cai Yao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhuorui Sun</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+K">Kangjie Lyu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xuan Bai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Andong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xicheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+J">Jiali Zou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenshou Wang</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+C">Chris Lai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06150v2-abstract-short" style="display: inline;"> In this study, we generate and maintain a database of 10 million virtual lipids through METiS's in-house de novo lipid generation algorithms and lipid virtual screening techniques. These virtual lipids serve as a corpus for pre-training, lipid representation learning, and downstream task knowledge transfer, culminating in state-of-the-art LNP property prediction performance. We propose LipidBERT,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06150v2-abstract-full').style.display = 'inline'; document.getElementById('2408.06150v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06150v2-abstract-full" style="display: none;"> In this study, we generate and maintain a database of 10 million virtual lipids through METiS's in-house de novo lipid generation algorithms and lipid virtual screening techniques. These virtual lipids serve as a corpus for pre-training, lipid representation learning, and downstream task knowledge transfer, culminating in state-of-the-art LNP property prediction performance. We propose LipidBERT, a BERT-like model pre-trained with the Masked Language Model (MLM) and various secondary tasks. 
Additionally, we compare the performance of embeddings generated by LipidBERT and PhatGPT, our GPT-like lipid generation model, on downstream tasks. The proposed bilingual LipidBERT model operates in two languages: the language of ionizable lipid pre-training, using in-house dry-lab lipid structures, and the language of LNP fine-tuning, utilizing in-house LNP wet-lab data. This dual capability positions LipidBERT as a key AI-based filter for future screening tasks, including new versions of METiS de novo lipid libraries and, more importantly, candidates for in vivo testing for organ-targeting LNPs. To the best of our knowledge, this is the first successful demonstration of the capability of a pre-trained language model on virtual lipids and its effectiveness in downstream tasks using wet-lab data. This work showcases the clever utilization of METiS's in-house de novo lipid library as well as the power of dry-wet lab integration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06150v2-abstract-full').style.display = 'none'; document.getElementById('2408.06150v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05849">arXiv:2408.05849</a> <span> [<a href="https://arxiv.org/pdf/2408.05849">pdf</a>, <a href="https://arxiv.org/format/2408.05849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> An End-to-End Model for Time Series Classification In the Presence of Missing Values </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+P">Pengshuai Yao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengna Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xu Cheng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiufeng Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05849v1-abstract-short" style="display: inline;"> Time series classification with missing data is a prevalent issue in time series analysis, as temporal data often contain missing values in practical applications. The traditional two-stage approach, which handles imputation and classification separately, can result in sub-optimal performance as label information is not utilized in the imputation process. 
On the other hand, a one-stage approach ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05849v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05849v1-abstract-full" style="display: none;"> Time series classification with missing data is a prevalent issue in time series analysis, as temporal data often contain missing values in practical applications. The traditional two-stage approach, which handles imputation and classification separately, can result in sub-optimal performance as label information is not utilized in the imputation process. On the other hand, a one-stage approach can learn features under missing information, but feature representation is limited as imputed errors are propagated in the classification process. To overcome these challenges, this study proposes an end-to-end neural network that unifies data imputation and representation learning within a single framework, allowing the imputation process to take advantage of label information. Differing from previous methods, our approach places less emphasis on the accuracy of imputation data and instead prioritizes classification performance. A specifically designed multi-scale feature learning module is implemented to extract useful information from the noise-imputation data. The proposed model is evaluated on 68 univariate time series datasets from the UCR archive, as well as a multivariate time series dataset with various missing data ratios and 4 real-world datasets with missing information. The results indicate that the proposed model outperforms state-of-the-art approaches for incomplete time series classification, particularly in scenarios with high levels of missing data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05849v1-abstract-full').style.display = 'none'; document.getElementById('2408.05849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
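<p class="is-size-7">A minimal one-stage sketch of the idea, imputing and classifying with a single network so that label gradients also shape the imputation, is given below; the recurrent imputer, layer sizes, and loss weighting are assumptions rather than the paper's architecture.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class ImputeAndClassify(nn.Module):
    def __init__(self, length, n_classes, hidden=64):
        super().__init__()
        self.imputer = nn.GRU(2, hidden, batch_first=True)   # input: (value, mask) per step
        self.to_value = nn.Linear(hidden, 1)
        self.classifier = nn.Sequential(nn.Flatten(), nn.Linear(length * hidden, n_classes))

    def forward(self, x, mask):
        # x: (batch, length, 1) with zeros at missing steps; mask: 1 where observed.
        h, _ = self.imputer(torch.cat([x, mask], dim=-1))
        filled = mask * x + (1 - mask) * self.to_value(h)     # keep observed values untouched
        h2, _ = self.imputer(torch.cat([filled, mask], dim=-1))
        return filled, self.classifier(h2)

# Joint objective (weights illustrative): classification dominates, imputation only
# regularizes the observed entries:
#   loss = cross_entropy(logits, y) + 0.1 * (((filled - x) ** 2) * mask).mean()
</code></pre>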
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05567">arXiv:2408.05567</a> <span> [<a href="https://arxiv.org/pdf/2408.05567">pdf</a>, <a href="https://arxiv.org/format/2408.05567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JIOT.2024.3429245">10.1109/JIOT.2024.3429245 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Diffusion Model-based Contrastive Learning for Human Activity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chunjing Xiao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yanhui Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yane Hou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fangzhan Shi</a>, <a href="/search/cs?searchtype=author&query=Chetty%2C+K">Kevin Chetty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05567v1-abstract-short" style="display: inline;"> WiFi Channel State Information (CSI)-based activity recognition has sparked numerous studies due to its widespread availability and privacy protection. However, when applied in practical applications, general CSI-based recognition models may face challenges related to the limited generalization capability, since individuals with different behavior habits will cause various fluctuations in CSI data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05567v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05567v1-abstract-full" style="display: none;"> WiFi Channel State Information (CSI)-based activity recognition has sparked numerous studies due to its widespread availability and privacy protection. However, when applied in practical applications, general CSI-based recognition models may face challenges related to the limited generalization capability, since individuals with different behavior habits will cause various fluctuations in CSI data and it is difficult to gather enough training data to cover all kinds of motion habits. To tackle this problem, we design a diffusion model-based Contrastive Learning framework for human Activity Recognition (CLAR) using WiFi CSI. On the basis of the contrastive learning framework, we primarily introduce two components for CLAR to enhance CSI-based activity recognition. To generate diverse augmented data and complement limited training data, we propose a diffusion model-based time series-specific augmentation model. 
In contrast to typical diffusion models that directly apply conditions to the generative process, potentially resulting in distorted CSI data, our tailored model dissects these conditions into high-frequency and low-frequency components, and then applies these conditions to the generative process with varying weights. This can alleviate data distortion and yield high-quality augmented data. To efficiently capture differences in sample importance, we present an adaptive weight algorithm. Different from typical contrastive learning methods which equally consider all the training samples, this algorithm adaptively adjusts the weights of positive sample pairs for learning better data representations. The experiments suggest that CLAR achieves significant gains compared to state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05567v1-abstract-full').style.display = 'none'; document.getElementById('2408.05567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper has been accepted by IEEE Internet of Things Journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04628">arXiv:2408.04628</a> <span> [<a href="https://arxiv.org/pdf/2408.04628">pdf</a>, <a href="https://arxiv.org/format/2408.04628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LogogramNLP: Comparing Visual and Textual Representations of Ancient Logographic Writing Systems for NLP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danlu Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+A">Aditi Agarwal</a>, <a href="/search/cs?searchtype=author&query=Myerston%2C+J">Jacobo Myerston</a>, <a href="/search/cs?searchtype=author&query=Berg-Kirkpatrick%2C+T">Taylor Berg-Kirkpatrick</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04628v1-abstract-short" style="display: inline;"> Standard natural language processing (NLP) pipelines operate on symbolic representations of language, which typically consist of sequences of discrete tokens. However, creating an analogous representation for ancient logographic writing systems is an extremely labor intensive process that requires expert knowledge. 
At present, a large portion of logographic data persists in a purely visual form du… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04628v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04628v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04628v1-abstract-full" style="display: none;"> Standard natural language processing (NLP) pipelines operate on symbolic representations of language, which typically consist of sequences of discrete tokens. However, creating an analogous representation for ancient logographic writing systems is an extremely labor intensive process that requires expert knowledge. At present, a large portion of logographic data persists in a purely visual form due to the absence of transcription -- this issue poses a bottleneck for researchers seeking to apply NLP toolkits to study ancient logographic languages: most of the relevant data are images of writing. This paper investigates whether direct processing of visual representations of language offers a potential solution. We introduce LogogramNLP, the first benchmark enabling NLP analysis of ancient logographic languages, featuring both transcribed and visual datasets for four writing systems along with annotations for tasks like classification, translation, and parsing. Our experiments compare systems that employ recent visual and text encoding strategies as backbones. The results demonstrate that visual representations outperform textual representations for some investigated tasks, suggesting that visual processing pipelines may unlock a large amount of cultural heritage data of logographic languages for NLP-based analyses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04628v1-abstract-full').style.display = 'none'; document.getElementById('2408.04628v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACL 2024, long paper </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21328">arXiv:2407.21328</a> <span> [<a href="https://arxiv.org/pdf/2407.21328">pdf</a>, <a href="https://arxiv.org/format/2407.21328">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Knowledge-Guided Prompt Learning for Lifespan Brain MR Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Teng%2C+L">Lin Teng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zihao Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zehong Cao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+R">Runqi Meng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feng Shi</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+D">Dinggang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21328v1-abstract-short" style="display: inline;"> Automatic and accurate segmentation of brain MR images throughout the human lifespan into tissue and structure is crucial for understanding brain development and diagnosing diseases. However, challenges arise from the intricate variations in brain appearance due to rapid early brain development, aging, and disorders, compounded by the limited availability of manually-labeled datasets. In response,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21328v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21328v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21328v1-abstract-full" style="display: none;"> Automatic and accurate segmentation of brain MR images throughout the human lifespan into tissue and structure is crucial for understanding brain development and diagnosing diseases. However, challenges arise from the intricate variations in brain appearance due to rapid early brain development, aging, and disorders, compounded by the limited availability of manually-labeled datasets. In response, we present a two-step segmentation framework employing Knowledge-Guided Prompt Learning (KGPL) for brain MRI. Specifically, we first pre-train segmentation models on large-scale datasets with sub-optimal labels, followed by the incorporation of knowledge-driven embeddings learned from image-text alignment into the models. The introduction of knowledge-wise prompts captures semantic relationships between anatomical variability and biological processes, enabling models to learn structural feature embeddings across diverse age groups. Experimental findings demonstrate the superiority and robustness of our proposed method, particularly noticeable when employing Swin UNETR as the backbone. 
Our approach achieves average DSC values of 95.17% and 94.19% for brain tissue and structure segmentation, respectively. Our code is available at https://github.com/TL9792/KGPL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21328v1-abstract-full').style.display = 'none'; document.getElementById('2407.21328v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21075">arXiv:2407.21075</a> <span> [<a href="https://arxiv.org/pdf/2407.21075">pdf</a>, <a href="https://arxiv.org/format/2407.21075">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Apple Intelligence Foundation Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gunter%2C+T">Tom Gunter</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zirui Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chong Wang</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+R">Ruoming Pang</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+A">Andy Narayanan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aonan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+C">Chung-Cheng Chiu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+D">David Qiu</a>, <a href="/search/cs?searchtype=author&query=Gopinath%2C+D">Deepak Gopinath</a>, <a href="/search/cs?searchtype=author&query=Yap%2C+D+A">Dian Ang Yap</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D">Dong Yin</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+F">Feng Nan</a>, <a href="/search/cs?searchtype=author&query=Weers%2C+F">Floris Weers</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+G">Guoli Yin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoshuo Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianyu Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiarui Lu</a>, <a href="/search/cs?searchtype=author&query=Peebles%2C+J">John Peebles</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+K">Ke Ye</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M">Mark Lee</a>, <a href="/search/cs?searchtype=author&query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qibin Chen</a>, <a href="/search/cs?searchtype=author&query=Keunebroek%2C+Q">Quentin Keunebroek</a> , et al. 
(130 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21075v1-abstract-short" style="display: inline;"> We present foundation language models developed to power Apple Intelligence features, including a ~3 billion parameter model designed to run efficiently on devices and a large server-based language model designed for Private Cloud Compute. These models are designed to perform a wide range of tasks efficiently, accurately, and responsibly. This report describes the model architecture, the data used… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21075v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21075v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21075v1-abstract-full" style="display: none;"> We present foundation language models developed to power Apple Intelligence features, including a ~3 billion parameter model designed to run efficiently on devices and a large server-based language model designed for Private Cloud Compute. These models are designed to perform a wide range of tasks efficiently, accurately, and responsibly. This report describes the model architecture, the data used to train the model, the training process, how the models are optimized for inference, and the evaluation results. We highlight our focus on Responsible AI and how the principles are applied throughout the model development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21075v1-abstract-full').style.display = 'none'; document.getElementById('2407.21075v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06612">arXiv:2407.06612</a> <span> [<a href="https://arxiv.org/pdf/2407.06612">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AI-based Automatic Segmentation of Prostate on Multi-modality Images: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+R">Rui Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Derun Li</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+D">Dehui Xiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hailing Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fei Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Weifang Zhu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jing Cai</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+T">Tao Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinjian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06612v1-abstract-short" style="display: inline;"> Prostate cancer represents a major threat to health. Early detection is vital in reducing the mortality rate among prostate cancer patients. One approach involves using multi-modality (CT, MRI, US, etc.) computer-aided diagnosis (CAD) systems for the prostate region. However, prostate segmentation is challenging due to imperfections in the images and the prostate's complex tissue structure. The ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06612v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06612v1-abstract-full" style="display: none;"> Prostate cancer represents a major threat to health. Early detection is vital in reducing the mortality rate among prostate cancer patients. One approach involves using multi-modality (CT, MRI, US, etc.) computer-aided diagnosis (CAD) systems for the prostate region. However, prostate segmentation is challenging due to imperfections in the images and the prostate's complex tissue structure. The advent of precision medicine and a significant increase in clinical capacity have spurred the need for various data-driven tasks in the field of medical imaging. Recently, numerous machine learning and data mining tools have been integrated into various medical areas, including image segmentation. This article proposes a new classification method that differentiates supervision types, either in number or kind, during the training phase. Subsequently, we conducted a survey on artificial intelligence (AI)-based automatic prostate segmentation methods, examining the advantages and limitations of each. 
Additionally, we introduce variants of evaluation metrics for the verification and performance assessment of the segmentation method and summarize the current challenges. Finally, future research directions and development trends are discussed, reflecting the outcomes of our literature survey, suggesting high-precision detection and treatment of prostate cancer as a promising avenue. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06612v1-abstract-full').style.display = 'none'; document.getElementById('2407.06612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09662">arXiv:2406.09662</a> <span> [<a href="https://arxiv.org/pdf/2406.09662">pdf</a>, <a href="https://arxiv.org/format/2406.09662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Language Structures through Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09662v2-abstract-short" style="display: inline;"> Language is highly structured, with syntactic and semantic structures, to some extent, agreed upon by speakers of the same language. With implicit or explicit awareness of such structures, humans can learn and use language efficiently and generalize to sentences that contain unseen words. Motivated by human language learning, in this dissertation, we consider a family of machine learning tasks tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09662v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09662v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09662v2-abstract-full" style="display: none;"> Language is highly structured, with syntactic and semantic structures, to some extent, agreed upon by speakers of the same language. With implicit or explicit awareness of such structures, humans can learn and use language efficiently and generalize to sentences that contain unseen words. Motivated by human language learning, in this dissertation, we consider a family of machine learning tasks that aim to learn language structures through grounding. We seek distant supervision from other data sources (i.e., grounds), including but not limited to other modalities (e.g., vision), execution results of programs, and other languages. We demonstrate the potential of this task formulation and advocate for its adoption through three schemes. In Part I, we consider learning syntactic parses through visual grounding. 
We propose the task of visually grounded grammar induction, present the first models to induce syntactic structures from visually grounded text and speech, and find that the visual grounding signals can help improve the parsing quality over language-only models. As a side contribution, we propose a novel evaluation metric that enables the evaluation of speech parsing without text or automatic speech recognition systems involved. In Part II, we propose two execution-aware methods to map sentences into corresponding semantic structures (i.e., programs), significantly improving compositional generalization and few-shot program synthesis. In Part III, we propose methods that learn language structures from annotations in other languages. Specifically, we propose a method that sets a new state of the art on cross-lingual word alignment. We then leverage the learned word alignments to improve the performance of zero-shot cross-lingual dependency parsing, by proposing a novel substructure-based projection method that preserves structural knowledge learned from the source language. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09662v2-abstract-full').style.display = 'none'; document.getElementById('2406.09662v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Ph.D. Thesis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.12424">arXiv:2405.12424</a> <span> [<a href="https://arxiv.org/pdf/2405.12424">pdf</a>, <a href="https://arxiv.org/format/2405.12424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Robustness Assessment: Adversarial Attacks on Learning-based Quadrupedal Locomotion Controllers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chong Zhang</a>, <a href="/search/cs?searchtype=author&query=Miki%2C+T">Takahiro Miki</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joonho Lee</a>, <a href="/search/cs?searchtype=author&query=Hutter%2C+M">Marco Hutter</a>, <a href="/search/cs?searchtype=author&query=Coros%2C+S">Stelian Coros</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.12424v2-abstract-short" style="display: inline;"> Legged locomotion has recently achieved remarkable success with the progress of machine learning techniques, especially deep reinforcement learning (RL). Controllers employing neural networks have demonstrated empirical and qualitative robustness against real-world uncertainties, including sensor noise and external perturbations. 
However, formally investigating the vulnerabilities of these locomot… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12424v2-abstract-full').style.display = 'inline'; document.getElementById('2405.12424v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.12424v2-abstract-full" style="display: none;"> Legged locomotion has recently achieved remarkable success with the progress of machine learning techniques, especially deep reinforcement learning (RL). Controllers employing neural networks have demonstrated empirical and qualitative robustness against real-world uncertainties, including sensor noise and external perturbations. However, formally investigating the vulnerabilities of these locomotion controllers remains a challenge. This difficulty arises from the requirement to pinpoint vulnerabilities across a long-tailed distribution within a high-dimensional, temporally sequential space. As a first step towards quantitative verification, we propose a computational method that leverages sequential adversarial attacks to identify weaknesses in learned locomotion controllers. Our research demonstrates that even state-of-the-art robust controllers can fail significantly under well-designed, low-magnitude adversarial sequences. Through experiments in simulation and on the real robot, we validate our approach's effectiveness, and we illustrate how the results it generates can be used to robustify the original policy and offer valuable insights into the safety of these black-box policies. Project page: https://fanshi14.github.io/me/rss24.html <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12424v2-abstract-full').style.display = 'none'; document.getElementById('2405.12424v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">RSS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09597">arXiv:2405.09597</a> <span> [<a href="https://arxiv.org/pdf/2405.09597">pdf</a>, <a href="https://arxiv.org/format/2405.09597">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> When AI Eats Itself: On the Caveats of AI Autophagy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Xiaodan Xing</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fadong Shi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiahao Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yinzhe Wu</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+Y">Yang Nan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yingying Fang</a>, <a href="/search/cs?searchtype=author&query=Roberts%2C+M">Mike Roberts</a>, <a href="/search/cs?searchtype=author&query=Sch%C3%B6nlieb%2C+C">Carola-Bibiane Sch枚nlieb</a>, <a href="/search/cs?searchtype=author&query=Del+Ser%2C+J">Javier Del Ser</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09597v3-abstract-short" style="display: inline;"> Generative Artificial Intelligence (AI) technologies and large models are producing realistic outputs across various domains, such as images, text, speech, and music. Creating these advanced generative models requires significant resources, particularly large and high-quality datasets. To minimise training expenses, many algorithm developers use data created by the models themselves as a cost-effe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09597v3-abstract-full').style.display = 'inline'; document.getElementById('2405.09597v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09597v3-abstract-full" style="display: none;"> Generative Artificial Intelligence (AI) technologies and large models are producing realistic outputs across various domains, such as images, text, speech, and music. Creating these advanced generative models requires significant resources, particularly large and high-quality datasets. To minimise training expenses, many algorithm developers use data created by the models themselves as a cost-effective training solution. However, not all synthetic data effectively improve model performance, necessitating a strategic balance in the use of real versus synthetic data to optimise outcomes. Currently, the previously well-controlled integration of real and synthetic data is becoming uncontrollable. 
The widespread and unregulated dissemination of synthetic data online leads to the contamination of datasets traditionally compiled through web scraping, now mixed with unlabeled synthetic data. This trend, known as the AI autophagy phenomenon, suggests a future where generative AI systems may increasingly consume their own outputs without discernment, raising concerns about model performance, reliability, and ethical implications. What will happen if generative AI continuously consumes itself without discernment? What measures can we take to mitigate the potential adverse effects? To address these research questions, this study examines the existing literature, delving into the consequences of AI autophagy, analyzing the associated risks, and exploring strategies to mitigate its impact. Our aim is to provide a comprehensive perspective on this phenomenon advocating for a balanced approach that promotes the sustainable development of generative AI technologies in the era of large models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09597v3-abstract-full').style.display = 'none'; document.getElementById('2405.09597v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.13433">arXiv:2402.13433</a> <span> [<a href="https://arxiv.org/pdf/2402.13433">pdf</a>, <a href="https://arxiv.org/format/2402.13433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Structured Tree Alignment for Evaluation of (Speech) Constituency Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Gimpel%2C+K">Kevin Gimpel</a>, <a href="/search/cs?searchtype=author&query=Livescu%2C+K">Karen Livescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.13433v2-abstract-short" style="display: inline;"> We present the structured average intersection-over-union ratio (STRUCT-IOU), a similarity metric between constituency parse trees motivated by the problem of evaluating speech parsers. STRUCT-IOU enables comparison between a constituency parse tree (over automatically recognized spoken word boundaries) with the ground-truth parse (over written words). 
To compute the metric, we project the ground-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13433v2-abstract-full').style.display = 'inline'; document.getElementById('2402.13433v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.13433v2-abstract-full" style="display: none;"> We present the structured average intersection-over-union ratio (STRUCT-IOU), a similarity metric between constituency parse trees motivated by the problem of evaluating speech parsers. STRUCT-IOU enables comparison between a constituency parse tree (over automatically recognized spoken word boundaries) with the ground-truth parse (over written words). To compute the metric, we project the ground-truth parse tree to the speech domain by forced alignment, align the projected ground-truth constituents with the predicted ones under certain structured constraints, and calculate the average IOU score across all aligned constituent pairs. STRUCT-IOU takes word boundaries into account and overcomes the challenge that the predicted words and ground truth may not have perfect one-to-one correspondence. Extending to the evaluation of text constituency parsing, we demonstrate that STRUCT-IOU can address token-mismatch issues, and shows higher tolerance to syntactically plausible parses than PARSEVAL (Black et al., 1991). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13433v2-abstract-full').style.display = 'none'; document.getElementById('2402.13433v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.09966">arXiv:2401.09966</a> <span> [<a href="https://arxiv.org/pdf/2401.09966">pdf</a>, <a href="https://arxiv.org/format/2401.09966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Generative Abstract Reasoning: Completing Raven's Progressive Matrix via Rule Abstraction and Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangyang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.09966v3-abstract-short" style="display: inline;"> Endowing machines with abstract reasoning ability has been a long-term research topic in artificial intelligence. 
Raven's Progressive Matrix (RPM) is widely used to probe abstract visual reasoning in machine intelligence, where models will analyze the underlying rules and select one image from candidates to complete the image matrix. Participators of RPM tests can show powerful reasoning ability b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09966v3-abstract-full').style.display = 'inline'; document.getElementById('2401.09966v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.09966v3-abstract-full" style="display: none;"> Endowing machines with abstract reasoning ability has been a long-term research topic in artificial intelligence. Raven's Progressive Matrix (RPM) is widely used to probe abstract visual reasoning in machine intelligence, where models will analyze the underlying rules and select one image from candidates to complete the image matrix. Participators of RPM tests can show powerful reasoning ability by inferring and combining attribute-changing rules and imagining the missing images at arbitrary positions of a matrix. However, existing solvers can hardly manifest such an ability in realistic RPM tests. In this paper, we propose a deep latent variable model for answer generation problems through Rule AbstractIon and SElection (RAISE). RAISE can encode image attributes into latent concepts and abstract atomic rules that act on the latent concepts. When generating answers, RAISE selects one atomic rule out of the global knowledge set for each latent concept to constitute the underlying rule of an RPM. In the experiments of bottom-right and arbitrary-position answer generation, RAISE outperforms the compared solvers in most configurations of realistic RPM datasets. In the odd-one-out task and two held-out configurations, RAISE can leverage acquired latent concepts and atomic rules to find the rule-breaking image in a matrix and handle problems with unseen combinations of rules and attributes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09966v3-abstract-full').style.display = 'none'; document.getElementById('2401.09966v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.17281">arXiv:2312.17281</a> <span> [<a href="https://arxiv.org/pdf/2312.17281">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Revolutionizing Personalized Voice Synthesis: The Journey towards Emotional and Individual Authenticity with DIVSE (Dynamic Individual Voice Synthesis Engine) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.17281v1-abstract-short" style="display: inline;"> This comprehensive paper delves into the forefront of personalized voice synthesis within artificial intelligence (AI), spotlighting the Dynamic Individual Voice Synthesis Engine (DIVSE). DIVSE represents a groundbreaking leap in text-to-voice (TTS) technology, uniquely focusing on adapting and personalizing voice outputs to match individual vocal characteristics. The research underlines the gap i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17281v1-abstract-full').style.display = 'inline'; document.getElementById('2312.17281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.17281v1-abstract-full" style="display: none;"> This comprehensive paper delves into the forefront of personalized voice synthesis within artificial intelligence (AI), spotlighting the Dynamic Individual Voice Synthesis Engine (DIVSE). DIVSE represents a groundbreaking leap in text-to-voice (TTS) technology, uniquely focusing on adapting and personalizing voice outputs to match individual vocal characteristics. The research underlines the gap in current AI-generated voices, which, while technically advanced, fall short in replicating the unique individuality and expressiveness intrinsic to human speech. It outlines the challenges and advancements in personalized voice synthesis, emphasizing the importance of emotional expressiveness, accent and dialect variability, and capturing individual voice traits. The architecture of DIVSE is meticulously detailed, showcasing its three core components: Voice Characteristic Learning Module (VCLM), Emotional Tone and Accent Adaptation Module (ETAAM), and Dynamic Speech Synthesis Engine (DSSE). The innovative approach of DIVSE lies in its adaptive learning capability, which evolves over time to tailor voice outputs to specific user traits. The paper presents a rigorous experimental setup, utilizing accepted datasets and personalization metrics like Mean Opinion Score (MOS) and Emotional Alignment Score, to validate DIVSE's superiority over mainstream models. The results depict a clear advancement in achieving higher personalization and emotional resonance in AI-generated voices. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17281v1-abstract-full').style.display = 'none'; document.getElementById('2312.17281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.17274">arXiv:2312.17274</a> <span> [<a href="https://arxiv.org/pdf/2312.17274">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RefineNet: Enhancing Text-to-Image Conversion with High-Resolution and Detail Accuracy through Hierarchical Transformers and Progressive Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.17274v1-abstract-short" style="display: inline;"> In this research, we introduce RefineNet, a novel architecture designed to address resolution limitations in text-to-image conversion systems. We explore the challenges of generating high-resolution images from textual descriptions, focusing on the trade-offs between detail accuracy and computational efficiency. RefineNet leverages a hierarchical Transformer combined with progressive and condition… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17274v1-abstract-full').style.display = 'inline'; document.getElementById('2312.17274v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.17274v1-abstract-full" style="display: none;"> In this research, we introduce RefineNet, a novel architecture designed to address resolution limitations in text-to-image conversion systems. We explore the challenges of generating high-resolution images from textual descriptions, focusing on the trade-offs between detail accuracy and computational efficiency. RefineNet leverages a hierarchical Transformer combined with progressive and conditional refinement techniques, outperforming existing models in producing detailed and high-quality images. Through extensive experiments on diverse datasets, we demonstrate RefineNet's superiority in clarity and resolution, particularly in complex image categories like animals, plants, and human faces. Our work not only advances the field of image-to-text conversion but also opens new avenues for high-fidelity image generation in various applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17274v1-abstract-full').style.display = 'none'; document.getElementById('2312.17274v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13752">arXiv:2312.13752</a> <span> [<a href="https://arxiv.org/pdf/2312.13752">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.media.2024.103253">10.1016/j.media.2024.103253 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the AIIB23 challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nan%2C+Y">Yang Nan</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+X">Xiaodan Xing</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zeyu Tang</a>, <a href="/search/cs?searchtype=author&query=Felder%2C+F+N">Federico N Felder</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Ledda%2C+R+E">Roberta Eufrasia Ledda</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xiaoliu Ding</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+R">Ruiqi Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiping Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feng Shi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tianyang Sun</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zehong Cao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yun Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanxiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jian Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pingyu Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wen Tang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P">Pengxin Yu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+H">Han Kang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xing Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boyu Zhang</a>, <a 
href="/search/cs?searchtype=author&query=Mamalakis%2C+M">Michail Mamalakis</a> , et al. (16 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13752v2-abstract-short" style="display: inline;"> Airway-related quantitative imaging biomarkers are crucial for examination, diagnosis, and prognosis in pulmonary diseases. However, the manual delineation of airway trees remains prohibitively time-consuming. While significant efforts have been made towards enhancing airway modelling, current public-available datasets concentrate on lung diseases with moderate morphological variations. The intric… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13752v2-abstract-full').style.display = 'inline'; document.getElementById('2312.13752v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13752v2-abstract-full" style="display: none;"> Airway-related quantitative imaging biomarkers are crucial for examination, diagnosis, and prognosis in pulmonary diseases. However, the manual delineation of airway trees remains prohibitively time-consuming. While significant efforts have been made towards enhancing airway modelling, current public-available datasets concentrate on lung diseases with moderate morphological variations. The intricate honeycombing patterns present in the lung tissues of fibrotic lung disease patients exacerbate the challenges, often leading to various prediction errors. To address this issue, the 'Airway-Informed Quantitative CT Imaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was organized in conjunction with the official 2023 International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI). The airway structures were meticulously annotated by three experienced radiologists. Competitors were encouraged to develop automatic airway segmentation models with high robustness and generalization abilities, followed by exploring the most correlated QIB of mortality prediction. A training set of 120 high-resolution computerised tomography (HRCT) scans were publicly released with expert annotations and mortality status. The online validation set incorporated 52 HRCT scans from patients with fibrotic lung disease and the offline test set included 140 cases from fibrosis and COVID-19 patients. The results have shown that the capacity of extracting airway trees from patients with fibrotic lung disease could be enhanced by introducing voxel-wise weighted general union loss and continuity loss. In addition to the competitive image biomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5, p<0.0001) was revealed for survival prognostication compared with existing clinical measurements, clinician assessment and AI-based biomarkers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13752v2-abstract-full').style.display = 'none'; document.getElementById('2312.13752v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02813">arXiv:2312.02813</a> <span> [<a href="https://arxiv.org/pdf/2312.02813">pdf</a>, <a href="https://arxiv.org/format/2312.02813">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis via Bridging Image and Video Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fengyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaxi Gu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Songcen Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02813v2-abstract-short" style="display: inline;"> Diffusion models have made tremendous progress in text-driven image and video generation. Now text-to-image foundation models are widely applied to various downstream image synthesis tasks, such as controllable image generation and image editing, while downstream video synthesis tasks are less explored for several reasons. First, it requires huge memory and computation overhead to train a video ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02813v2-abstract-full').style.display = 'inline'; document.getElementById('2312.02813v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02813v2-abstract-full" style="display: none;"> Diffusion models have made tremendous progress in text-driven image and video generation. Now text-to-image foundation models are widely applied to various downstream image synthesis tasks, such as controllable image generation and image editing, while downstream video synthesis tasks are less explored for several reasons. First, it requires huge memory and computation overhead to train a video generation foundation model. Even with video foundation models, additional costly training is still required for downstream video synthesis tasks. Second, although some works extend image diffusion models into videos in a training-free manner, temporal consistency cannot be well preserved. Finally, these adaption methods are specifically designed for one task and fail to generalize to different tasks. To mitigate these issues, we propose a training-free general-purpose video synthesis framework, coined as {\bf BIVDiff}, via bridging specific image diffusion models and general text-to-video foundation diffusion models. 
Specifically, we first use a specific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for frame-wise video generation, then perform Mixed Inversion on the generated video, and finally input the inverted latents into the video diffusion models (e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework enables flexible image model selection for different purposes with strong task generalization and high efficiency. To validate the effectiveness and general use of BIVDiff, we perform a wide range of video synthesis tasks, including controllable video generation, video editing, video inpainting, and outpainting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02813v2-abstract-full').style.display = 'none'; document.getElementById('2312.02813v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024. Project page: https://bivdiff.github.io; GitHub repository: https://github.com/MCG-NJU/BIVDiff</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17177">arXiv:2310.17177</a> <span> [<a href="https://arxiv.org/pdf/2310.17177">pdf</a>, <a href="https://arxiv.org/format/2310.17177">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bridging The Gaps Between Token Pruning and Full Pre-training via Masked Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fengyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17177v1-abstract-short" style="display: inline;"> Despite the success of transformers on various computer vision tasks, they suffer from excessive memory and computational cost. Some works present dynamic vision transformers to accelerate inference by pruning redundant tokens. A key to improving token pruning is using well-trained models as initialization for faster convergence and better performance. However, current base models usually adopt fu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17177v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17177v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17177v1-abstract-full" style="display: none;"> Despite the success of transformers on various computer vision tasks, they suffer from excessive memory and computational cost. 
Some works present dynamic vision transformers to accelerate inference by pruning redundant tokens. A key to improving token pruning is using well-trained models as initialization for faster convergence and better performance. However, current base models usually adopt full image training, i.e., using full images as inputs and keeping the whole feature maps through the forward process, which causes inconsistencies with dynamic models that gradually reduce tokens, including calculation pattern, information amount and token selection strategy inconsistencies. Inspired by MAE which performs masking and reconstruction self-supervised task, we devise masked fine-tuning to bridge the gaps between pre-trained base models used for initialization and token pruning based dynamic vision transformers, by masking image patches and predicting the image class label based on left unmasked patches. Extensive experiments on ImageNet demonstrate that base models via masked fine-tuning gain strong occlusion robustness and ability against information loss. With this better initialization, Dynamic ViT achieves higher accuracies, especially under large token pruning ratios (e.g., 81.9% vs. 81.3%, and 62.3% vs. 58.9% for DeiT based Dynamic ViT/0.8 and Dynamic ViT/0.3). Moreover, we apply our method into different token pruning based dynamic vision transformers, different pre-trained models and randomly initialized models to demonstrate the generalization ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17177v1-abstract-full').style.display = 'none'; document.getElementById('2310.17177v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to TIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07654">arXiv:2310.07654</a> <span> [<a href="https://arxiv.org/pdf/2310.07654">pdf</a>, <a href="https://arxiv.org/format/2310.07654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Visual Neural Syntax Acquisition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+C+J">Cheng-I Jeff Lai</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yoon Kim</a>, <a href="/search/cs?searchtype=author&query=Gimpel%2C+K">Kevin Gimpel</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+S">Shiyu Chang</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+Y">Yung-Sung Chuang</a>, <a href="/search/cs?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/cs?searchtype=author&query=Cox%2C+D">David Cox</a>, <a href="/search/cs?searchtype=author&query=Harwath%2C+D">David Harwath</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/cs?searchtype=author&query=Glass%2C+J">James Glass</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07654v1-abstract-short" style="display: inline;"> We study phrase structure induction from visually-grounded speech. The core idea is to first segment the speech waveform into sequences of word segments, and subsequently induce phrase structure using the inferred segment-level continuous representations. We present the Audio-Visual Neural Syntax Learner (AV-NSL) that learns phrase structure by listening to audio and looking at images, without eve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07654v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07654v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07654v1-abstract-full" style="display: none;"> We study phrase structure induction from visually-grounded speech. The core idea is to first segment the speech waveform into sequences of word segments, and subsequently induce phrase structure using the inferred segment-level continuous representations. We present the Audio-Visual Neural Syntax Learner (AV-NSL) that learns phrase structure by listening to audio and looking at images, without ever being exposed to text. 
arXiv:2309.14359 (https://arxiv.org/abs/2309.14359) [pdf, other]
Subjects: math.OC (Optimization and Control); cs.AI (Artificial Intelligence)
Optimizing Chance-Constrained Submodular Problems with Variable Uncertainties
Authors: Xiankun Yan, Anh Viet Do, Feng Shi, Xiaoyu Qin, Frank Neumann
Abstract: Chance constraints are frequently used to limit the probability of constraint violations in real-world optimization problems where the constraints involve stochastic components. We study chance-constrained submodular optimization problems, which capture a wide range of optimization problems with stochastic constraints. Previous studies considered submodular problems with stochastic knapsack constraints in the case where uncertainties are the same for each item that can be selected. However, uncertainty levels are usually variable with respect to the different stochastic components in real-world scenarios, and rigorous analysis for this setting is missing in the context of submodular optimization. This paper provides the first such analysis for this case, where the weights of items have the same expectation but different dispersion. We present greedy algorithms that can obtain a high-quality solution, i.e., a constant approximation ratio to the given optimal solution from the deterministic setting. In the experiments, we demonstrate that the algorithms perform effectively on several chance-constrained instances of the maximum coverage problem and the influence maximization problem.
Submitted 23 September, 2023; originally announced September 2023.
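The abstract describes greedy algorithms for submodular maximization under a chance constraint where item weights share an expectation but differ in dispersion. The sketch below shows the general shape of such a greedy loop, using a surrogate constraint of "expected weight plus a dispersion-dependent safety margin"; the surrogate form, the `alpha` factor, and the gain-per-weight selection rule are assumptions for illustration, not the paper's exact algorithm or its approximation guarantee.

```python
def chance_constrained_greedy(items, f, budget, alpha, mu, sigma):
    """Greedy selection under a chance constraint on total stochastic weight.
    f(S): monotone submodular value oracle on a list of items.
    mu[i], sigma[i]: expected weight and dispersion of item i.
    Surrogate feasibility: sum(mu) + alpha * sqrt(sum(sigma^2)) <= budget.
    Sketch only; the paper's algorithms and analysis are more refined."""
    S, mu_sum, var_sum = [], 0.0, 0.0
    remaining = set(items)
    while remaining:
        best, best_score = None, 0.0
        for i in remaining:
            new_mu = mu_sum + mu[i]
            new_var = var_sum + sigma[i] ** 2
            if new_mu + alpha * new_var ** 0.5 > budget:
                continue                      # would violate the surrogate constraint
            gain = f(S + [i]) - f(S)
            score = gain / max(mu[i], 1e-9)   # marginal gain per unit expected weight
            if gain > 0 and score > best_score:
                best, best_score = i, score
        if best is None:
            break
        S.append(best)
        mu_sum += mu[best]
        var_sum += sigma[best] ** 2
        remaining.remove(best)
    return S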
arXiv:2309.14225 (https://arxiv.org/abs/2309.14225) [pdf, other]
Subjects: cs.RO (Robotics)
DOI: 10.1109/ICRA57147.2024.10610449 (https://doi.org/10.1109/ICRA57147.2024.10610449)
HumanMimic: Learning Natural Locomotion and Transitions for Humanoid Robot via Wasserstein Adversarial Imitation
Authors: Annan Tang, Takuma Hiraoka, Naoki Hiraoka, Fan Shi, Kento Kawaharazuka, Kunio Kojima, Kei Okada, Masayuki Inaba
Abstract: Transferring human motion skills to humanoid robots remains a significant challenge. In this study, we introduce a Wasserstein adversarial imitation learning system, allowing humanoid robots to replicate natural whole-body locomotion patterns and execute seamless transitions by mimicking human motions. First, we present a unified primitive-skeleton motion retargeting to mitigate morphological differences between arbitrary human demonstrators and humanoid robots. An adversarial critic component is integrated with Reinforcement Learning (RL) to guide the control policy to produce behaviors aligned with the data distribution of mixed reference motions. Additionally, we employ a specific Integral Probability Metric (IPM), namely the Wasserstein-1 distance with a novel soft boundary constraint, to stabilize the training process and prevent mode collapse. Our system is evaluated on a full-sized humanoid JAXON in the simulator. The resulting control policy demonstrates a wide range of locomotion patterns, including standing, push-recovery, squat walking, human-like straight-leg walking, and dynamic running. Notably, even in the absence of transition motions in the demonstration dataset, robots showcase an emerging ability to transition naturally between distinct locomotion patterns as the desired speed changes.
Submitted 23 April, 2024; v1 submitted 25 September, 2023; originally announced September 2023.
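For readers unfamiliar with Wasserstein adversarial imitation, the fragment below sketches a critic trained on expert versus policy state transitions, with a WGAN-style gradient penalty whose score is then used as the imitation reward for RL. The gradient penalty is a generic stand-in for a Lipschitz constraint; the paper's soft boundary constraint is not detailed in the abstract, and the network sizes and names here are assumptions.

```python
import torch
import torch.nn as nn

obs_dim = 64  # illustrative observation size; inputs are concatenated (s, s') pairs
critic = nn.Sequential(nn.Linear(obs_dim * 2, 256), nn.ReLU(),
                       nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 1))

def critic_loss(expert_pairs, policy_pairs, lam=10.0):
    """Wasserstein-1 surrogate: maximize E[critic(expert)] - E[critic(policy)],
    with a gradient penalty as a generic Lipschitz regularizer."""
    w_gap = critic(expert_pairs).mean() - critic(policy_pairs).mean()
    eps = torch.rand(expert_pairs.size(0), 1)
    mix = (eps * expert_pairs + (1 - eps) * policy_pairs).requires_grad_(True)
    grad = torch.autograd.grad(critic(mix).sum(), mix, create_graph=True)[0]
    gp = ((grad.norm(dim=1) - 1.0) ** 2).mean()
    return -w_gap + lam * gp          # minimized w.r.t. the critic parameters

def imitation_reward(state, next_state):
    # RL reward shaped by the critic's score of the policy's transition.
    with torch.no_grad():
        return critic(torch.cat([state, next_state], dim=-1))
```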
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.01219">arXiv:2309.01219</a> <span> [<a href="https://arxiv.org/pdf/2309.01219">pdf</a>, <a href="https://arxiv.org/format/2309.01219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Siren's Song in the AI Ocean: A Survey on Hallucination in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yafu Li</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Leyang Cui</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Deng Cai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lemao Liu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+T">Tingchen Fu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xinting Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+E">Enbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yulong Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longyue Wang</a>, <a href="/search/cs?searchtype=author&query=Luu%2C+A+T">Anh Tuan Luu</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+W">Wei Bi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Freda Shi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shuming Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01219v2-abstract-short" style="display: inline;"> While large language models (LLMs) have demonstrated remarkable capabilities across a range of downstream tasks, a significant concern revolves around their propensity to exhibit hallucinations: LLMs occasionally generate content that diverges from the user input, contradicts previously generated context, or misaligns with established world knowledge. This phenomenon poses a substantial challenge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01219v2-abstract-full').style.display = 'inline'; document.getElementById('2309.01219v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01219v2-abstract-full" style="display: none;"> While large language models (LLMs) have demonstrated remarkable capabilities across a range of downstream tasks, a significant concern revolves around their propensity to exhibit hallucinations: LLMs occasionally generate content that diverges from the user input, contradicts previously generated context, or misaligns with established world knowledge. This phenomenon poses a substantial challenge to the reliability of LLMs in real-world scenarios. 
arXiv:2308.15703 (https://arxiv.org/abs/2308.15703) [pdf, other]
Subjects: cs.IR (Information Retrieval); cs.LG (Machine Learning)
Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate Prediction
Authors: Jun Li, Jingjian Wang, Hongwei Wang, Xing Deng, Jielong Chen, Bing Cao, Zekun Wang, Guanjie Xu, Ge Zhang, Feng Shi, Hualei Liu
Abstract: Spatial-temporal information has been proven to be of great significance for click-through rate prediction tasks in online Location-Based Services (LBS), especially in mainstream food ordering platforms such as DoorDash, Uber Eats, Meituan, and Ele.me. Modeling user spatial-temporal preferences with sequential behavior data has become a hot topic in recommendation systems and online advertising. However, most existing methods either lack the representation of rich spatial-temporal information or only handle user behaviors of limited length, e.g., 100. In this paper, we tackle these problems by designing a new spatial-temporal modeling paradigm named Fragment and Integrate Network (FIN). FIN consists of two networks: (i) Fragment Network (FN) extracts Multiple Sub-Sequences (MSS) from lifelong sequential behavior data, and captures the specific spatial-temporal representation by modeling each MSS respectively. Here both a simplified attention and a complicated attention are adopted to balance the performance gain and resource consumption. (ii) Integrate Network (IN) builds a new integrated sequence by utilizing spatial-temporal interaction on MSS and captures the comprehensive spatial-temporal representation by modeling the integrated sequence with a complicated attention. Both public datasets and production datasets have demonstrated the accuracy and scalability of FIN. Since 2022, FIN has been fully deployed in the recommendation advertising system of Ele.me, one of the most popular online food ordering platforms in China, obtaining a 5.7% improvement in Click-Through Rate (CTR) and a 7.3% increase in Revenue Per Mille (RPM).
Submitted 29 August, 2023; originally announced August 2023.
Comments: Accepted by CIKM 2023 Applied Research Paper
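The fragment-then-integrate idea can be pictured as grouping a lifelong behavior sequence into sub-sequences by a spatial-temporal key, pooling each sub-sequence with attention, and then attending over the pooled fragment vectors. The sketch below is only that picture under assumed data fields (`geohash`, `hour_of_day`, `emb`) and a uniform attention unit; FIN's actual attention variants and keys may differ.

```python
import torch
import torch.nn as nn
from collections import defaultdict

def fragment(behaviors, key_fn):
    """Group a lifelong behavior sequence into sub-sequences by a spatial-temporal
    key, e.g. key_fn = lambda b: (b["geohash"], b["hour_of_day"]). Illustrative only."""
    groups = defaultdict(list)
    for b in behaviors:
        groups[key_fn(b)].append(b["emb"])            # each emb: (D,) tensor
    return [torch.stack(seq) for seq in groups.values()]

class FragmentIntegrate(nn.Module):
    def __init__(self, dim):                          # dim assumed divisible by 4
        super().__init__()
        self.frag_attn = nn.MultiheadAttention(dim, 4, batch_first=True)
        self.integrate_attn = nn.MultiheadAttention(dim, 4, batch_first=True)

    def forward(self, sub_sequences, query):
        # 1) Pool each sub-sequence with attention; query = candidate/context vector.
        q = query.view(1, 1, -1)
        pooled = []
        for seq in sub_sequences:                     # seq: (L_i, D)
            out, _ = self.frag_attn(q, seq[None], seq[None])
            pooled.append(out.squeeze(0).squeeze(0))
        frag_vecs = torch.stack(pooled)[None]         # (1, K, D)
        # 2) Integrate the fragment representations with a second attention.
        out, _ = self.integrate_attn(q, frag_vecs, frag_vecs)
        return out.squeeze(0).squeeze(0)              # (D,) user interest vector
```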
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CIKM 2023 Applied Research Paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.07193">arXiv:2308.07193</a> <span> [<a href="https://arxiv.org/pdf/2308.07193">pdf</a>, <a href="https://arxiv.org/format/2308.07193">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Task Offloading for Smart Glasses in Healthcare: Enhancing Detection of Elevated Body Temperature </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Naouri%2C+A">Abdenacer Naouri</a>, <a href="/search/cs?searchtype=author&query=Nouri%2C+N+A">Nabil Abdelkader Nouri</a>, <a href="/search/cs?searchtype=author&query=Qammar%2C+A">Attia Qammar</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Feifei Shi</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+H">Huansheng Ning</a>, <a href="/search/cs?searchtype=author&query=Dhelim%2C+S">Sahraoui Dhelim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.07193v1-abstract-short" style="display: inline;"> Wearable devices like smart glasses have gained popularity across various applications. However, their limited computational capabilities pose challenges for tasks that require extensive processing, such as image and video processing, leading to drained device batteries. To address this, offloading such tasks to nearby powerful remote devices, such as mobile devices or remote servers, has emerged… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.07193v1-abstract-full').style.display = 'inline'; document.getElementById('2308.07193v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.07193v1-abstract-full" style="display: none;"> Wearable devices like smart glasses have gained popularity across various applications. However, their limited computational capabilities pose challenges for tasks that require extensive processing, such as image and video processing, leading to drained device batteries. To address this, offloading such tasks to nearby powerful remote devices, such as mobile devices or remote servers, has emerged as a promising solution. This paper focuses on analyzing task-offloading scenarios for a healthcare monitoring application performed on smart wearable glasses, aiming to identify the optimal conditions for offloading. The study evaluates performance metrics including task completion time, computing capabilities, and energy consumption under realistic conditions. A specific use case is explored within an indoor area like an airport, where security agents wearing smart glasses to detect elevated body temperature in individuals, potentially indicating COVID-19. 
arXiv:2307.07734 (https://arxiv.org/abs/2307.07734) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Abstracting Concept-Changing Rules for Solving Raven's Progressive Matrix Problems
Authors: Fan Shi, Bin Li, Xiangyang Xue
Abstract: The abstract visual reasoning ability in human intelligence helps in discovering the underlying rules of novel environments. Raven's Progressive Matrix (RPM) is a classic test of such ability in machine intelligence, posed as selecting the answer from a set of candidates. Recent studies suggest that solving RPM by generating answers fosters a more in-depth understanding of the rules. However, existing generative solvers cannot discover the global concept-changing rules without auxiliary supervision (e.g., rule annotations and distractors in candidate sets). To this end, we propose a deep latent variable model for Concept-changing Rule ABstraction (CRAB) that learns interpretable concepts and parses concept-changing rules in the latent space. Through an iterative learning process, CRAB automatically abstracts, for each concept, the global rules shared across the dataset and forms learnable prior knowledge of global rules. CRAB outperforms the baselines trained without auxiliary supervision in the arbitrary-position answer generation task and achieves comparable and even higher accuracy than the compared models trained with auxiliary supervision. Finally, we conduct experiments to illustrate the interpretability of CRAB in concept learning, answer selection, and global rule abstraction.
Submitted 15 July, 2023; originally announced July 2023.
arXiv:2306.03799 (https://arxiv.org/abs/2306.03799) [pdf, other]
Subjects: cs.CL (Computation and Language)
Prompt Space Optimizing Few-shot Reasoning Success with Large Language Models
Authors: Fobo Shi, Peijun Qing, Dong Yang, Nan Wang, Youbo Lei, Haonan Lu, Xiaodong Lin, Duantengchuan Li
Abstract: Prompt engineering is an essential technique for enhancing the abilities of large language models (LLMs) by providing explicit and specific instructions. It enables LLMs to excel in various tasks, such as arithmetic reasoning, question answering, summarization, relation extraction, machine translation, and sentiment analysis. Researchers have been actively exploring different prompt engineering strategies, such as Chain of Thought (CoT), Zero-CoT, and In-context learning. However, an unresolved problem arises from the fact that current approaches lack a solid mathematical solution for determining optimal prompts. To address this issue in prompt engineering, we propose a new and effective approach called Prompt Space. Our methodology utilizes text embeddings to obtain basis vectors by matrix decomposition, and then constructs a space for representing all prompts. Prompt Space significantly outperforms state-of-the-art prompt paradigms on ten public reasoning benchmarks. Notably, without the help of the CoT method and the prompt "Let's think step by step", Prompt Space shows superior performance over the few-shot method. Overall, our approach provides a robust and effective mathematical framework for selecting simple and effective prompts. This advancement marks a significant step towards improving prompt engineering for a wide variety of applications in LLMs. Our code is publicly available at https://github.com/YouBLEI/Prompt-Space
Submitted 27 March, 2024; v1 submitted 6 June, 2023; originally announced June 2023.
Comments: Natural language processing (NLP)
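The abstract says Prompt Space obtains basis vectors from text embeddings by matrix decomposition and uses them to represent and select prompts. One simple reading of that idea is sketched below: embed candidate prompts, take the top right singular vectors of the embedding matrix, and pick the prompt most aligned with each direction. The embedding source and the selection rule are assumptions; the paper's exact construction may differ.

```python
import numpy as np

def select_basis_prompts(prompt_embs, k):
    """Pick k prompts whose embeddings best align with the top-k right singular
    vectors of the (centered) prompt-embedding matrix. prompt_embs: (n, d) array.
    A rough sketch of the 'basis vectors via matrix decomposition' idea."""
    X = prompt_embs - prompt_embs.mean(axis=0, keepdims=True)
    _, _, vt = np.linalg.svd(X, full_matrices=False)   # rows of vt: basis directions
    chosen = []
    for direction in vt[:k]:
        scores = np.abs(X @ direction)                 # alignment of each prompt
        order = np.argsort(-scores)
        pick = next(i for i in order if i not in chosen)
        chosen.append(int(pick))
    return chosen  # indices of selected prompts, e.g. used as few-shot exemplars
```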
arXiv:2305.15822 (https://arxiv.org/abs/2305.15822) [pdf, other]
Subjects: cs.LG (Machine Learning)
Towards Label Position Bias in Graph Neural Networks
Authors: Haoyu Han, Xiaorui Liu, Feng Shi, MohamadAli Torkamani, Charu C. Aggarwal, Jiliang Tang
Abstract: Graph Neural Networks (GNNs) have emerged as a powerful tool for semi-supervised node classification tasks. However, recent studies have revealed various biases in GNNs stemming from both node features and graph topology. In this work, we uncover a new bias, label position bias, which indicates that nodes closer to the labeled nodes tend to perform better. We introduce a new metric, the Label Proximity Score, to quantify this bias, and find that it is closely related to performance disparities. To address the label position bias, we propose a novel optimization framework for learning a label position unbiased graph structure, which can be applied to existing GNNs. Extensive experiments demonstrate that our proposed method not only outperforms backbone methods but also significantly mitigates the issue of label position bias in GNNs.
Submitted 25 May, 2023; originally announced May 2023.
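The abstract introduces a Label Proximity Score but does not define it. To make the notion of "closeness to labeled nodes" concrete, the sketch below computes a simple illustrative proximity measure (mean inverse shortest-path distance to the labeled set); it should not be read as the paper's metric.

```python
import networkx as nx

def label_proximity(graph, labeled_nodes):
    """Illustrative stand-in for a label-proximity measure: for each node, the mean
    inverse shortest-path distance to the labeled set (the paper's Label Proximity
    Score may be defined differently)."""
    dist_from_label = [nx.single_source_shortest_path_length(graph, s)
                       for s in labeled_nodes]
    scores = {}
    for v in graph.nodes:
        inv = [1.0 / d[v] for d in dist_from_label if v in d and d[v] > 0]
        scores[v] = sum(inv) / len(labeled_nodes) if inv else 0.0
    return scores

# Nodes with higher scores sit closer to the labels; comparing test accuracy across
# score buckets is one way to surface a label position bias.
```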
arXiv:2305.01360 (https://arxiv.org/abs/2305.01360) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Self-supervised arbitrary scale super-resolution framework for anisotropic MRI
Authors: Haonan Zhang, Yuhan Zhang, Qing Wu, Jiangjie Wu, Zhiming Zhen, Feng Shi, Jianmin Yuan, Hongjiang Wei, Chen Liu, Yuyao Zhang
Abstract: In this paper, we propose an efficient self-supervised arbitrary-scale super-resolution (SR) framework to reconstruct isotropic magnetic resonance (MR) images from anisotropic MRI inputs without involving external training data. The proposed framework builds a training dataset using in-the-wild anisotropic MR volumes with arbitrary image resolution. We then formulate the 3D volume SR task as an SR problem for 2D image slices. The anisotropic volume's high-resolution (HR) plane is used to build the HR-LR image pairs for model training. We further adapt the implicit neural representation (INR) network to implement the 2D arbitrary-scale image SR model. Finally, we leverage the well-trained proposed model to up-sample the 2D LR plane extracted from the anisotropic MR volumes to their HR views. The isotropic MR volumes can thus be reconstructed by stacking and averaging the generated HR slices. Our proposed framework has two major advantages: (1) It only involves the arbitrary-resolution anisotropic MR volumes, which greatly improves the model's practicality in real MR imaging scenarios (e.g., clinical brain image acquisition); (2) The INR-based SR model enables arbitrary-scale image SR from an arbitrary-resolution input image, which significantly improves model training efficiency. We perform experiments on a simulated public adult brain dataset and a real collected 7T brain dataset. The results indicate that our current framework greatly outperforms two well-known self-supervised models for anisotropic MR image SR tasks.
Submitted 2 May, 2023; originally announced May 2023.
Comments: 10 pages, 5 figures
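The self-supervised data-construction step described above can be pictured as follows: take high-resolution in-plane slices from the anisotropic volume and synthetically downsample them to form LR-HR pairs for a 2D super-resolution model. The axis convention and scale factors below are assumptions, and the INR-based SR model itself is not shown.

```python
import numpy as np
from scipy.ndimage import zoom

def build_hr_lr_pairs(volume, scales=(2.0, 3.0, 4.0)):
    """From an anisotropic volume (assumed high resolution in-plane along axes 0-1,
    low resolution along axis 2), take HR in-plane slices and synthetically
    downsample them to form (LR, HR, scale) training triples for a 2D SR model.
    Sketch of the data-construction step only."""
    pairs = []
    for k in range(volume.shape[2]):                  # iterate along the LR axis
        hr_slice = volume[:, :, k].astype(np.float32)
        for s in scales:
            lr_slice = zoom(hr_slice, 1.0 / s, order=3)   # simulate low resolution
            pairs.append((lr_slice, hr_slice, s))
    return pairs

# At inference, the trained 2D model up-samples slices taken along the LR plane,
# and the resulting HR slices are stacked (and averaged) into an isotropic volume.
```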
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14006">arXiv:2304.14006</a> <span> [<a href="https://arxiv.org/pdf/2304.14006">pdf</a>, <a href="https://arxiv.org/format/2304.14006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Edit Everything: A Text-Guided Generative System for Images Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+D">Defeng Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruichen Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jian Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haonan Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fobo Shi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiaodong Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.14006v1-abstract-short" style="display: inline;"> We introduce a new generative system called Edit Everything, which can take image and text inputs and produce image outputs. Edit Everything allows users to edit images using simple text instructions. Our system designs prompts to guide the visual module in generating requested images. Experiments demonstrate that Edit Everything facilitates the implementation of the visual aspects of Stable Diffu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14006v1-abstract-full').style.display = 'inline'; document.getElementById('2304.14006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.14006v1-abstract-full" style="display: none;"> We introduce a new generative system called Edit Everything, which can take image and text inputs and produce image outputs. Edit Everything allows users to edit images using simple text instructions. Our system designs prompts to guide the visual module in generating requested images. Experiments demonstrate that Edit Everything facilitates the implementation of the visual aspects of Stable Diffusion with the use of Segment Anything model and CLIP. Our system is publicly available at https://github.com/DefengXie/Edit_Everything. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14006v1-abstract-full').style.display = 'none'; document.getElementById('2304.14006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.03812">arXiv:2304.03812</a> <span> [<a href="https://arxiv.org/pdf/2304.03812">pdf</a>, <a href="https://arxiv.org/format/2304.03812">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> High-order Spatial Interactions Enhanced Lightweight Model for Optical Remote Sensing Image-based Small Ship Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yifan Yin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xu Cheng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+F">Fan Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiufeng Liu</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+H">Huan Huo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.03812v1-abstract-short" style="display: inline;"> Accurate and reliable optical remote sensing image-based small-ship detection is crucial for maritime surveillance systems, but existing methods often struggle with balancing detection performance and computational complexity. In this paper, we propose a novel lightweight framework called \textit{HSI-ShipDetectionNet} that is based on high-order spatial interactions and is suitable for deployment… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03812v1-abstract-full').style.display = 'inline'; document.getElementById('2304.03812v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.03812v1-abstract-full" style="display: none;"> Accurate and reliable optical remote sensing image-based small-ship detection is crucial for maritime surveillance systems, but existing methods often struggle with balancing detection performance and computational complexity. In this paper, we propose a novel lightweight framework called \textit{HSI-ShipDetectionNet} that is based on high-order spatial interactions and is suitable for deployment on resource-limited platforms, such as satellites and unmanned aerial vehicles. HSI-ShipDetectionNet includes a prediction branch specifically for tiny ships and a lightweight hybrid attention block for reduced complexity. Additionally, the use of a high-order spatial interactions module improves advanced feature understanding and modeling ability. Our model is evaluated using the public Kaggle marine ship detection dataset and compared with multiple state-of-the-art models including small object detection models, lightweight detection models, and ship detection models. The results show that HSI-ShipDetectionNet outperforms the other models in terms of recall, and mean average precision (mAP) while being lightweight and suitable for deployment on resource-limited platforms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03812v1-abstract-full').style.display = 'none'; document.getElementById('2304.03812v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+F&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+F&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> 
<li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>