Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 519 results for author: <span class="mathjax">Jia, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Jia%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Jia, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Jia%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Jia, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19230">arXiv:2411.19230</a> <span> [<a href="https://arxiv.org/pdf/2411.19230">pdf</a>, <a href="https://arxiv.org/format/2411.19230">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Pre-Training Graph Contrastive Masked Autoencoders are Strong Distillers for EEG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xinxu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kanhao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yong Jiao</a>, <a href="/search/cs?searchtype=author&query=Carlisle%2C+N+B">Nancy B. Carlisle</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Hua Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19230v1-abstract-short" style="display: inline;"> Effectively utilizing extensive unlabeled high-density EEG data to improve performance in scenarios with limited labeled low-density EEG data presents a significant challenge. In this paper, we address this by framing it as a graph transfer learning and knowledge distillation problem. We propose a Unified Pre-trained Graph Contrastive Masked Autoencoder Distiller, named EEG-DisGCMAE, to bridge the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19230v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19230v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19230v1-abstract-full" style="display: none;"> Effectively utilizing extensive unlabeled high-density EEG data to improve performance in scenarios with limited labeled low-density EEG data presents a significant challenge. In this paper, we address this by framing it as a graph transfer learning and knowledge distillation problem. We propose a Unified Pre-trained Graph Contrastive Masked Autoencoder Distiller, named EEG-DisGCMAE, to bridge the gap between unlabeled/labeled and high/low-density EEG data. To fully leverage the abundant unlabeled EEG data, we introduce a novel unified graph self-supervised pre-training paradigm, which seamlessly integrates Graph Contrastive Pre-training and Graph Masked Autoencoder Pre-training. This approach synergistically combines contrastive and generative pre-training techniques by reconstructing contrastive samples and contrasting the reconstructions. For knowledge distillation from high-density to low-density EEG data, we propose a Graph Topology Distillation loss function, allowing a lightweight student model trained on low-density data to learn from a teacher model trained on high-density data, effectively handling missing electrodes through contrastive distillation. To integrate transfer learning and distillation, we jointly pre-train the teacher and student models by contrasting their queries and keys during pre-training, enabling robust distillers for downstream tasks. We demonstrate the effectiveness of our method on four classification tasks across two clinical EEG datasets with abundant unlabeled data and limited labeled data. The experimental results show that our approach significantly outperforms contemporary methods in both efficiency and accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19230v1-abstract-full').style.display = 'none'; document.getElementById('2411.19230v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19099">arXiv:2411.19099</a> <span> [<a href="https://arxiv.org/pdf/2411.19099">pdf</a>, <a href="https://arxiv.org/format/2411.19099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Software Maintenance: A Learning to Rank Approach for Co-changed Method Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yiping Jia</a>, <a href="/search/cs?searchtype=author&query=Hassan%2C+S">Safwat Hassan</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Ying Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19099v1-abstract-short" style="display: inline;"> With the increasing complexity of large-scale software systems, identifying all necessary modifications for a specific change is challenging. Co-changed methods, which are methods frequently modified together, are crucial for understanding software dependencies. However, existing methods often produce large results with high false positives. Focusing on pull requests instead of individual commits… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19099v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19099v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19099v1-abstract-full" style="display: none;"> With the increasing complexity of large-scale software systems, identifying all necessary modifications for a specific change is challenging. Co-changed methods, which are methods frequently modified together, are crucial for understanding software dependencies. However, existing methods often produce large results with high false positives. Focusing on pull requests instead of individual commits provides a more comprehensive view of related changes, capturing essential co-change relationships. To address these challenges, we propose a learning-to-rank approach that combines source code features and change history to predict and rank co-changed methods at the pull-request level. Experiments on 150 open-source Java projects, totaling 41.5 million lines of code and 634,216 pull requests, show that the Random Forest model outperforms other models by 2.5 to 12.8 percent in NDCG@5. It also surpasses baselines such as file proximity, code clones, FCP2Vec, and StarCoder 2 by 4.7 to 537.5 percent. Models trained on longer historical data (90 to 180 days) perform consistently, while accuracy declines after 60 days, highlighting the need for bi-monthly retraining. This approach provides an effective tool for managing co-changed methods, enabling development teams to handle dependencies and maintain software quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19099v1-abstract-full').style.display = 'none'; document.getElementById('2411.19099v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18623">arXiv:2411.18623</a> <span> [<a href="https://arxiv.org/pdf/2411.18623">pdf</a>, <a href="https://arxiv.org/format/2411.18623">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Lift3D Foundation Policy: Lifting 2D Large-Scale Pretrained Models for Robust 3D Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yueru Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sixiang Chen</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenyang Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhilue Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Longzan Luo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+L">Lily Lee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18623v1-abstract-short" style="display: inline;"> 3D geometric information is essential for manipulation tasks, as robots need to perceive the 3D environment, reason about spatial relationships, and interact with intricate spatial configurations. Recent research has increasingly focused on the explicit extraction of 3D features, while still facing challenges such as the lack of large-scale robotic 3D data and the potential loss of spatial geometr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18623v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18623v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18623v1-abstract-full" style="display: none;"> 3D geometric information is essential for manipulation tasks, as robots need to perceive the 3D environment, reason about spatial relationships, and interact with intricate spatial configurations. Recent research has increasingly focused on the explicit extraction of 3D features, while still facing challenges such as the lack of large-scale robotic 3D data and the potential loss of spatial geometry. To address these limitations, we propose the Lift3D framework, which progressively enhances 2D foundation models with implicit and explicit 3D robotic representations to construct a robust 3D manipulation policy. Specifically, we first design a task-aware masked autoencoder that masks task-relevant affordance patches and reconstructs depth information, enhancing the 2D foundation model's implicit 3D robotic representation. After self-supervised fine-tuning, we introduce a 2D model-lifting strategy that establishes a positional mapping between the input 3D points and the positional embeddings of the 2D model. Based on the mapping, Lift3D utilizes the 2D foundation model to directly encode point cloud data, leveraging large-scale pretrained knowledge to construct explicit 3D robotic representations while minimizing spatial information loss. In experiments, Lift3D consistently outperforms previous state-of-the-art methods across several simulation benchmarks and real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18623v1-abstract-full').style.display = 'none'; document.getElementById('2411.18623v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16095">arXiv:2411.16095</a> <span> [<a href="https://arxiv.org/pdf/2411.16095">pdf</a>, <a href="https://arxiv.org/format/2411.16095">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LDACP: Long-Delayed Ad Conversions Prediction Model for Bidding Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+P">Peng Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiming Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+F">Fusheng Jin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siyuan Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunli Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fukang Yang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yalong Jia</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Q">Qingpeng Cai</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+F">Fei Pan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changcheng Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16095v1-abstract-short" style="display: inline;"> In online advertising, once an ad campaign is deployed, the automated bidding system dynamically adjusts the bidding strategy to optimize Cost Per Action (CPA) based on the number of ad conversions. For ads with a long conversion delay, relying solely on the real-time tracked conversion number as a signal for bidding strategy can significantly overestimate the current CPA, leading to conservative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16095v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16095v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16095v1-abstract-full" style="display: none;"> In online advertising, once an ad campaign is deployed, the automated bidding system dynamically adjusts the bidding strategy to optimize Cost Per Action (CPA) based on the number of ad conversions. For ads with a long conversion delay, relying solely on the real-time tracked conversion number as a signal for bidding strategy can significantly overestimate the current CPA, leading to conservative bidding strategies. Therefore, it is crucial to predict the number of long-delayed conversions. Nonetheless, it is challenging to predict ad conversion numbers through traditional regression methods due to the wide range of ad conversion numbers. Previous regression works have addressed this challenge by transforming regression problems into bucket classification problems, achieving success in various scenarios. However, specific challenges arise when predicting the number of ad conversions: 1) The integer nature of ad conversion numbers exacerbates the discontinuity issue in one-hot hard labels; 2) The long-tail distribution of ad conversion numbers complicates tail data prediction. In this paper, we propose the Long-Delayed Ad Conversions Prediction model for bidding strategy (LDACP), which consists of two sub-modules. To alleviate the issue of discontinuity in one-hot hard labels, the Bucket Classification Module with label Smoothing method (BCMS) converts one-hot hard labels into non-normalized soft labels, then fits these soft labels by minimizing classification loss and regression loss. To address the challenge of predicting tail data, the Value Regression Module with Proxy labels (VRMP) uses the prediction bias of aggregated pCTCVR as proxy labels. Finally, a Mixture of Experts (MoE) structure integrates the predictions from BCMS and VRMP to obtain the final predicted ad conversion number. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16095v1-abstract-full').style.display = 'none'; document.getElementById('2411.16095v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11551">arXiv:2411.11551</a> <span> [<a href="https://arxiv.org/pdf/2411.11551">pdf</a>, <a href="https://arxiv.org/format/2411.11551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Simple But Not Secure: An Empirical Security Analysis of Two-factor Authentication Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Du Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Han Gao</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+M">Meiqi Tian</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yan Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wanpeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11551v1-abstract-short" style="display: inline;"> To protect users from data breaches and phishing attacks, service providers typically implement two-factor authentication (2FA) to add an extra layer of security against suspicious login attempts. However, since 2FA can sometimes hinder user experience by introducing additional steps, many websites aim to reduce inconvenience by minimizing the frequency of 2FA prompts. One approach to achieve this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11551v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11551v1-abstract-full" style="display: none;"> To protect users from data breaches and phishing attacks, service providers typically implement two-factor authentication (2FA) to add an extra layer of security against suspicious login attempts. However, since 2FA can sometimes hinder user experience by introducing additional steps, many websites aim to reduce inconvenience by minimizing the frequency of 2FA prompts. One approach to achieve this is by storing the user's ``Remember the Device'' preference in a cookie. As a result, users are only prompted for 2FA when this cookie expires or if they log in from a new device. To understand and improve the security of 2FA systems in real-world settings, we propose SE2FA, a vulnerability evaluation framework designed to detect vulnerabilities in 2FA systems. This framework enables us to analyze the security of 407 2FA systems across popular websites from the Tranco Top 10,000 list. Our analysis and evaluation found three zero-day vulnerabilities on three service providers that could allow an attacker to access a victim's account without possessing the victim's second authentication factor, thereby bypassing 2FA protections entirely. A further investigation found that these vulnerabilities stem from design choices aimed at simplifying 2FA for users but that unintentionally reduce its security effectiveness. We have disclosed these findings to the affected websites and assisted them in mitigating the risks. Based on the insights from this research, we provide practical recommendations for countermeasures to strengthen 2FA security and address these newly identified threats. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11551v1-abstract-full').style.display = 'none'; document.getElementById('2411.11551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06773">arXiv:2411.06773</a> <span> [<a href="https://arxiv.org/pdf/2411.06773">pdf</a>, <a href="https://arxiv.org/format/2411.06773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Model Partition and Resource Allocation for Split Learning in Vehicular Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lu Yu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Z">Zheng Chang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yunjian Jia</a>, <a href="/search/cs?searchtype=author&query=Min%2C+G">Geyong Min</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06773v1-abstract-short" style="display: inline;"> The integration of autonomous driving technologies with vehicular networks presents significant challenges in privacy preservation, communication efficiency, and resource allocation. This paper proposes a novel U-shaped split federated learning (U-SFL) framework to address these challenges on the way of realizing in vehicular edge networks. U-SFL is able to enhance privacy protection by keeping bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06773v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06773v1-abstract-full" style="display: none;"> The integration of autonomous driving technologies with vehicular networks presents significant challenges in privacy preservation, communication efficiency, and resource allocation. This paper proposes a novel U-shaped split federated learning (U-SFL) framework to address these challenges on the way of realizing in vehicular edge networks. U-SFL is able to enhance privacy protection by keeping both raw data and labels on the vehicular user (VU) side while enabling parallel processing across multiple vehicles. To optimize communication efficiency, we introduce a semantic-aware auto-encoder (SAE) that significantly reduces the dimensionality of transmitted data while preserving essential semantic information. Furthermore, we develop a deep reinforcement learning (DRL) based algorithm to solve the NP-hard problem of dynamic resource allocation and split point selection. Our comprehensive evaluation demonstrates that U-SFL achieves comparable classification performance to traditional split learning (SL) while substantially reducing data transmission volume and communication latency. The proposed DRL-based optimization algorithm shows good convergence in balancing latency, energy consumption, and learning performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06773v1-abstract-full').style.display = 'none'; document.getElementById('2411.06773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2306.12194 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05842">arXiv:2411.05842</a> <span> [<a href="https://arxiv.org/pdf/2411.05842">pdf</a>, <a href="https://arxiv.org/format/2411.05842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Robust Freeway Traffic Speed Estimation under Oblique Grid using Vehicle Trajectory Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yang He</a>, <a href="/search/cs?searchtype=author&query=An%2C+C">Chengchuan An</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiachao Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhenbo Lu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jingxin Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05842v1-abstract-short" style="display: inline;"> Accurately estimating spatiotemporal traffic states on freeways is a significant challenge due to limited sensor deployment and potential data corruption. In this study, we propose an efficient and robust low-rank model for precise spatiotemporal traffic speed state estimation (TSE) using lowpenetration vehicle trajectory data. Leveraging traffic wave priors, an oblique grid-based matrix is first… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05842v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05842v1-abstract-full" style="display: none;"> Accurately estimating spatiotemporal traffic states on freeways is a significant challenge due to limited sensor deployment and potential data corruption. In this study, we propose an efficient and robust low-rank model for precise spatiotemporal traffic speed state estimation (TSE) using lowpenetration vehicle trajectory data. Leveraging traffic wave priors, an oblique grid-based matrix is first designed to transform the inherent dependencies of spatiotemporal traffic states into the algebraic low-rankness of a matrix. Then, with the enhanced traffic state low-rankness in the oblique matrix, a low-rank matrix completion method is tailored to explicitly capture spatiotemporal traffic propagation characteristics and precisely reconstruct traffic states. In addition, an anomaly-tolerant module based on a sparse matrix is developed to accommodate corrupted data input and thereby improve the TSE model robustness. Notably, driven by the understanding of traffic waves, the computational complexity of the proposed efficient method is only correlated with the problem size itself, not with dataset size and hyperparameter selection prevalent in existing studies. Extensive experiments demonstrate the effectiveness, robustness, and efficiency of the proposed model. The performance of the proposed method achieves up to a 12% improvement in Root Mean Squared Error (RMSE) in the TSE scenarios and an 18% improvement in RMSE in the robust TSE scenarios, and it runs more than 20 times faster than the state-of-the-art (SOTA) methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05842v1-abstract-full').style.display = 'none'; document.getElementById('2411.05842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by T-ITS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04368">arXiv:2411.04368</a> <span> [<a href="https://arxiv.org/pdf/2411.04368">pdf</a>, <a href="https://arxiv.org/format/2411.04368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Measuring short-form factuality in large language models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jason Wei</a>, <a href="/search/cs?searchtype=author&query=Karina%2C+N">Nguyen Karina</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y+J">Yunxin Joy Jiao</a>, <a href="/search/cs?searchtype=author&query=Papay%2C+S">Spencer Papay</a>, <a href="/search/cs?searchtype=author&query=Glaese%2C+A">Amelia Glaese</a>, <a href="/search/cs?searchtype=author&query=Schulman%2C+J">John Schulman</a>, <a href="/search/cs?searchtype=author&query=Fedus%2C+W">William Fedus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04368v1-abstract-short" style="display: inline;"> We present SimpleQA, a benchmark that evaluates the ability of language models to answer short, fact-seeking questions. We prioritized two properties in designing this eval. First, SimpleQA is challenging, as it is adversarially collected against GPT-4 responses. Second, responses are easy to grade, because questions are created such that there exists only a single, indisputable answer. Each answe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04368v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04368v1-abstract-full" style="display: none;"> We present SimpleQA, a benchmark that evaluates the ability of language models to answer short, fact-seeking questions. We prioritized two properties in designing this eval. First, SimpleQA is challenging, as it is adversarially collected against GPT-4 responses. Second, responses are easy to grade, because questions are created such that there exists only a single, indisputable answer. Each answer in SimpleQA is graded as either correct, incorrect, or not attempted. A model with ideal behavior would get as many questions correct as possible while not attempting the questions for which it is not confident it knows the correct answer. SimpleQA is a simple, targeted evaluation for whether models "know what they know," and our hope is that this benchmark will remain relevant for the next few generations of frontier models. SimpleQA can be found at https://github.com/openai/simple-evals. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04368v1-abstract-full').style.display = 'none'; document.getElementById('2411.04368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Blog post: https://openai.com/index/introducing-simpleqa/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00904">arXiv:2411.00904</a> <span> [<a href="https://arxiv.org/pdf/2411.00904">pdf</a>, <a href="https://arxiv.org/format/2411.00904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Similarity and Dissimilarity Guided Co-association Matrix Construction for Ensemble Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xu Zhang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mofei Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ran Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00904v1-abstract-short" style="display: inline;"> Ensemble clustering aggregates multiple weak clusterings to achieve a more accurate and robust consensus result. The Co-Association matrix (CA matrix) based method is the mainstream ensemble clustering approach that constructs the similarity relationships between sample pairs according the weak clustering partitions to generate the final clustering result. However, the existing methods neglect tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00904v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00904v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00904v1-abstract-full" style="display: none;"> Ensemble clustering aggregates multiple weak clusterings to achieve a more accurate and robust consensus result. The Co-Association matrix (CA matrix) based method is the mainstream ensemble clustering approach that constructs the similarity relationships between sample pairs according the weak clustering partitions to generate the final clustering result. However, the existing methods neglect that the quality of cluster is related to its size, i.e., a cluster with smaller size tends to higher accuracy. Moreover, they also do not consider the valuable dissimilarity information in the base clusterings which can reflect the varying importance of sample pairs that are completely disconnected. To this end, we propose the Similarity and Dissimilarity Guided Co-association matrix (SDGCA) to achieve ensemble clustering. First, we introduce normalized ensemble entropy to estimate the quality of each cluster, and construct a similarity matrix based on this estimation. Then, we employ the random walk to explore high-order proximity of base clusterings to construct a dissimilarity matrix. Finally, the adversarial relationship between the similarity matrix and the dissimilarity matrix is utilized to construct a promoted CA matrix for ensemble clustering. We compared our method with 13 state-of-the-art methods across 12 datasets, and the results demonstrated the superiority clustering ability and robustness of the proposed approach. The code is available at https://github.com/xuz2019/SDGCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00904v1-abstract-full').style.display = 'none'; document.getElementById('2411.00904v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22848">arXiv:2410.22848</a> <span> [<a href="https://arxiv.org/pdf/2410.22848">pdf</a>, <a href="https://arxiv.org/format/2410.22848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Non-contact Dexterous Micromanipulation with Multiple Optoelectronic Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yongyi Jia</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+S">Shu Miao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ao Wang</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+C">Caiding Ni</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Lin Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaowo Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22848v1-abstract-short" style="display: inline;"> Micromanipulation systems leverage automation and robotic technologies to improve the precision, repeatability, and efficiency of various tasks at the microscale. However, current approaches are typically limited to specific objects or tasks, which necessitates the use of custom tools and specialized grasping methods. This paper proposes a novel non-contact micromanipulation method based on optoel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22848v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22848v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22848v1-abstract-full" style="display: none;"> Micromanipulation systems leverage automation and robotic technologies to improve the precision, repeatability, and efficiency of various tasks at the microscale. However, current approaches are typically limited to specific objects or tasks, which necessitates the use of custom tools and specialized grasping methods. This paper proposes a novel non-contact micromanipulation method based on optoelectronic technologies. The proposed method utilizes repulsive dielectrophoretic forces generated in the optoelectronic field to drive a microrobot, enabling the microrobot to push the target object in a cluttered environment without physical contact. The non-contact feature can minimize the risks of potential damage, contamination, or adhesion while largely improving the flexibility of manipulation. The feature enables the use of a general tool for indirect object manipulation, eliminating the need for specialized tools. A series of simulation studies and real-world experiments -- including non-contact trajectory tracking, obstacle avoidance, and reciprocal avoidance between multiple microrobots -- are conducted to validate the performance of the proposed method. The proposed formulation provides a general and dexterous solution for a range of objects and tasks at the micro scale. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22848v1-abstract-full').style.display = 'none'; document.getElementById('2410.22848v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22629">arXiv:2410.22629</a> <span> [<a href="https://arxiv.org/pdf/2410.22629">pdf</a>, <a href="https://arxiv.org/format/2410.22629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable Remote Sensing Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Ziyang Gong</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhixiang Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xianzheng Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongruixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuru Jia</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yupeng Deng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Z">Zhenming Ji</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiangwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Yokoya%2C+N">Naoto Yokoya</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liangpei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22629v2-abstract-short" style="display: inline;"> The field of Remote Sensing Domain Generalization (RSDG) has emerged as a critical and valuable research frontier, focusing on developing models that generalize effectively across diverse scenarios. Despite the substantial domain gaps in RS images that are characterized by variabilities such as location, wavelength, and sensor type, research in this area remains underexplored: (1) Current cross-do… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22629v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22629v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22629v2-abstract-full" style="display: none;"> The field of Remote Sensing Domain Generalization (RSDG) has emerged as a critical and valuable research frontier, focusing on developing models that generalize effectively across diverse scenarios. Despite the substantial domain gaps in RS images that are characterized by variabilities such as location, wavelength, and sensor type, research in this area remains underexplored: (1) Current cross-domain methods primarily focus on Domain Adaptation (DA), which adapts models to predefined domains rather than to unseen ones; (2) Few studies targeting the RSDG issue, especially for semantic segmentation tasks, where existing models are developed for specific unknown domains, struggling with issues of underfitting on other unknown scenarios; (3) Existing RS foundation models tend to prioritize in-domain performance over cross-domain generalization. To this end, we introduce the first vision foundation model for RSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong cross-domain generalization through a specially designed data-level Earth-Style Injection pipeline and a model-level Multi-Task Training pipeline. In addition, for the semantic segmentation task, we have curated an RSDG benchmark comprising 28 cross-domain settings across various regions, spectral bands, platforms, and climates, providing a comprehensive framework for testing the generalizability of future RSDG models. Extensive experiments on this benchmark demonstrate the superiority of CrossEarth over existing state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22629v2-abstract-full').style.display = 'none'; document.getElementById('2410.22629v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The codes and models will be available at https://github.com/Cuzyoung/CrossEarth</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18388">arXiv:2410.18388</a> <span> [<a href="https://arxiv.org/pdf/2410.18388">pdf</a>, <a href="https://arxiv.org/ps/2410.18388">ps</a>, <a href="https://arxiv.org/format/2410.18388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Irregular Tensor Low-Rank Representation for Hyperspectral Image Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junhui Hou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18388v1-abstract-short" style="display: inline;"> Spectral variation is a common problem for hyperspectral image (HSI) representation. Low-rank tensor representation is an important approach to alleviate spectral variations. However, the spatial distribution of the HSI is always irregular, while the previous tensor low-rank representation methods can only be applied to the regular data cubes, which limits the performance. To remedy this issue, in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18388v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18388v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18388v1-abstract-full" style="display: none;"> Spectral variation is a common problem for hyperspectral image (HSI) representation. Low-rank tensor representation is an important approach to alleviate spectral variations. However, the spatial distribution of the HSI is always irregular, while the previous tensor low-rank representation methods can only be applied to the regular data cubes, which limits the performance. To remedy this issue, in this paper we propose a novel irregular tensor low-rank representation model. We first segment the HSI data into several irregular homogeneous regions. Then, we propose a novel irregular tensor low-rank representation method that can efficiently model the irregular 3D cubes. We further use a non-convex nuclear norm to pursue the low-rankness and introduce a negative global low-rank term that improves global consistency. This proposed model is finally formulated as a convex-concave optimization problem and solved by alternative augmented Lagrangian method. Through experiments on four public datasets, the proposed method outperforms the existing low-rank based HSI methods significantly. Code is available at: https://github.com/hb-studying/ITLRR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18388v1-abstract-full').style.display = 'none'; document.getElementById('2410.18388v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14041">arXiv:2410.14041</a> <span> [<a href="https://arxiv.org/pdf/2410.14041">pdf</a>, <a href="https://arxiv.org/format/2410.14041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Barriers to Tactics: A Behavioral Science-Informed Agentic Workflow for Personalized Nutrition Coaching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+E">Eric Yang</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+T">Tomas Garcia</a>, <a href="/search/cs?searchtype=author&query=Williams%2C+H">Hannah Williams</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+B">Bhawesh Kumar</a>, <a href="/search/cs?searchtype=author&query=Ram%C3%A9%2C+M">Martin Ram茅</a>, <a href="/search/cs?searchtype=author&query=Rivera%2C+E">Eileen Rivera</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yiran Ma</a>, <a href="/search/cs?searchtype=author&query=Amar%2C+J">Jonathan Amar</a>, <a href="/search/cs?searchtype=author&query=Catalani%2C+C">Caricia Catalani</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yugang Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14041v1-abstract-short" style="display: inline;"> Effective management of cardiometabolic conditions requires sustained positive nutrition habits, often hindered by complex and individualized barriers. Direct human management is simply not scalable, while previous attempts aimed at automating nutrition coaching lack the personalization needed to address these diverse challenges. This paper introduces a novel LLM-powered agentic workflow designed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14041v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14041v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14041v1-abstract-full" style="display: none;"> Effective management of cardiometabolic conditions requires sustained positive nutrition habits, often hindered by complex and individualized barriers. Direct human management is simply not scalable, while previous attempts aimed at automating nutrition coaching lack the personalization needed to address these diverse challenges. This paper introduces a novel LLM-powered agentic workflow designed to provide personalized nutrition coaching by directly targeting and mitigating patient-specific barriers. Grounded in behavioral science principles, the workflow leverages a comprehensive mapping of nutrition-related barriers to corresponding evidence-based strategies. A specialized LLM agent intentionally probes for and identifies the root cause of a patient's dietary struggles. Subsequently, a separate LLM agent delivers tailored tactics designed to overcome those specific barriers with patient context. We designed and validated our approach through a user study with individuals with cardiometabolic conditions, demonstrating the system's ability to accurately identify barriers and provide personalized guidance. Furthermore, we conducted a large-scale simulation study, grounding on real patient vignettes and expert-validated metrics, to evaluate the system's performance across a wide range of scenarios. Our findings demonstrate the potential of this LLM-powered agentic workflow to improve nutrition coaching by providing personalized, scalable, and behaviorally-informed interventions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14041v1-abstract-full').style.display = 'none'; document.getElementById('2410.14041v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13738">arXiv:2410.13738</a> <span> [<a href="https://arxiv.org/pdf/2410.13738">pdf</a>, <a href="https://arxiv.org/ps/2410.13738">ps</a>, <a href="https://arxiv.org/format/2410.13738">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improved Convergence Rate for Diffusion Probabilistic Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Gen Li</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yuchen Jiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13738v1-abstract-short" style="display: inline;"> Score-based diffusion models have achieved remarkable empirical performance in the field of machine learning and artificial intelligence for their ability to generate high-quality new data instances from complex distributions. Improving our understanding of diffusion models, including mainly convergence analysis for such models, has attracted a lot of interests. Despite a lot of theoretical attemp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13738v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13738v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13738v1-abstract-full" style="display: none;"> Score-based diffusion models have achieved remarkable empirical performance in the field of machine learning and artificial intelligence for their ability to generate high-quality new data instances from complex distributions. Improving our understanding of diffusion models, including mainly convergence analysis for such models, has attracted a lot of interests. Despite a lot of theoretical attempts, there still exists significant gap between theory and practice. Towards to close this gap, we establish an iteration complexity at the order of $d^{1/3}\varepsilon^{-2/3}$, which is better than $d^{5/12}\varepsilon^{-1}$, the best known complexity achieved before our work. This convergence analysis is based on a randomized midpoint method, which is first proposed for log-concave sampling (Shen and Lee, 2019), and then extended to diffusion models by Gupta et al. (2024). Our theory accommodates $\varepsilon$-accurate score estimates, and does not require log-concavity on the target distribution. Moreover, the algorithm can also be parallelized to run in only $O(\log^2(d/\varepsilon))$ parallel rounds in a similar way to prior works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13738v1-abstract-full').style.display = 'none'; document.getElementById('2410.13738v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13579">arXiv:2410.13579</a> <span> [<a href="https://arxiv.org/pdf/2410.13579">pdf</a>, <a href="https://arxiv.org/format/2410.13579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Better Performance in Incomplete LDL: Addressing Data Imbalance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kou%2C+Z">Zhiqiang Kou</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+H">Haoyuan Xuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xin Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13579v1-abstract-short" style="display: inline;"> Label Distribution Learning (LDL) is a novel machine learning paradigm that addresses the problem of label ambiguity and has found widespread applications. Obtaining complete label distributions in real-world scenarios is challenging, which has led to the emergence of Incomplete Label Distribution Learning (InLDL). However, the existing InLDL methods overlook a crucial aspect of LDL data: the inhe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13579v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13579v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13579v1-abstract-full" style="display: none;"> Label Distribution Learning (LDL) is a novel machine learning paradigm that addresses the problem of label ambiguity and has found widespread applications. Obtaining complete label distributions in real-world scenarios is challenging, which has led to the emergence of Incomplete Label Distribution Learning (InLDL). However, the existing InLDL methods overlook a crucial aspect of LDL data: the inherent imbalance in label distributions. To address this limitation, we propose \textbf{Incomplete and Imbalance Label Distribution Learning (I$^2$LDL)}, a framework that simultaneously handles incomplete labels and imbalanced label distributions. Our method decomposes the label distribution matrix into a low-rank component for frequent labels and a sparse component for rare labels, effectively capturing the structure of both head and tail labels. We optimize the model using the Alternating Direction Method of Multipliers (ADMM) and derive generalization error bounds via Rademacher complexity, providing strong theoretical guarantees. Extensive experiments on 15 real-world datasets demonstrate the effectiveness and robustness of our proposed framework compared to existing InLDL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13579v1-abstract-full').style.display = 'none'; document.getElementById('2410.13579v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13240">arXiv:2410.13240</a> <span> [<a href="https://arxiv.org/pdf/2410.13240">pdf</a>, <a href="https://arxiv.org/format/2410.13240">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TRLO: An Efficient LiDAR Odometry with 3D Dynamic Object Tracking and Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yanpeng Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+S">Shiliang Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13240v1-abstract-short" style="display: inline;"> Simultaneous state estimation and mapping is an essential capability for mobile robots working in dynamic urban environment. The majority of existing SLAM solutions heavily rely on a primarily static assumption. However, due to the presence of moving vehicles and pedestrians, this assumption does not always hold, leading to localization accuracy decreased and maps distorted. To address this challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13240v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13240v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13240v1-abstract-full" style="display: none;"> Simultaneous state estimation and mapping is an essential capability for mobile robots working in dynamic urban environment. The majority of existing SLAM solutions heavily rely on a primarily static assumption. However, due to the presence of moving vehicles and pedestrians, this assumption does not always hold, leading to localization accuracy decreased and maps distorted. To address this challenge, we propose TRLO, a dynamic LiDAR odometry that efficiently improves the accuracy of state estimation and generates a cleaner point cloud map. To efficiently detect dynamic objects in the surrounding environment, a deep learning-based method is applied, generating detection bounding boxes. We then design a 3D multi-object tracker based on Unscented Kalman Filter (UKF) and nearest neighbor (NN) strategy to reliably identify and remove dynamic objects. Subsequently, a fast two-stage iterative nearest point solver is employed to solve the state estimation using cleaned static point cloud. Note that a novel hash-based keyframe database management is proposed for fast access to search keyframes. Furthermore, all the detected object bounding boxes are leveraged to impose posture consistency constraint to further refine the final state estimation. Extensive evaluations and ablation studies conducted on the KITTI and UrbanLoco datasets demonstrate that our approach not only achieves more accurate state estimation but also generates cleaner maps, compared with baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13240v1-abstract-full').style.display = 'none'; document.getElementById('2410.13240v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8pages, 5figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09383">arXiv:2410.09383</a> <span> [<a href="https://arxiv.org/pdf/2410.09383">pdf</a>, <a href="https://arxiv.org/ps/2410.09383">ps</a>, <a href="https://arxiv.org/format/2410.09383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Deep Transfer Learning: Model Framework and Error Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yuling Jiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huazhen Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yuchen Luo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J+Z">Jerry Zhijian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09383v1-abstract-short" style="display: inline;"> This paper presents a framework for deep transfer learning, which aims to leverage information from multi-domain upstream data with a large number of samples $n$ to a single-domain downstream task with a considerably smaller number of samples $m$, where $m \ll n$, in order to enhance performance on downstream task. Our framework has several intriguing features. First, it allows the existence of bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09383v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09383v1-abstract-full" style="display: none;"> This paper presents a framework for deep transfer learning, which aims to leverage information from multi-domain upstream data with a large number of samples $n$ to a single-domain downstream task with a considerably smaller number of samples $m$, where $m \ll n$, in order to enhance performance on downstream task. Our framework has several intriguing features. First, it allows the existence of both shared and specific features among multi-domain data and provides a framework for automatic identification, achieving precise transfer and utilization of information. Second, our model framework explicitly indicates the upstream features that contribute to downstream tasks, establishing a relationship between upstream domains and downstream tasks, thereby enhancing interpretability. Error analysis demonstrates that the transfer under our framework can significantly improve the convergence rate for learning Lipschitz functions in downstream supervised tasks, reducing it from $\tilde{O}(m^{-\frac{1}{2(d+2)}}+n^{-\frac{1}{2(d+2)}})$ ("no transfer") to $\tilde{O}(m^{-\frac{1}{2(d^*+3)}} + n^{-\frac{1}{2(d+2)}})$ ("partial transfer"), and even to $\tilde{O}(m^{-1/2}+n^{-\frac{1}{2(d+2)}})$ ("complete transfer"), where $d^* \ll d$ and $d$ is the dimension of the observed data. Our theoretical findings are substantiated by empirical experiments conducted on image classification datasets, along with a regression dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09383v1-abstract-full').style.display = 'none'; document.getElementById('2410.09383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08262">arXiv:2410.08262</a> <span> [<a href="https://arxiv.org/pdf/2410.08262">pdf</a>, <a href="https://arxiv.org/format/2410.08262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ROMAN: Open-Set Object Map Alignment for Robust View-Invariant Global Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peterson%2C+M+B">Mason B. Peterson</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y+X">Yi Xuan Jia</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yulun Tian</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+A">Annika Thomas</a>, <a href="/search/cs?searchtype=author&query=How%2C+J+P">Jonathan P. How</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08262v1-abstract-short" style="display: inline;"> Global localization is a fundamental capability required for long-term and drift-free robot navigation. However, current methods fail to relocalize when faced with significantly different viewpoints. We present ROMAN (Robust Object Map Alignment Anywhere), a robust global localization method capable of localizing in challenging and diverse environments based on creating and aligning maps of open-s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08262v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08262v1-abstract-full" style="display: none;"> Global localization is a fundamental capability required for long-term and drift-free robot navigation. However, current methods fail to relocalize when faced with significantly different viewpoints. We present ROMAN (Robust Object Map Alignment Anywhere), a robust global localization method capable of localizing in challenging and diverse environments based on creating and aligning maps of open-set and view-invariant objects. To address localization difficulties caused by feature-sparse or perceptually aliased environments, ROMAN formulates and solves a registration problem between object submaps using a unified graph-theoretic global data association approach that simultaneously accounts for object shape and semantic similarities and a prior on gravity direction. Through a set of challenging large-scale multi-robot or multi-session SLAM experiments in indoor, urban and unstructured/forested environments, we demonstrate that ROMAN achieves a maximum recall 36% higher than other object-based map alignment methods and an absolute trajectory error that is 37% lower than using visual features for loop closures. Our project page can be found at https://acl.mit.edu/ROMAN/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08262v1-abstract-full').style.display = 'none'; document.getElementById('2410.08262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07018">arXiv:2410.07018</a> <span> [<a href="https://arxiv.org/pdf/2410.07018">pdf</a>, <a href="https://arxiv.org/format/2410.07018">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Tri-Level Navigator: LLM-Empowered Tri-Level Learning for Time Series OOD Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jian%2C+C">Chengtao Jian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yang Jiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07018v2-abstract-short" style="display: inline;"> Out-of-Distribution (OOD) generalization in machine learning is a burgeoning area of study. Its primary goal is to enhance the adaptability and resilience of machine learning models when faced with new, unseen, and potentially adversarial data that significantly diverges from their original training datasets. In this paper, we investigate time series OOD generalization via pre-trained Large Langua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07018v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07018v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07018v2-abstract-full" style="display: none;"> Out-of-Distribution (OOD) generalization in machine learning is a burgeoning area of study. Its primary goal is to enhance the adaptability and resilience of machine learning models when faced with new, unseen, and potentially adversarial data that significantly diverges from their original training datasets. In this paper, we investigate time series OOD generalization via pre-trained Large Language Models (LLMs). We first propose a novel \textbf{T}ri-level learning framework for \textbf{T}ime \textbf{S}eries \textbf{O}OD generalization, termed TTSO, which considers both sample-level and group-level uncertainties. This formula offers a fresh theoretic perspective for formulating and analyzing OOD generalization problem. In addition, we provide a theoretical analysis to justify this method is well motivated. We then develop a stratified localization algorithm tailored for this tri-level optimization problem, theoretically demonstrating the guaranteed convergence of the proposed algorithm. Our analysis also reveals that the iteration complexity to obtain an $蔚$-stationary point is bounded by O($\frac{1}{蔚^{2}}$). Extensive experiments on real-world datasets have been conducted to elucidate the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07018v2-abstract-full').style.display = 'none'; document.getElementById('2410.07018v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06982">arXiv:2410.06982</a> <span> [<a href="https://arxiv.org/pdf/2410.06982">pdf</a>, <a href="https://arxiv.org/format/2410.06982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Structure-Centric Robust Monocular Depth Estimation via Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Runze Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haiyong Luo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Fang Zhao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jingze Yu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yupeng Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Juan Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xuepeng Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06982v1-abstract-short" style="display: inline;"> Monocular depth estimation, enabled by self-supervised learning, is a key technique for 3D perception in computer vision. However, it faces significant challenges in real-world scenarios, which encompass adverse weather variations, motion blur, as well as scenes with poor lighting conditions at night. Our research reveals that we can divide monocular depth estimation into three sub-problems: depth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06982v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06982v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06982v1-abstract-full" style="display: none;"> Monocular depth estimation, enabled by self-supervised learning, is a key technique for 3D perception in computer vision. However, it faces significant challenges in real-world scenarios, which encompass adverse weather variations, motion blur, as well as scenes with poor lighting conditions at night. Our research reveals that we can divide monocular depth estimation into three sub-problems: depth structure consistency, local texture disambiguation, and semantic-structural correlation. Our approach tackles the non-robustness of existing self-supervised monocular depth estimation models to interference textures by adopting a structure-centered perspective and utilizing the scene structure characteristics demonstrated by semantics and illumination. We devise a novel approach to reduce over-reliance on local textures, enhancing robustness against missing or interfering patterns. Additionally, we incorporate a semantic expert model as the teacher and construct inter-model feature dependencies via learnable isomorphic graphs to enable aggregation of semantic structural knowledge. Our approach achieves state-of-the-art out-of-distribution monocular depth estimation performance across a range of public adverse scenario datasets. It demonstrates notable scalability and compatibility, without necessitating extensive model engineering. This showcases the potential for customizing models for diverse industrial applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06982v1-abstract-full').style.display = 'none'; document.getElementById('2410.06982v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in Asian Conference on Computer Vision 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06203">arXiv:2410.06203</a> <span> [<a href="https://arxiv.org/pdf/2410.06203">pdf</a>, <a href="https://arxiv.org/format/2410.06203">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Integrating Planning into Single-Turn Long-Form Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yi Liang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">You Wu</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+H">Honglei Zhuang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiaming Shen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yiling Jia</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhen Qin</a>, <a href="/search/cs?searchtype=author&query=Sanghai%2C+S">Sumit Sanghai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xuanhui Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=Bendersky%2C+M">Michael Bendersky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06203v1-abstract-short" style="display: inline;"> Generating high-quality, in-depth textual documents, such as academic papers, news articles, Wikipedia entries, and books, remains a significant challenge for Large Language Models (LLMs). In this paper, we propose to use planning to generate long form content. To achieve our goal, we generate intermediate steps via an auxiliary task that teaches the LLM to plan, reason and structure before genera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06203v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06203v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06203v1-abstract-full" style="display: none;"> Generating high-quality, in-depth textual documents, such as academic papers, news articles, Wikipedia entries, and books, remains a significant challenge for Large Language Models (LLMs). In this paper, we propose to use planning to generate long form content. To achieve our goal, we generate intermediate steps via an auxiliary task that teaches the LLM to plan, reason and structure before generating the final text. Our main novelty lies in a single auxiliary task that does not require multiple rounds of prompting or planning. To overcome the scarcity of training data for these intermediate steps, we leverage LLMs to generate synthetic intermediate writing data such as outlines, key information and summaries from existing full articles. Our experiments demonstrate on two datasets from different domains, namely the scientific news dataset SciNews and Wikipedia datasets in KILT-Wiki and FreshWiki, that LLMs fine-tuned with the auxiliary task generate higher quality documents. We observed +2.5% improvement in ROUGE-Lsum, and a strong 3.60 overall win/loss ratio via human SxS evaluation, with clear wins in organization, relevance, and verifiability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06203v1-abstract-full').style.display = 'none'; document.getElementById('2410.06203v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19592">arXiv:2409.19592</a> <span> [<a href="https://arxiv.org/pdf/2409.19592">pdf</a>, <a href="https://arxiv.org/format/2409.19592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> DiffCP: Ultra-Low Bit Collaborative Perception via Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+R">Ruiqing Mao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haotian Wu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yukuan Jia</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+Z">Zhaojun Nan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuxuan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Sheng Zhou</a>, <a href="/search/cs?searchtype=author&query=G%C3%BCnd%C3%BCz%2C+D">Deniz G眉nd眉z</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Z">Zhisheng Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19592v1-abstract-short" style="display: inline;"> Collaborative perception (CP) is emerging as a promising solution to the inherent limitations of stand-alone intelligence. However, current wireless communication systems are unable to support feature-level and raw-level collaborative algorithms due to their enormous bandwidth demands. In this paper, we propose DiffCP, a novel CP paradigm that utilizes a specialized diffusion model to efficiently… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19592v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19592v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19592v1-abstract-full" style="display: none;"> Collaborative perception (CP) is emerging as a promising solution to the inherent limitations of stand-alone intelligence. However, current wireless communication systems are unable to support feature-level and raw-level collaborative algorithms due to their enormous bandwidth demands. In this paper, we propose DiffCP, a novel CP paradigm that utilizes a specialized diffusion model to efficiently compress the sensing information of collaborators. By incorporating both geometric and semantic conditions into the generative model, DiffCP enables feature-level collaboration with an ultra-low communication cost, advancing the practical implementation of CP systems. This paradigm can be seamlessly integrated into existing CP algorithms to enhance a wide range of downstream tasks. Through extensive experimentation, we investigate the trade-offs between communication, computation, and performance. Numerical results demonstrate that DiffCP can significantly reduce communication costs by 14.5-fold while maintaining the same performance as the state-of-the-art algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19592v1-abstract-full').style.display = 'none'; document.getElementById('2409.19592v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19130">arXiv:2409.19130</a> <span> [<a href="https://arxiv.org/pdf/2409.19130">pdf</a>, <a href="https://arxiv.org/format/2409.19130">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Cross-domain Self-supervised Pre-training for fMRI and EEG Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xinxu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kanhao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yong Jiao</a>, <a href="/search/cs?searchtype=author&query=Carlisle%2C+N+B">Nancy B. Carlisle</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Hua Xie</a>, <a href="/search/cs?searchtype=author&query=Fonzo%2C+G+A">Gregory A. Fonzo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19130v1-abstract-short" style="display: inline;"> Neuroimaging techniques including functional magnetic resonance imaging (fMRI) and electroencephalogram (EEG) have shown promise in detecting functional abnormalities in various brain disorders. However, existing studies often focus on a single domain or modality, neglecting the valuable complementary information offered by multiple domains from both fMRI and EEG, which is crucial for a comprehens… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19130v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19130v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19130v1-abstract-full" style="display: none;"> Neuroimaging techniques including functional magnetic resonance imaging (fMRI) and electroencephalogram (EEG) have shown promise in detecting functional abnormalities in various brain disorders. However, existing studies often focus on a single domain or modality, neglecting the valuable complementary information offered by multiple domains from both fMRI and EEG, which is crucial for a comprehensive representation of disorder pathology. This limitation poses a challenge in effectively leveraging the synergistic information derived from these modalities. To address this, we propose a Multi-modal Cross-domain Self-supervised Pre-training Model (MCSP), a novel approach that leverages self-supervised learning to synergize multi-modal information across spatial, temporal, and spectral domains. Our model employs cross-domain self-supervised loss that bridges domain differences by implementing domain-specific data augmentation and contrastive loss, enhancing feature discrimination. Furthermore, MCSP introduces cross-modal self-supervised loss to capitalize on the complementary information of fMRI and EEG, facilitating knowledge distillation within domains and maximizing cross-modal feature convergence. We constructed a large-scale pre-training dataset and pretrained MCSP model by leveraging proposed self-supervised paradigms to fully harness multimodal neuroimaging data. Through comprehensive experiments, we have demonstrated the superior performance and generalizability of our model on multiple classification tasks. Our study contributes a significant advancement in the fusion of fMRI and EEG, marking a novel integration of cross-domain features, which enriches the existing landscape of neuroimaging research, particularly within the context of mental disorder studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19130v1-abstract-full').style.display = 'none'; document.getElementById('2409.19130v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17703">arXiv:2409.17703</a> <span> [<a href="https://arxiv.org/pdf/2409.17703">pdf</a>, <a href="https://arxiv.org/format/2409.17703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PGN: The RNN's New Successor is Effective for Long-Range Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuxin Jia</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jing Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianhao Liu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17703v1-abstract-short" style="display: inline;"> Due to the recurrent structure of RNN, the long information propagation path poses limitations in capturing long-term dependencies, gradient explosion/vanishing issues, and inefficient sequential execution. Based on this, we propose a novel paradigm called Parallel Gated Network (PGN) as the new successor to RNN. PGN directly captures information from previous time steps through the designed Histo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17703v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17703v1-abstract-full" style="display: none;"> Due to the recurrent structure of RNN, the long information propagation path poses limitations in capturing long-term dependencies, gradient explosion/vanishing issues, and inefficient sequential execution. Based on this, we propose a novel paradigm called Parallel Gated Network (PGN) as the new successor to RNN. PGN directly captures information from previous time steps through the designed Historical Information Extraction (HIE) layer and leverages gated mechanisms to select and fuse it with the current time step information. This reduces the information propagation path to $\mathcal{O}(1)$, effectively addressing the limitations of RNN. To enhance PGN's performance in long-range time series forecasting tasks, we propose a novel temporal modeling framework called Temporal PGN (TPGN). TPGN incorporates two branches to comprehensively capture the semantic information of time series. One branch utilizes PGN to capture long-term periodic patterns while preserving their local characteristics. The other branch employs patches to capture short-term information and aggregate the global representation of the series. TPGN achieves a theoretical complexity of $\mathcal{O}(\sqrt{L})$, ensuring efficiency in its operations. Experimental results on five benchmark datasets demonstrate the state-of-the-art (SOTA) performance and high efficiency of TPGN, further confirming the effectiveness of PGN as the new successor to RNN in long-range time series forecasting. The code is available in this repository: \url{https://github.com/Water2sea/TPGN}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17703v1-abstract-full').style.display = 'none'; document.getElementById('2409.17703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16723">arXiv:2409.16723</a> <span> [<a href="https://arxiv.org/pdf/2409.16723">pdf</a>, <a href="https://arxiv.org/format/2409.16723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EAGLE: Towards Efficient Arbitrary Referring Visual Prompts Comprehension for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiacheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yang Jiao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shaoxiang Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16723v2-abstract-short" style="display: inline;"> Recently, Multimodal Large Language Models (MLLMs) have sparked great research interests owing to their exceptional content-reasoning and instruction-following capabilities. To effectively instruct an MLLM, in addition to conventional language expressions, the practice of referring to objects by painting with brushes on images has emerged as a prevalent tool (referred to as "referring visual promp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16723v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16723v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16723v2-abstract-full" style="display: none;"> Recently, Multimodal Large Language Models (MLLMs) have sparked great research interests owing to their exceptional content-reasoning and instruction-following capabilities. To effectively instruct an MLLM, in addition to conventional language expressions, the practice of referring to objects by painting with brushes on images has emerged as a prevalent tool (referred to as "referring visual prompts") due to its efficacy in aligning the user's intention with specific image regions. To accommodate the most common referring visual prompts, namely points, boxes, and masks, existing approaches initially utilize specialized feature encoding modules to capture the semantics of the highlighted areas indicated by these prompts. Subsequently, these encoded region features are adapted to MLLMs through fine-tuning on a meticulously curated multimodal instruction dataset. However, such designs suffer from redundancy in architecture. Moreover, they face challenges in effectively generalizing when encountering a diverse range of arbitrary referring visual prompts in real-life scenarios. To address the above issues, we propose EAGLE, a novel MLLM that empowers comprehension of arbitrary referring visual prompts with less training efforts than existing approaches. Specifically, our EAGLE maintains the innate format of the referring visual prompts as colored patches rendered on the given image for conducting the instruction tuning. Our approach embeds referring visual prompts as spatial concepts conveying specific spatial areas comprehensible to the MLLM, with the semantic comprehension of these regions originating from the MLLM itself. Besides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further disentangle the MLLM's region-level comprehension with the specific formats of referring visual prompts. Extensive experiments are conducted to prove the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16723v2-abstract-full').style.display = 'none'; document.getElementById('2409.16723v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16597">arXiv:2409.16597</a> <span> [<a href="https://arxiv.org/pdf/2409.16597">pdf</a>, <a href="https://arxiv.org/format/2409.16597">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EventHallusion: Diagnosing Event Hallucinations in Video LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiacheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yang Jiao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shaoxiang Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16597v1-abstract-short" style="display: inline;"> Recently, Multimodal Large Language Models (MLLMs) have made significant progress in the video comprehension field. Despite remarkable content reasoning and instruction following capabilities they demonstrated, the hallucination problem of these VideoLLMs is less explored compared with its counterpart in the image domain. To mitigate this gap, we first propose EventHallusion, a novel benchmark tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16597v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16597v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16597v1-abstract-full" style="display: none;"> Recently, Multimodal Large Language Models (MLLMs) have made significant progress in the video comprehension field. Despite remarkable content reasoning and instruction following capabilities they demonstrated, the hallucination problem of these VideoLLMs is less explored compared with its counterpart in the image domain. To mitigate this gap, we first propose EventHallusion, a novel benchmark that focuses on assessing the VideoLMMs' hallucination phenomenon on video event comprehension. Based on the observation that existing VideoLLMs are entangled with the priors stemming from their foundation models, our EventHallusion is curated by meticulously collecting videos and annotating questions to intentionally mislead the VideoLLMs into interpreting events based on these priors rather than accurately understanding the video content. On the other hand, we also propose a simple yet effective method, called Temporal Contrastive Decoding (TCD), to tackle the hallucination problems of VideoLLMs. The proposed TCD suppresses the model's preference toward their priors by comparing the original video with a constructed counterpart, whose temporal cues are disrupted, during the autoregressive decoding stage. Through comprehensive evaluation of eight open-source and two closed-source VideoLLMs on the proposed EventHallusion benchmark, we find that the open-source models suffer significantly from hallucination problems, whereas the closed-source models perform markedly better. By further equipping open-sourced VideoLLMs with the proposed TCD approach, evident performance improvements are achieved across most metrics in the EventHallusion benchmark. Our codes and benchmark data are available at https://github.com/Stevetich/EventHallusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16597v1-abstract-full').style.display = 'none'; document.getElementById('2409.16597v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14521">arXiv:2409.14521</a> <span> [<a href="https://arxiv.org/pdf/2409.14521">pdf</a>, <a href="https://arxiv.org/format/2409.14521">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> UAV-Enabled Data Collection for IoT Networks via Rainbow Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yingchao Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenchao Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yinyu Wu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jinke Ren</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yanyan Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xinping Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14521v1-abstract-short" style="display: inline;"> Unmanned aerial vehicles (UAVs) assisted Internet of things (IoT) systems have become an important part of future wireless communications. To achieve higher communication rate, the joint design of UAV trajectory and resource allocation is crucial. This letter considers a scenario where a multi-antenna UAV is dispatched to simultaneously collect data from multiple ground IoT nodes (GNs) within a ti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14521v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14521v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14521v1-abstract-full" style="display: none;"> Unmanned aerial vehicles (UAVs) assisted Internet of things (IoT) systems have become an important part of future wireless communications. To achieve higher communication rate, the joint design of UAV trajectory and resource allocation is crucial. This letter considers a scenario where a multi-antenna UAV is dispatched to simultaneously collect data from multiple ground IoT nodes (GNs) within a time interval. To improve the sum data collection (SDC) volume, i.e., the total data volume transmitted by the GNs, the UAV trajectory, the UAV receive beamforming, the scheduling of the GNs, and the transmit power of the GNs are jointly optimized. Since the problem is non-convex and the optimization variables are highly coupled, it is hard to solve using traditional optimization methods. To find a near-optimal solution, a double-loop structured optimization-driven deep reinforcement learning (DRL) algorithm and a fully DRL-based algorithm are proposed to solve the problem effectively. Simulation results verify that the proposed algorithms outperform two benchmarks with significant improvement in SDC volumes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14521v1-abstract-full').style.display = 'none'; document.getElementById('2409.14521v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 6 figures, this work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13655">arXiv:2409.13655</a> <span> [<a href="https://arxiv.org/pdf/2409.13655">pdf</a>, <a href="https://arxiv.org/format/2409.13655">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Mixture Importance Sampling for Automated Ads Auction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yimeng Jia</a>, <a href="/search/cs?searchtype=author&query=Paneri%2C+K">Kaushal Paneri</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rong Huang</a>, <a href="/search/cs?searchtype=author&query=Maurya%2C+K+S">Kailash Singh Maurya</a>, <a href="/search/cs?searchtype=author&query=Mallapragada%2C+P">Pavan Mallapragada</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yifan Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13655v1-abstract-short" style="display: inline;"> This paper introduces Adaptive Mixture Importance Sampling (AMIS) as a novel approach for optimizing key performance indicators (KPIs) in large-scale recommender systems, such as online ad auctions. Traditional importance sampling (IS) methods face challenges in dynamic environments, particularly in navigating through complexities of multi-modal landscapes and avoiding entrapment in local optima f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13655v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13655v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13655v1-abstract-full" style="display: none;"> This paper introduces Adaptive Mixture Importance Sampling (AMIS) as a novel approach for optimizing key performance indicators (KPIs) in large-scale recommender systems, such as online ad auctions. Traditional importance sampling (IS) methods face challenges in dynamic environments, particularly in navigating through complexities of multi-modal landscapes and avoiding entrapment in local optima for the optimization task. Instead of updating importance weights and mixing samples across iterations, as in canonical adaptive IS and multiple IS, our AMIS framework leverages a mixture distribution as the proposal distribution and dynamically adjusts both the mixture parameters and their mixing rates at each iteration, thereby enhancing search diversity and efficiency. Through extensive offline simulations, we demonstrate that AMIS significantly outperforms simple Gaussian Importance Sampling (GIS), particularly in noisy environments. Moreover, our approach is validated in real-world scenarios through online A/B experiments on a major search engine, where AMIS consistently identifies optimal tuning points that are more likely to be adopted as mainstream configurations. These findings indicate that AMIS enhances convergence in noisy environments, leading to more accurate and reliable decision-making in the context of importance sampling off-policy estimators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13655v1-abstract-full').style.display = 'none'; document.getElementById('2409.13655v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys '24</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T05; 65C05; 68Q87 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> G.3; I.2.6; I.6.8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12992">arXiv:2409.12992</a> <span> [<a href="https://arxiv.org/pdf/2409.12992">pdf</a>, <a href="https://arxiv.org/format/2409.12992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiffEditor: Enhancing Speech Editing with Semantic Enrichment and Acoustic Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuhang Jia</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jiarong Kang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12992v1-abstract-short" style="display: inline;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. In this paper, we introduce, DiffEditor, a novel speech editing model designed to e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12992v1-abstract-full" style="display: none;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. In this paper, we introduce, DiffEditor, a novel speech editing model designed to enhance performance in OOD text scenarios through semantic enrichment and acoustic consistency. To improve the intelligibility of the edited speech, we enrich the semantic information of phoneme embeddings by integrating word embeddings extracted from a pretrained language model. Furthermore, we emphasize that interframe smoothing properties are critical for modeling acoustic consistency, and thus we propose a first-order loss function to promote smoother transitions at editing boundaries and enhance the overall fluency of the edited speech. Experimental results demonstrate that our model achieves state-of-the-art performance in both in-domain and OOD text scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'none'; document.getElementById('2409.12992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12678">arXiv:2409.12678</a> <span> [<a href="https://arxiv.org/pdf/2409.12678">pdf</a>, <a href="https://arxiv.org/format/2409.12678">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PMR-Net: Parallel Multi-Resolution Encoder-Decoder Network Framework for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaogang Du</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+D">Dongxin Gu</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+T">Tao Lei</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yipeng Jiao</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yibin Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12678v1-abstract-short" style="display: inline;"> In recent years, encoder-decoder networks have focused on expanding receptive fields and incorporating multi-scale context to capture global features for objects of varying sizes. However, as networks deepen, they often discard fine spatial details, impairing precise object localization. Additionally, conventional decoders' use of interpolation for upsampling leads to a loss of global context, dim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12678v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12678v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12678v1-abstract-full" style="display: none;"> In recent years, encoder-decoder networks have focused on expanding receptive fields and incorporating multi-scale context to capture global features for objects of varying sizes. However, as networks deepen, they often discard fine spatial details, impairing precise object localization. Additionally, conventional decoders' use of interpolation for upsampling leads to a loss of global context, diminishing edge segmentation accuracy. To address the above problems, we propose a novel parallel multi-resolution encoder-decoder network, namely PMR-Net for short. First, we design a parallel multi-resolution encoder and a multi-resolution context encoder. The parallel multi-resolution encoder can extract and fuse multi-scale fine-grained local features in parallel for input images with different resolutions. The multi-resolution context encoder fuses the global context semantic features of different receptive fields from different encoder branches to maintain effectively the integrity of global information. Secondly, we design a parallel multi-resolution decoder symmetrical to the structure of parallel multi-resolution encoder. The decoder can continuously supplement the global context features of low-resolution branches to the feature maps of high-resolution branches, and effectively solve the problem of global context feature loss caused by upsampling operation in the decoding process. Extensive experiment results demonstrate that our proposed PMR-Net can achieve more accurate segmentation results than state-of-the-art methods on five public available datasets. Moreover, PMR-Net is also a flexible network framework, which can meet the requirements of different scenarios by adjusting the number of network layers and the number of parallel encoder-decoder branches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12678v1-abstract-full').style.display = 'none'; document.getElementById('2409.12678v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12466">arXiv:2409.12466</a> <span> [<a href="https://arxiv.org/pdf/2409.12466">pdf</a>, <a href="https://arxiv.org/format/2409.12466">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AudioEditor: A Training-Free Diffusion-Based Audio Editing Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuhang Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jinghua Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12466v2-abstract-short" style="display: inline;"> Diffusion-based text-to-audio (TTA) generation has made substantial progress, leveraging latent diffusion model (LDM) to produce high-quality, diverse and instruction-relevant audios. However, beyond generation, the task of audio editing remains equally important but has received comparatively little attention. Audio editing tasks face two primary challenges: executing precise edits and preserving… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12466v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12466v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12466v2-abstract-full" style="display: none;"> Diffusion-based text-to-audio (TTA) generation has made substantial progress, leveraging latent diffusion model (LDM) to produce high-quality, diverse and instruction-relevant audios. However, beyond generation, the task of audio editing remains equally important but has received comparatively little attention. Audio editing tasks face two primary challenges: executing precise edits and preserving the unedited sections. While workflows based on LDMs have effectively addressed these challenges in the field of image processing, similar approaches have been scarcely applied to audio editing. In this paper, we introduce AudioEditor, a training-free audio editing framework built on the pretrained diffusion-based TTA model. AudioEditor incorporates Null-text Inversion and EOT-suppression methods, enabling the model to preserve original audio features while executing accurate edits. Comprehensive objective and subjective experiments validate the effectiveness of AudioEditor in delivering high-quality audio edits. Code and demo can be found at https://github.com/NKU-HLT/AudioEditor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12466v2-abstract-full').style.display = 'none'; document.getElementById('2409.12466v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08782">arXiv:2409.08782</a> <span> [<a href="https://arxiv.org/pdf/2409.08782">pdf</a>, <a href="https://arxiv.org/format/2409.08782">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Contactless Fingerprint Recognition Using 3D Graph Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhe Cui</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuwei Jia</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siyang Zheng</a>, <a href="/search/cs?searchtype=author&query=Su%2C+F">Fei Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08782v1-abstract-short" style="display: inline;"> Contactless fingerprint is a newly developed type of fingerprint, and has gained lots of attention in recent fingerprint studies. However, most existing contactless fingerprint algorithms treat contactless fingerprints as 2D plain fingerprints, and utilize similar recognition methods as traditional contact-based 2D fingerprints. This recognition approach does not consider the modality difference b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08782v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08782v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08782v1-abstract-full" style="display: none;"> Contactless fingerprint is a newly developed type of fingerprint, and has gained lots of attention in recent fingerprint studies. However, most existing contactless fingerprint algorithms treat contactless fingerprints as 2D plain fingerprints, and utilize similar recognition methods as traditional contact-based 2D fingerprints. This recognition approach does not consider the modality difference between contactless and contact fingerprints, especially the intrinsic 3D characteristic of contactless fingerprints. This paper proposes a novel contactless fingerprint recognition algorithm that captures the revealed 3D feature of contactless fingerprints rather than the plain 2D feature. The proposed method first recovers 3D features from the input contactless fingerprint, including the 3D shape model and 3D fingerprint feature (minutiae, orientation, etc.). Then, a novel 3D graph matching is conducted in 3D space according to the extracted 3D feature. Our method captures the real 3D nature of contactless fingerprints as the whole feature extraction and matching algorithms are completed in real 3D space. Experiments results on contactless fingerprint databases show that the proposed method successfully improves the matching accuracy of contactless fingerprints. Exceptionally, our method performs stably across multiple poses of contactless fingerprints due to 3D graph matching, which is a great advantage compared to previous contactless fingerprint recognition algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08782v1-abstract-full').style.display = 'none'; document.getElementById('2409.08782v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07494">arXiv:2409.07494</a> <span> [<a href="https://arxiv.org/pdf/2409.07494">pdf</a>, <a href="https://arxiv.org/format/2409.07494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="General Finance">q-fin.GN</span> </div> </div> <p class="title is-5 mathjax"> Ethereum Fraud Detection via Joint Transaction Language Model and Graph Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yifan Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanbin Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianguo Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+Z">Zhang Sheng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Ye Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07494v1-abstract-short" style="display: inline;"> Ethereum faces growing fraud threats. Current fraud detection methods, whether employing graph neural networks or sequence models, fail to consider the semantic information and similarity patterns within transactions. Moreover, these approaches do not leverage the potential synergistic benefits of combining both types of models. To address these challenges, we propose TLMG4Eth that combines a tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07494v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07494v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07494v1-abstract-full" style="display: none;"> Ethereum faces growing fraud threats. Current fraud detection methods, whether employing graph neural networks or sequence models, fail to consider the semantic information and similarity patterns within transactions. Moreover, these approaches do not leverage the potential synergistic benefits of combining both types of models. To address these challenges, we propose TLMG4Eth that combines a transaction language model with graph-based methods to capture semantic, similarity, and structural features of transaction data in Ethereum. We first propose a transaction language model that converts numerical transaction data into meaningful transaction sentences, enabling the model to learn explicit transaction semantics. Then, we propose a transaction attribute similarity graph to learn transaction similarity information, enabling us to capture intuitive insights into transaction anomalies. Additionally, we construct an account interaction graph to capture the structural information of the account transaction network. We employ a deep multi-head attention network to fuse transaction semantic and similarity embeddings, and ultimately propose a joint training approach for the multi-head attention network and the account interaction graph to obtain the synergistic benefits of both. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07494v1-abstract-full').style.display = 'none'; document.getElementById('2409.07494v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06722">arXiv:2409.06722</a> <span> [<a href="https://arxiv.org/pdf/2409.06722">pdf</a>, <a href="https://arxiv.org/format/2409.06722">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/CCWC.2018.8301750">10.1109/CCWC.2018.8301750 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Automated Quantification of White Blood Cells in Light Microscopic Images of Injured Skeletal Muscle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yang Jiao</a>, <a href="/search/cs?searchtype=author&query=Derakhshan%2C+H">Hananeh Derakhshan</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+B+S+P">Barbara St. Pierre Schneider</a>, <a href="/search/cs?searchtype=author&query=Regentova%2C+E">Emma Regentova</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06722v1-abstract-short" style="display: inline;"> White blood cells (WBCs) are the most diverse cell types observed in the healing process of injured skeletal muscles. In the course of healing, WBCs exhibit dynamic cellular response and undergo multiple protein expression changes. The progress of healing can be analyzed by quantifying the number of WBCs or the amount of specific proteins in light microscopic images obtained at different time poin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06722v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06722v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06722v1-abstract-full" style="display: none;"> White blood cells (WBCs) are the most diverse cell types observed in the healing process of injured skeletal muscles. In the course of healing, WBCs exhibit dynamic cellular response and undergo multiple protein expression changes. The progress of healing can be analyzed by quantifying the number of WBCs or the amount of specific proteins in light microscopic images obtained at different time points after injury. In this paper, we propose an automated quantifying and analysis framework to analyze WBCs using light microscopic images of uninjured and injured muscles. The proposed framework is based on the Localized Iterative Otsu's threshold method with muscle edge detection and region of interest extraction. Compared with the threshold methods used in ImageJ, the LI Otsu's threshold method has high resistance to background area and achieves better accuracy. The CD68-positive cell results are presented for demonstrating the effectiveness of the proposed work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06722v1-abstract-full').style.display = 'none'; document.getElementById('2409.06722v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 tables, 7 figures, 8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05577">arXiv:2409.05577</a> <span> [<a href="https://arxiv.org/pdf/2409.05577">pdf</a>, <a href="https://arxiv.org/format/2409.05577">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Approximation Bounds for Recurrent Neural Networks with Application to Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yuling Jiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+B">Bokai Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05577v1-abstract-short" style="display: inline;"> We study the approximation capacity of deep ReLU recurrent neural networks (RNNs) and explore the convergence properties of nonparametric least squares regression using RNNs. We derive upper bounds on the approximation error of RNNs for H枚lder smooth functions, in the sense that the output at each time step of an RNN can approximate a H枚lder function that depends only on past and current informati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05577v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05577v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05577v1-abstract-full" style="display: none;"> We study the approximation capacity of deep ReLU recurrent neural networks (RNNs) and explore the convergence properties of nonparametric least squares regression using RNNs. We derive upper bounds on the approximation error of RNNs for H枚lder smooth functions, in the sense that the output at each time step of an RNN can approximate a H枚lder function that depends only on past and current information, termed a past-dependent function. This allows a carefully constructed RNN to simultaneously approximate a sequence of past-dependent H枚lder functions. We apply these approximation results to derive non-asymptotic upper bounds for the prediction error of the empirical risk minimizer in regression problem. Our error bounds achieve minimax optimal rate under both exponentially $尾$-mixing and i.i.d. data assumptions, improving upon existing ones. Our results provide statistical guarantees on the performance of RNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05577v1-abstract-full').style.display = 'none'; document.getElementById('2409.05577v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03947">arXiv:2409.03947</a> <span> [<a href="https://arxiv.org/pdf/2409.03947">pdf</a>, <a href="https://arxiv.org/format/2409.03947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FODA-PG for Enhanced Medical Imaging Narrative Generation: Adaptive Differentiation of Normal and Abnormal Attributes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuzhuo Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiechao Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03947v1-abstract-short" style="display: inline;"> Automatic Medical Imaging Narrative generation aims to alleviate the workload of radiologists by producing accurate clinical descriptions directly from radiological images. However, the subtle visual nuances and domain-specific terminology in medical images pose significant challenges compared to generic image captioning tasks. Existing approaches often neglect the vital distinction between normal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03947v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03947v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03947v1-abstract-full" style="display: none;"> Automatic Medical Imaging Narrative generation aims to alleviate the workload of radiologists by producing accurate clinical descriptions directly from radiological images. However, the subtle visual nuances and domain-specific terminology in medical images pose significant challenges compared to generic image captioning tasks. Existing approaches often neglect the vital distinction between normal and abnormal findings, leading to suboptimal performance. In this work, we propose FODA-PG, a novel Fine-grained Organ-Disease Adaptive Partitioning Graph framework that addresses these limitations through domain-adaptive learning. FODA-PG constructs a granular graphical representation of radiological findings by separating disease-related attributes into distinct "disease-specific" and "disease-free" categories based on their clinical significance and location. This adaptive partitioning enables our model to capture the nuanced differences between normal and pathological states, mitigating the impact of data biases. By integrating this fine-grained semantic knowledge into a powerful transformer-based architecture and providing rigorous mathematical justifications for its effectiveness, FODA-PG generates precise and clinically coherent reports with enhanced generalization capabilities. Extensive experiments on the IU-Xray and MIMIC-CXR benchmarks demonstrate the superiority of our approach over state-of-the-art methods, highlighting the importance of domain adaptation in medical report generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03947v1-abstract-full').style.display = 'none'; document.getElementById('2409.03947v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00968">arXiv:2409.00968</a> <span> [<a href="https://arxiv.org/pdf/2409.00968">pdf</a>, <a href="https://arxiv.org/format/2409.00968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Solving Integrated Process Planning and Scheduling Problem via Graph Neural Network Based Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongpei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziyan He</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yunkai Jia</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiang Huang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+D">Dongdong Ge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00968v1-abstract-short" style="display: inline;"> The Integrated Process Planning and Scheduling (IPPS) problem combines process route planning and shop scheduling to achieve high efficiency in manufacturing and maximize resource utilization, which is crucial for modern manufacturing systems. Traditional methods using Mixed Integer Linear Programming (MILP) and heuristic algorithms can not well balance solution quality and speed when solving IPPS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00968v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00968v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00968v1-abstract-full" style="display: none;"> The Integrated Process Planning and Scheduling (IPPS) problem combines process route planning and shop scheduling to achieve high efficiency in manufacturing and maximize resource utilization, which is crucial for modern manufacturing systems. Traditional methods using Mixed Integer Linear Programming (MILP) and heuristic algorithms can not well balance solution quality and speed when solving IPPS. In this paper, we propose a novel end-to-end Deep Reinforcement Learning (DRL) method. We model the IPPS problem as a Markov Decision Process (MDP) and employ a Heterogeneous Graph Neural Network (GNN) to capture the complex relationships among operations, machines, and jobs. To optimize the scheduling strategy, we use Proximal Policy Optimization (PPO). Experimental results show that, compared to traditional methods, our approach significantly improves solution efficiency and quality in large-scale IPPS instances, providing superior scheduling strategies for modern intelligent manufacturing systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00968v1-abstract-full').style.display = 'none'; document.getElementById('2409.00968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14014">arXiv:2408.14014</a> <span> [<a href="https://arxiv.org/pdf/2408.14014">pdf</a>, <a href="https://arxiv.org/ps/2408.14014">ps</a>, <a href="https://arxiv.org/format/2408.14014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Category-Theoretical and Topos-Theoretical Frameworks in Machine Learning: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yiyang Jia</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+G">Guohong Peng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zheng Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14014v2-abstract-short" style="display: inline;"> In this survey, we provide an overview of category theory-derived machine learning from four mainstream perspectives: gradient-based learning, probability-based learning, invariance and equivalence-based learning, and topos-based learning. For the first three topics, we primarily review research in the past five years, updating and expanding on the previous survey by Shiebler et al.. The fourth to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14014v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14014v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14014v2-abstract-full" style="display: none;"> In this survey, we provide an overview of category theory-derived machine learning from four mainstream perspectives: gradient-based learning, probability-based learning, invariance and equivalence-based learning, and topos-based learning. For the first three topics, we primarily review research in the past five years, updating and expanding on the previous survey by Shiebler et al.. The fourth topic, which delves into higher category theory, particularly topos theory, is surveyed for the first time in this paper. In certain machine learning methods, the compositionality of functors plays a vital role, prompting the development of specific categorical frameworks. However, when considering how the global properties of a network reflect in local structures and how geometric properties are expressed with logic, the topos structure becomes particularly significant and profound. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14014v2-abstract-full').style.display = 'none'; document.getElementById('2408.14014v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13830">arXiv:2408.13830</a> <span> [<a href="https://arxiv.org/pdf/2408.13830">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-SIGATnet: A multimodal schizophrenia MRI classification algorithm using sparse interaction mechanisms and graph attention networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yuhong Jiao</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+J">Jiaqing Miao</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+J">Jinnan Gong</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hui He</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+P">Ping Liang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Cheng Luo</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Ying Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13830v1-abstract-short" style="display: inline;"> Schizophrenia is a serious psychiatric disorder. Its pathogenesis is not completely clear, making it difficult to treat patients precisely. Because of the complicated non-Euclidean network structure of the human brain, learning critical information from brain networks remains difficult. To effectively capture the topological information of brain neural networks, a novel multimodal graph attention… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13830v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13830v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13830v1-abstract-full" style="display: none;"> Schizophrenia is a serious psychiatric disorder. Its pathogenesis is not completely clear, making it difficult to treat patients precisely. Because of the complicated non-Euclidean network structure of the human brain, learning critical information from brain networks remains difficult. To effectively capture the topological information of brain neural networks, a novel multimodal graph attention network based on sparse interaction mechanism (Multi-SIGATnet) was proposed for SZ classification was proposed for SZ classification. Firstly, structural and functional information were fused into multimodal data to obtain more comprehensive and abundant features for patients with SZ. Subsequently, a sparse interaction mechanism was proposed to effectively extract salient features and enhance the feature representation capability. By enhancing the strong connections and weakening the weak connections between feature information based on an asymmetric convolutional network, high-order interactive features were captured. Moreover, sparse learning strategies were designed to filter out redundant connections to improve model performance. Finally, local and global features were updated in accordance with the topological features and connection weight constraints of the higher-order brain network, the features being projected to the classification target space for disorder classification. The effectiveness of the model is verified on the Center for Biomedical Research Excellence (COBRE) and University of California Los Angeles (UCLA) datasets, achieving 81.9\% and 75.8\% average accuracy, respectively, 4.6\% and 5.5\% higher than the graph attention network (GAT) method. Experiments showed that the Multi-SIGATnet method exhibited good performance in identifying SZ. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13830v1-abstract-full').style.display = 'none'; document.getElementById('2408.13830v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09452">arXiv:2408.09452</a> <span> [<a href="https://arxiv.org/pdf/2408.09452">pdf</a>, <a href="https://arxiv.org/format/2408.09452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Identifying Speakers and Addressees of Quotations in Novels with Prompt Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuchen Yan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanjie Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Senbin Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongde Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihong Zhang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuxiang Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09452v1-abstract-short" style="display: inline;"> Quotations in literary works, especially novels, are important to create characters, reflect character relationships, and drive plot development. Current research on quotation extraction in novels primarily focuses on quotation attribution, i.e., identifying the speaker of the quotation. However, the addressee of the quotation is also important to construct the relationship between the speaker and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09452v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09452v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09452v1-abstract-full" style="display: none;"> Quotations in literary works, especially novels, are important to create characters, reflect character relationships, and drive plot development. Current research on quotation extraction in novels primarily focuses on quotation attribution, i.e., identifying the speaker of the quotation. However, the addressee of the quotation is also important to construct the relationship between the speaker and the addressee. To tackle the problem of dataset scarcity, we annotate the first Chinese quotation corpus with elements including speaker, addressee, speaking mode and linguistic cue. We propose prompt learning-based methods for speaker and addressee identification based on fine-tuned pre-trained models. Experiments on both Chinese and English datasets show the effectiveness of the proposed methods, which outperform methods based on zero-shot and few-shot large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09452v1-abstract-full').style.display = 'none'; document.getElementById('2408.09452v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by NLPCC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09253">arXiv:2408.09253</a> <span> [<a href="https://arxiv.org/pdf/2408.09253">pdf</a>, <a href="https://arxiv.org/format/2408.09253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning Compensated Model Predictive Control for Off-road Driving on Unknown Deformable Terrain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prakhar Gupta</a>, <a href="/search/cs?searchtype=author&query=Smereka%2C+J+M">Jonathon M. Smereka</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yunyi Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09253v1-abstract-short" style="display: inline;"> This study presents an Actor-Critic reinforcement learning Compensated Model Predictive Controller (AC2MPC) designed for high-speed, off-road autonomous driving on deformable terrains. Addressing the difficulty of modeling unknown tire-terrain interaction and ensuring real-time control feasibility and performance, this framework integrates deep reinforcement learning with a model predictive contro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09253v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09253v1-abstract-full" style="display: none;"> This study presents an Actor-Critic reinforcement learning Compensated Model Predictive Controller (AC2MPC) designed for high-speed, off-road autonomous driving on deformable terrains. Addressing the difficulty of modeling unknown tire-terrain interaction and ensuring real-time control feasibility and performance, this framework integrates deep reinforcement learning with a model predictive controller to manage unmodeled nonlinear dynamics. We evaluate the controller framework over constant and varying velocity profiles using high-fidelity simulator Project Chrono. Our findings demonstrate that our controller statistically outperforms standalone model-based and learning-based controllers over three unknown terrains that represent sandy deformable track, sandy and rocky track and cohesive clay-like deformable soil track. Despite varied and previously unseen terrain characteristics, this framework generalized well enough to track longitudinal reference speeds with the least error. Furthermore, this framework required significantly less training data compared to purely learning based controller, converging in fewer steps while delivering better performance. Even when under-trained, this controller outperformed the standalone controllers, highlighting its potential for safer and more efficient real-world deployment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09253v1-abstract-full').style.display = 'none'; document.getElementById('2408.09253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE Transactions on Intelligent Vehicles as a Regular Paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08852">arXiv:2408.08852</a> <span> [<a href="https://arxiv.org/pdf/2408.08852">pdf</a>, <a href="https://arxiv.org/format/2408.08852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GeoTransformer: Enhancing Urban Forecasting with Geospatial Attention Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuhao Jia</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zile Wu</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+S">Shengao Yi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yifei Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08852v1-abstract-short" style="display: inline;"> Recent advancements have focused on encoding urban spatial information into high-dimensional spaces, with notable efforts dedicated to integrating sociodemographic data and satellite imagery. These efforts have established foundational models in this field. However, the effective utilization of these spatial representations for urban forecasting applications remains under-explored. To address this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08852v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08852v1-abstract-full" style="display: none;"> Recent advancements have focused on encoding urban spatial information into high-dimensional spaces, with notable efforts dedicated to integrating sociodemographic data and satellite imagery. These efforts have established foundational models in this field. However, the effective utilization of these spatial representations for urban forecasting applications remains under-explored. To address this gap, we introduce GeoTransformer, a novel structure that synergizes the Transformer architecture with geospatial statistics prior. GeoTransformer employs an innovative geospatial attention mechanism to incorporate extensive urban information and spatial dependencies into a unified predictive model. Specifically, we compute geospatial weighted attention scores between the target region and surrounding regions and leverage the integrated urban information for predictions. Extensive experiments on GDP and ride-share demand prediction tasks demonstrate that GeoTransformer significantly outperforms existing baseline models, showcasing its potential to enhance urban forecasting tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08852v1-abstract-full').style.display = 'none'; document.getElementById('2408.08852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08533">arXiv:2408.08533</a> <span> [<a href="https://arxiv.org/pdf/2408.08533">pdf</a>, <a href="https://arxiv.org/ps/2408.08533">ps</a>, <a href="https://arxiv.org/format/2408.08533">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Transfer Learning via Adversarial Contrastive Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+C">Chenguang Duan</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yuling Jiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huazhen Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wensen Ma</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J+Z">Jerry Zhijian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08533v1-abstract-short" style="display: inline;"> Learning a data representation for downstream supervised learning tasks under unlabeled scenario is both critical and challenging. In this paper, we propose a novel unsupervised transfer learning approach using adversarial contrastive training (ACT). Our experimental results demonstrate outstanding classification accuracy with both fine-tuned linear probe and K-NN protocol across various datasets,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08533v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08533v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08533v1-abstract-full" style="display: none;"> Learning a data representation for downstream supervised learning tasks under unlabeled scenario is both critical and challenging. In this paper, we propose a novel unsupervised transfer learning approach using adversarial contrastive training (ACT). Our experimental results demonstrate outstanding classification accuracy with both fine-tuned linear probe and K-NN protocol across various datasets, showing competitiveness with existing state-of-the-art self-supervised learning methods. Moreover, we provide an end-to-end theoretical guarantee for downstream classification tasks in a misspecified, over-parameterized setting, highlighting how a large amount of unlabeled data contributes to prediction accuracy. Our theoretical findings suggest that the testing error of downstream tasks depends solely on the efficiency of data augmentation used in ACT when the unlabeled sample size is sufficiently large. This offers a theoretical understanding of learning downstream tasks with a small sample size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08533v1-abstract-full').style.display = 'none'; document.getElementById('2408.08533v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07773">arXiv:2408.07773</a> <span> [<a href="https://arxiv.org/pdf/2408.07773">pdf</a>, <a href="https://arxiv.org/format/2408.07773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MedTsLLM: Leveraging LLMs for Multimodal Medical Time Series Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chan%2C+N">Nimeesha Chan</a>, <a href="/search/cs?searchtype=author&query=Parker%2C+F">Felix Parker</a>, <a href="/search/cs?searchtype=author&query=Bennett%2C+W">William Bennett</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianyi Wu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M+Y">Mung Yao Jia</a>, <a href="/search/cs?searchtype=author&query=Fackler%2C+J">James Fackler</a>, <a href="/search/cs?searchtype=author&query=Ghobadi%2C+K">Kimia Ghobadi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07773v1-abstract-short" style="display: inline;"> The complexity and heterogeneity of data in many real-world applications pose significant challenges for traditional machine learning and signal processing techniques. For instance, in medicine, effective analysis of diverse physiological signals is crucial for patient monitoring and clinical decision-making and yet highly challenging. We introduce MedTsLLM, a general multimodal large language mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07773v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07773v1-abstract-full" style="display: none;"> The complexity and heterogeneity of data in many real-world applications pose significant challenges for traditional machine learning and signal processing techniques. For instance, in medicine, effective analysis of diverse physiological signals is crucial for patient monitoring and clinical decision-making and yet highly challenging. We introduce MedTsLLM, a general multimodal large language model (LLM) framework that effectively integrates time series data and rich contextual information in the form of text to analyze physiological signals, performing three tasks with clinical relevance: semantic segmentation, boundary detection, and anomaly detection in time series. These critical tasks enable deeper analysis of physiological signals and can provide actionable insights for clinicians. We utilize a reprogramming layer to align embeddings of time series patches with a pretrained LLM's embedding space and make effective use of raw time series, in conjunction with textual context. Given the multivariate nature of medical datasets, we develop methods to handle multiple covariates. We additionally tailor the text prompt to include patient-specific information. Our model outperforms state-of-the-art baselines, including deep learning models, other LLMs, and clinical methods across multiple medical domains, specifically electrocardiograms and respiratory waveforms. MedTsLLM presents a promising step towards harnessing the power of LLMs for medical time series analysis that can elevate data-driven tools for clinicians and improve patient outcomes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07773v1-abstract-full').style.display = 'none'; document.getElementById('2408.07773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">published in Proceedings of Machine Learning Research, MLHC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07369">arXiv:2408.07369</a> <span> [<a href="https://arxiv.org/pdf/2408.07369">pdf</a>, <a href="https://arxiv.org/format/2408.07369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> ProCom: A Few-shot Targeted Community Detection Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xixi Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+K">Kaiyu Xiong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxin He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yizhu Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07369v1-abstract-short" style="display: inline;"> Targeted community detection aims to distinguish a particular type of community in the network. This is an important task with a lot of real-world applications, e.g., identifying fraud groups in transaction networks. Traditional community detection methods fail to capture the specific features of the targeted community and detect all types of communities indiscriminately. Semi-supervised community… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07369v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07369v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07369v1-abstract-full" style="display: none;"> Targeted community detection aims to distinguish a particular type of community in the network. This is an important task with a lot of real-world applications, e.g., identifying fraud groups in transaction networks. Traditional community detection methods fail to capture the specific features of the targeted community and detect all types of communities indiscriminately. Semi-supervised community detection algorithms, emerged as a feasible alternative, are inherently constrained by their limited adaptability and substantial reliance on a large amount of labeled data, which demands extensive domain knowledge and manual effort. In this paper, we address the aforementioned weaknesses in targeted community detection by focusing on few-shot scenarios. We propose ProCom, a novel framework that extends the ``pre-train, prompt'' paradigm, offering a low-resource, high-efficiency, and transferable solution. Within the framework, we devise a dual-level context-aware pre-training method that fosters a deep understanding of latent communities in the network, establishing a rich knowledge foundation for downstream task. In the prompt learning stage, we reformulate the targeted community detection task into pre-training objectives, allowing the extraction of specific knowledge relevant to the targeted community to facilitate effective and efficient inference. By leveraging both the general community knowledge acquired during pre-training and the specific insights gained from the prompt communities, ProCom exhibits remarkable adaptability across different datasets. We conduct extensive experiments on five benchmarks to evaluate the ProCom framework, demonstrating its SOTA performance under few-shot scenarios, strong efficiency, and transferability across diverse datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07369v1-abstract-full').style.display = 'none'; document.getElementById('2408.07369v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGKDD'2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07291">arXiv:2408.07291</a> <span> [<a href="https://arxiv.org/pdf/2408.07291">pdf</a>, <a href="https://arxiv.org/format/2408.07291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Large Language Model based Personal Information Extraction and Countermeasures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yupei Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuqi Jia</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jinyuan Jia</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N+Z">Neil Zhenqiang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07291v1-abstract-short" style="display: inline;"> Automatically extracting personal information--such as name, phone number, and email address--from publicly available profiles at a large scale is a stepstone to many other security attacks including spear phishing. Traditional methods--such as regular expression, keyword search, and entity detection--achieve limited success at such personal information extraction. In this work, we perform a syste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07291v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07291v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07291v1-abstract-full" style="display: none;"> Automatically extracting personal information--such as name, phone number, and email address--from publicly available profiles at a large scale is a stepstone to many other security attacks including spear phishing. Traditional methods--such as regular expression, keyword search, and entity detection--achieve limited success at such personal information extraction. In this work, we perform a systematic measurement study to benchmark large language model (LLM) based personal information extraction and countermeasures. Towards this goal, we present a framework for LLM-based extraction attacks; collect three datasets including a synthetic dataset generated by GPT-4 and two real-world datasets with manually labeled 8 categories of personal information; introduce a novel mitigation strategy based on \emph{prompt injection}; and systematically benchmark LLM-based attacks and countermeasures using 10 LLMs and our 3 datasets. Our key findings include: LLM can be misused by attackers to accurately extract various personal information from personal profiles; LLM outperforms conventional methods at such extraction; and prompt injection can mitigate such risk to a large extent and outperforms conventional countermeasures. Our code and data are available at: \url{https://github.com/liu00222/LLM-Based-Personal-Profile-Extraction}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07291v1-abstract-full').style.display = 'none'; document.getElementById('2408.07291v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05981">arXiv:2408.05981</a> <span> [<a href="https://arxiv.org/pdf/2408.05981">pdf</a>, <a href="https://arxiv.org/format/2408.05981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CAD-Mesher: A Convenient, Accurate, Dense Mesh-based Mapping Module in SLAM for Dynamic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yanpeng Jia</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+F">Fengkui Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yandong Tang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+S">Shiliang Shao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lianqing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05981v1-abstract-short" style="display: inline;"> Most LiDAR odometry and SLAM systems construct maps in point clouds, which are discrete and sparse when zoomed in, making them not directly suitable for navigation. Mesh maps represent a dense and continuous map format with low memory consumption, which can approximate complex structures with simple elements, attracting significant attention of researchers in recent years. However, most implementa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05981v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05981v1-abstract-full" style="display: none;"> Most LiDAR odometry and SLAM systems construct maps in point clouds, which are discrete and sparse when zoomed in, making them not directly suitable for navigation. Mesh maps represent a dense and continuous map format with low memory consumption, which can approximate complex structures with simple elements, attracting significant attention of researchers in recent years. However, most implementations operate under a static environment assumption. In effect, moving objects cause ghosting, potentially degrading the quality of meshing. To address these issues, we propose a plug-and-play meshing module adapting to dynamic environments, which can easily integrate with various LiDAR odometry to generally improve the pose estimation accuracy of odometry. In our meshing module, a novel two-stage coarse-to-fine dynamic removal method is designed to effectively filter dynamic objects, generating consistent, accurate, and dense mesh maps. To our best know, this is the first mesh construction method with explicit dynamic removal. Additionally, conducive to Gaussian process in mesh construction, sliding window-based keyframe aggregation and adaptive downsampling strategies are used to ensure the uniformity of point cloud. We evaluate the localization and mapping accuracy on five publicly available datasets. Both qualitative and quantitative results demonstrate the superiority of our method compared with the state-of-the-art algorithms. The code and introduction video are publicly available at https://yaepiii.github.io/CAD-Mesher/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05981v1-abstract-full').style.display = 'none'; document.getElementById('2408.05981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05457">arXiv:2408.05457</a> <span> [<a href="https://arxiv.org/pdf/2408.05457">pdf</a>, <a href="https://arxiv.org/format/2408.05457">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Investigating Instruction Tuning Large Language Models on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kerui Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bo-Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bowen Jin</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yizhu Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+M">Ming Zhong</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kevin Chang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shou-De Lin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05457v1-abstract-short" style="display: inline;"> Inspired by the recent advancements of Large Language Models (LLMs) in NLP tasks, there's growing interest in applying LLMs to graph-related tasks. This study delves into the capabilities of instruction-following LLMs for engaging with real-world graphs, aiming to offer empirical insights into how LLMs can effectively interact with graphs and generalize across graph tasks. We begin by constructing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05457v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05457v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05457v1-abstract-full" style="display: none;"> Inspired by the recent advancements of Large Language Models (LLMs) in NLP tasks, there's growing interest in applying LLMs to graph-related tasks. This study delves into the capabilities of instruction-following LLMs for engaging with real-world graphs, aiming to offer empirical insights into how LLMs can effectively interact with graphs and generalize across graph tasks. We begin by constructing a dataset designed for instruction tuning, which comprises a diverse collection of 79 graph-related tasks from academic and e-commerce domains, featuring 44,240 training instances and 18,960 test samples. Utilizing this benchmark, our initial investigation focuses on identifying the optimal graph representation that serves as a conduit for LLMs to understand complex graph structures. Our findings indicate that JSON format for graph representation consistently outperforms natural language and code formats across various LLMs and graph types. Furthermore, we examine the key factors that influence the generalization abilities of instruction-tuned LLMs by evaluating their performance on both in-domain and out-of-domain graph tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05457v1-abstract-full').style.display = 'none'; document.getElementById('2408.05457v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05404">arXiv:2408.05404</a> <span> [<a href="https://arxiv.org/pdf/2408.05404">pdf</a>, <a href="https://arxiv.org/format/2408.05404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LaiDA: Linguistics-aware In-context Learning with Data Augmentation for Metaphor Components Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongde Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Chenyuan He</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Feiyang Meng</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+C">Changyong Niu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yuxiang Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05404v1-abstract-short" style="display: inline;"> Metaphor Components Identification (MCI) contributes to enhancing machine understanding of metaphors, thereby advancing downstream natural language processing tasks. However, the complexity, diversity, and dependency on context and background knowledge pose significant challenges for MCI. Large language models (LLMs) offer new avenues for accurate comprehension of complex natural language texts du… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05404v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05404v1-abstract-full" style="display: none;"> Metaphor Components Identification (MCI) contributes to enhancing machine understanding of metaphors, thereby advancing downstream natural language processing tasks. However, the complexity, diversity, and dependency on context and background knowledge pose significant challenges for MCI. Large language models (LLMs) offer new avenues for accurate comprehension of complex natural language texts due to their strong semantic analysis and extensive commonsense knowledge. In this research, a new LLM-based framework is proposed, named Linguistics-aware In-context Learning with Data Augmentation (LaiDA). Specifically, ChatGPT and supervised fine-tuning are utilized to tailor a high-quality dataset. LaiDA incorporates a simile dataset for pre-training. A graph attention network encoder generates linguistically rich feature representations to retrieve similar examples. Subsequently, LLM is fine-tuned with prompts that integrate linguistically similar examples. LaiDA ranked 2nd in Subtask 2 of NLPCC2024 Shared Task 9, demonstrating its effectiveness. Code and data are available at https://github.com/WXLJZ/LaiDA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05404v1-abstract-full').style.display = 'none'; document.getElementById('2408.05404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by NLPCC 2024 Shared Tasks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04873">arXiv:2408.04873</a> <span> [<a href="https://arxiv.org/pdf/2408.04873">pdf</a>, <a href="https://arxiv.org/format/2408.04873">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Episode Detection for Large-Scale News Events </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kargupta%2C+P">Priyanka Kargupta</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yizhu Jiao</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+S">Siru Ouyang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04873v1-abstract-short" style="display: inline;"> Episodic structures are inherently interpretable and adaptable to evolving large-scale key events. However, state-of-the-art automatic event detection methods overlook event episodes and, therefore, struggle with these crucial characteristics. This paper introduces a novel task, episode detection, aimed at identifying episodes from a news corpus containing key event articles. An episode describes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04873v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04873v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04873v1-abstract-full" style="display: none;"> Episodic structures are inherently interpretable and adaptable to evolving large-scale key events. However, state-of-the-art automatic event detection methods overlook event episodes and, therefore, struggle with these crucial characteristics. This paper introduces a novel task, episode detection, aimed at identifying episodes from a news corpus containing key event articles. An episode describes a cohesive cluster of core entities (e.g., "protesters", "police") performing actions at a specific time and location. Furthermore, an episode is a significant part of a larger group of episodes under a particular key event. Automatically detecting episodes is challenging because, unlike key events and atomic actions, we cannot rely on explicit mentions of times and locations to distinguish between episodes or use semantic similarity to merge inconsistent episode co-references. To address these challenges, we introduce EpiMine, an unsupervised episode detection framework that (1) automatically identifies the most salient, key-event-relevant terms and segments, (2) determines candidate episodes in an article based on natural episodic partitions estimated through shifts in discriminative term combinations, and (3) refines and forms final episode clusters using large language model-based reasoning on the candidate episodes. We construct three diverse, real-world event datasets annotated at the episode level. EpiMine outperforms all baselines on these datasets by an average 59.2% increase across all metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04873v1-abstract-full').style.display = 'none'; document.getElementById('2408.04873v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jia%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository