
Search | arXiv e-print repository

Showing 1–50 of 659 results for author: Yuan, X

Searching in archive cs (search in all archives: https://arxiv.org/search/?searchtype=author&query=Yuan%2C+X). Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2502.09817 (https://arxiv.org/abs/2502.09817) — cs.IT (Information Theory)
   Title: Vector Linear Secure Aggregation
   Authors: Xihang Yuan, Hua Sun
   Abstract: The secure summation problem, where $K$ users wish to compute the sum of their inputs at a server while revealing nothing about all $K$ inputs beyond the desired sum, is generalized in two aspects: first, the desired function is an arbitrary linear function (multiple linear combinations) of the $K$ inputs instead of just the sum; second, rather than protecting all $K$ inputs, we wish to guarantee that no information is leaked about an arbitrary linear function of the $K$ inputs. For this vector linear generalization of the secure summation problem, we characterize the optimal randomness cost, i.e., to compute one instance of the desired vector linear function, the minimum number of random key variables held by the users equals the dimension of the vector space that is in the span of the coefficient vectors of the linear function to protect but not in the span of the coefficient vectors of the linear function to compute.
   Submitted 13 February, 2025; originally announced February 2025.
2. arXiv:2502.09082 (https://arxiv.org/abs/2502.09082) — cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: CoSER: Coordinating LLM-Based Persona Simulation of Established Roles
   Authors: Xintao Wang, Heng Wang, Yifei Zhang, Xinfeng Yuan, Rui Xu, Jen-tse Huang, Siyu Yuan, Haoran Guo, Jiangjie Chen, Wei Wang, Yanghua Xiao, Shuchang Zhou
   Abstract: Role-playing language agents (RPLAs) have emerged as promising applications of large language models (LLMs). However, simulating established characters is challenging for RPLAs because of the lack of authentic character datasets and of nuanced evaluation methods that use such data. In this paper, we present CoSER, a collection comprising a high-quality dataset, open models, and an evaluation protocol for effective RPLAs of established characters. The CoSER dataset covers 17,966 characters from 771 renowned books. It provides authentic dialogues with real-world intricacies, as well as diverse data types such as conversation setups, character experiences, and internal thoughts. Drawing from acting methodology, we introduce given-circumstance acting for training and evaluating role-playing LLMs, where LLMs sequentially portray multiple characters in book scenes. Using our dataset, we develop CoSER 8B and CoSER 70B, advanced open role-playing LLMs built on LLaMA-3.1 models. Extensive experiments demonstrate the value of the CoSER dataset for RPLA training, evaluation, and retrieval. Moreover, CoSER 70B exhibits state-of-the-art performance, surpassing or matching GPT-4o on our evaluation and three existing benchmarks, achieving 75.80% and 93.47% accuracy on the InCharacter and LifeChoice benchmarks respectively.
   Submitted 13 February, 2025; originally announced February 2025.

3. arXiv:2502.08381 (https://arxiv.org/abs/2502.08381) — cs.NI (Networking and Internet Architecture)
   Title: The MoE-Empowered Edge LLMs Deployment: Architecture, Challenges, and Opportunities
   Authors: Ning Li, Song Guo, Tuo Zhang, Muqing Li, Zicong Hong, Qihua Zhou, Xin Yuan, Haijun Zhang
   Abstract: The power of LLMs suggests that deploying LLMs of different scales and architectures across end, edge, and cloud, to satisfy different requirements and adapt to heterogeneous hardware, is a critical path to ubiquitous intelligence for 6G. However, the massive parameter scale of LLMs poses significant challenges to deploying them on edge devices due to high computational and storage demands. Considering that the sparse activation of Mixture of Experts (MoE) enables scalable and dynamic allocation of computational and communication resources at the edge, this paper proposes a novel MoE-empowered collaborative deployment framework for edge LLMs, denoted CoEL. The framework fully leverages the properties of the MoE architecture and encompasses four key aspects: perception, deployment, compression, and updating. Edge servers broadcast their resource status and the specific resource requirements of LLMs to their neighbors. Utilizing this data, two deployment strategies are proposed to cover varying model scales: one deploys an LLM on a single edge device through intra-device resource collaboration, and the other distributes deployment across multiple edge devices via inter-device resource collaboration. Furthermore, both the models and the intermediate data are compressed: quantization reduces the memory footprint, while token fusion and pruning reduce the volume of intermediate data. Finally, given the dynamics of network topology, resource status, and user requirements, the deployment strategies are regularly updated to maintain their relevance and effectiveness. The paper also delineates the challenges and potential research directions for the deployment of edge LLMs.
   Submitted 12 February, 2025; originally announced February 2025.
   Comments: 7 pages, 1 table, 6 figures

4. arXiv:2502.04711 (https://arxiv.org/abs/2502.04711) — cs.SD (Sound); eess.AS (Audio and Speech Processing)
   Title: Dynamic Frequency-Adaptive Knowledge Distillation for Speech Enhancement
   Authors: Xihao Yuan, Siqi Liu, Hanting Chen, Lu Zhou, Jian Li, Jie Hu
   Abstract: Deep learning-based speech enhancement (SE) models have recently outperformed traditional techniques, yet their deployment on resource-constrained devices remains challenging due to high computational and memory demands. This paper introduces a novel dynamic frequency-adaptive knowledge distillation (DFKD) approach to effectively compress SE models. Our method dynamically assesses the model's output, distinguishing between high- and low-frequency components, and adapts the learning objectives to the distinct requirements of different frequency bands, capitalizing on the SE task's inherent characteristics. To evaluate DFKD's efficacy, we conducted experiments on three state-of-the-art models: DCCRN, ConTasNet, and DPTNet. The results demonstrate that our method not only significantly enhances the performance of the compressed (student) model but also surpasses other logit-based knowledge distillation methods specifically for SE tasks.
   Submitted 7 February, 2025; originally announced February 2025.
   Comments: 5 pages, 2 figures, accepted by ICASSP 2025
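To make the frequency-adaptive objective concrete, here is a minimal PyTorch sketch of a distillation loss that splits teacher and student magnitude spectra at a cutoff bin and reweights each band by the student's current error there. The band split, the weighting scheme, and the name dfkd_loss are our illustrative guesses, not the paper's published objective.

    import torch
    import torch.nn.functional as F

    def dfkd_loss(student_spec, teacher_spec, cutoff_bin, alpha=0.5):
        # student_spec / teacher_spec: (batch, freq_bins, frames) magnitude spectra.
        s_low, s_high = student_spec[:, :cutoff_bin], student_spec[:, cutoff_bin:]
        t_low, t_high = teacher_spec[:, :cutoff_bin], teacher_spec[:, cutoff_bin:]
        # Per-band distillation errors against the teacher.
        err_low = F.l1_loss(s_low, t_low)
        err_high = F.l1_loss(s_high, t_high)
        # Weight each band by the student's relative error there, so the
        # objective adapts to whichever band currently lags the teacher most.
        total = err_low + err_high + 1e-8
        w_low, w_high = (err_low / total).detach(), (err_high / total).detach()
        banded = w_low * err_low + w_high * err_high
        return alpha * banded + (1 - alpha) * F.l1_loss(student_spec, teacher_spec)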
5. arXiv:2502.03674 (https://arxiv.org/abs/2502.03674) — cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: An Empirical Study of Methods for Small Object Detection from Satellite Imagery
   Authors: Xiaohui Yuan, Aniv Chakravarty, Lichuan Gu, Zhenchun Wei, Elinor Lichtenberg, Tian Chen
   Abstract: This paper reviews object detection methods for finding small objects in remote sensing imagery and provides an empirical evaluation of four state-of-the-art methods to gain insights into method performance and technical challenges. In particular, we use car detection from urban satellite images and bee box detection from satellite images of agricultural lands as application scenarios. Drawing from the existing surveys and literature, we identify several top-performing methods for the empirical study. Public, high-resolution satellite image datasets are used in our experiments.
   Submitted 5 February, 2025; originally announced February 2025.

6. arXiv:2501.16823 (https://arxiv.org/abs/2501.16823) — cs.IT (Information Theory)
   Title: Phase Noise Resilient Codebook Design for Sparse Code Multiple Access
   Authors: Haibo Liu, Qu Luo, Zilong Liu, Shan Luo, Pei Xiao, Xiaojun Yuan
   Abstract: Sparse code multiple access (SCMA) is a promising technique for future machine-type communication systems due to its superior spectral efficiency and capability for supporting massive connectivity. This paper proposes a novel class of sparse codebooks to improve the error rate performance of SCMA in the presence of phase noise (PN). Specifically, we first analyze the error rate performance of PN-impaired SCMA by looking into the pair-wise error probability. Then, a novel codebook design metric, called the minimum PN metric (MPNM), is proposed. In addition, to design PN-resilient codebooks, we propose a novel pulse-amplitude modulation (PAM)-based low-projection mother constellation (LP-MC), called LP-PAM. The codebooks for different users are obtained by rotating and scaling the MC, where the phase rotation angles and scaling factors for different users are optimized by maximizing the proposed MPNM. Numerical results show that the proposed PN-resilient codebooks (PNCBs) have larger MPNM values and achieve better error rate performance than state-of-the-art codebooks.
   Submitted 28 January, 2025; originally announced January 2025.
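The construction described in this abstract (each user's codebook is a rotated and scaled copy of a common mother constellation) can be illustrated in a few lines of Python. The angles and scaling factors below are placeholders; in the paper they are chosen by maximizing the MPNM, which is not reproduced here.

    import numpy as np

    M = 4                                               # codebook size per user
    # 4-PAM mother constellation, normalized to unit average energy.
    mother = np.array([-3.0, -1.0, 1.0, 3.0]) / np.sqrt(5.0)
    # Per-user rotation angles and scaling factors (placeholders, not the
    # MPNM-optimized values from the paper).
    thetas = [0.0, np.pi / 6, np.pi / 3]
    scales = [1.0, 0.9, 0.8]
    # Each user's codebook: scaled, phase-rotated copy of the mother constellation.
    codebooks = [s * np.exp(1j * th) * mother for th, s in zip(thetas, scales)]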
7. arXiv:2501.14249 (https://arxiv.org/abs/2501.14249) — cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
   Title: Humanity's Last Exam
   Authors: Long Phan, Alice Gatti, Ziwen Han, Nathaniel Li, Josephina Hu, Hugh Zhang, Chen Bo Calvin Zhang, Mohamed Shaaban, John Ling, Sean Shi, Michael Choi, Anish Agrawal, Arnav Chopra, Adam Khoja, Ryan Kim, Richard Ren, Jason Hausenloy, Oliver Zhang, Mantas Mazeika, Tung Nguyen, Daron Anderson, Imad Ali Shah, Mikhail Doroshenko, Alun Cennyth Stokes, Mobeen Mahmood, et al. (710 additional authors not shown)
   Abstract: Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai.
   Submitted 12 February, 2025; v1 submitted 24 January, 2025; originally announced January 2025.
   Comments: 26 pages, 6 figures

8. arXiv:2501.13351 (https://arxiv.org/abs/2501.13351) — cs.CR (Cryptography and Security)
   DOI: 10.1145/3696410.3714593
   Title: 50 Shades of Deceptive Patterns: A Unified Taxonomy, Multimodal Detection, and Security Implications
   Authors: Zewei Shi, Ruoxi Sun, Jieshan Chen, Jiamou Sun, Minhui Xue, Yansong Gao, Feng Liu, Xingliang Yuan
   Abstract: Deceptive patterns (DPs) are user interface designs deliberately crafted to manipulate users into unintended decisions, often by exploiting cognitive biases for the benefit of companies or services. While numerous studies have explored ways to identify these deceptive patterns, many existing solutions require significant human intervention and struggle to keep pace with the evolving nature of deceptive designs. To address these challenges, we expanded the deceptive pattern taxonomy from security and privacy perspectives, refining its categories and scope. We created a comprehensive dataset of deceptive patterns by integrating existing small-scale datasets with new samples, resulting in 6,725 images and 10,421 DP instances from mobile apps and websites. We then developed DPGuard, a novel automatic tool leveraging commercial multimodal large language models (MLLMs) for deceptive pattern detection. Experimental results show that DPGuard outperforms state-of-the-art methods. Finally, we conducted an extensive empirical evaluation on 2,000 popular mobile apps and websites, revealing that 23.61% of mobile screenshots and 47.27% of website screenshots feature at least one deceptive pattern instance. Through four unexplored case studies that inform security implications, we highlight the critical importance of the unified taxonomy in addressing the growing challenges of Internet deception.
   Submitted 3 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
   Comments: This paper has been accepted by The Web Conference 2025

9. arXiv:2501.12956 (https://arxiv.org/abs/2501.12956) — cs.LG (Machine Learning); cs.AI (Artificial Intelligence); math.OC (Optimization and Control)
   Title: GANQ: GPU-Adaptive Non-Uniform Quantization for Large Language Models
   Authors: Pengxiang Zhao, Xiaoming Yuan
   Abstract: Large Language Models (LLMs) face significant deployment challenges due to their substantial resource requirements. While low-bit quantized weights can reduce memory usage and improve inference efficiency, current hardware lacks native support for mixed-precision General Matrix Multiplication (mpGEMM), resulting in inefficient dequantization-based implementations. Moreover, uniform quantization methods often fail to capture weight distributions adequately, leading to performance degradation. We propose GANQ (GPU-Adaptive Non-Uniform Quantization), a layer-wise post-training non-uniform quantization framework optimized for hardware-efficient lookup-table-based mpGEMM. GANQ achieves superior quantization performance by utilizing a training-free, GPU-adaptive optimization algorithm to efficiently reduce layer-wise quantization errors. Extensive experiments demonstrate GANQ's ability to reduce the perplexity gap from the FP16 baseline compared to state-of-the-art methods for both 3-bit and 4-bit quantization. Furthermore, when deployed on a single NVIDIA RTX 4090 GPU, GANQ's quantized models achieve up to 2.57× speedup over the baseline, advancing memory and inference efficiency in LLM deployment.
   Submitted 11 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
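The lookup-table idea underlying non-uniform quantization of this kind can be sketched as follows: each weight is replaced by a small integer index into a per-row codebook of centroids, and dequantization is a table lookup. The quantile-based codebook fitting below is a simplification for illustration only; GANQ's training-free, GPU-adaptive optimizer is not shown.

    import numpy as np

    def lut_quantize(W, bits=4):
        # W: (rows, cols) weight matrix. Fit a per-row codebook of 2**bits
        # centroids via quantile binning (a stand-in for GANQ's optimizer).
        K = 2 ** bits
        codebooks = np.quantile(W, (np.arange(K) + 0.5) / K, axis=1).T  # (rows, K)
        # Each weight stores only the index of its nearest centroid.
        idx = np.abs(W[:, :, None] - codebooks[:, None, :]).argmin(-1)  # (rows, cols)
        return idx.astype(np.uint8), codebooks.astype(np.float16)

    def lut_dequantize(idx, codebooks):
        # Dequantization is a pure table lookup along each row.
        return np.take_along_axis(codebooks.astype(np.float32),
                                  idx.astype(np.int64), axis=1)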
10. arXiv:2501.12314 (https://arxiv.org/abs/2501.12314) — stat.ML (Machine Learning); cs.LG (Machine Learning)
    Title: Uncertainty Quantification With Noise Injection in Neural Networks: A Bayesian Perspective
    Authors: Xueqiong Yuan, Jipeng Li, Ercan Engin Kuruoglu
    Abstract: Model uncertainty quantification involves measuring and evaluating the uncertainty linked to a model's predictions, helping assess their reliability and confidence. Noise injection is a technique used to enhance the robustness of neural networks by introducing randomness. In this paper, we establish a connection between noise injection and uncertainty quantification from a Bayesian standpoint. We theoretically demonstrate that injecting noise into the weights of a neural network is equivalent to Bayesian inference on a deep Gaussian process. Consequently, we introduce a Monte Carlo Noise Injection (MCNI) method, which involves injecting noise into the parameters during training and performing multiple forward propagations during inference to estimate the uncertainty of the prediction. Through simulation and experiments on regression and classification tasks, our method demonstrates superior performance compared to the baseline model.
    Submitted 21 January, 2025; originally announced January 2025.
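The inference procedure described in this abstract (multiple noisy forward passes, then aggregate) is straightforward to sketch. A minimal PyTorch version, assuming zero-mean Gaussian noise of a fixed scale sigma on all parameters (the paper's exact noise model may differ):

    import torch

    @torch.no_grad()
    def mcni_predict(model, x, sigma=0.01, n_samples=20):
        # Run several forward passes, each with freshly perturbed weights.
        outs = []
        params = list(model.parameters())
        for _ in range(n_samples):
            noises = [sigma * torch.randn_like(p) for p in params]
            for p, n in zip(params, noises):
                p.add_(n)                     # inject noise
            outs.append(model(x))
            for p, n in zip(params, noises):
                p.sub_(n)                     # restore original weights
        outs = torch.stack(outs)              # (n_samples, batch, ...)
        # Mean across passes is the prediction; spread is the uncertainty estimate.
        return outs.mean(0), outs.std(0)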
11. arXiv:2501.10985 (https://arxiv.org/abs/2501.10985) — cs.LG (Machine Learning); cs.CR (Cryptography and Security)
    Title: GRID: Protecting Training Graph from Link Stealing Attacks on GNN Models
    Authors: Jiadong Lou, Xu Yuan, Rui Zhang, Xingliang Yuan, Neil Gong, Nian-Feng Tzeng
    Abstract: Graph neural networks (GNNs) have exhibited superior performance in various classification tasks on graph-structured data. However, they are potentially vulnerable to link stealing attacks, which can infer the presence of a link between two nodes by measuring the similarity of the prediction vectors that a GNN model produces for the link's incident nodes. Such attacks pose severe security and privacy threats to the training graph used in GNN models. In this work, we propose a novel solution, called Graph Link Disguise (GRID), to defend against link stealing attacks with a formal guarantee of GNN model utility, retaining prediction accuracy. The key idea of GRID is to add carefully crafted noises to the nodes' prediction vectors to disguise adjacent nodes as n-hop indirect neighboring nodes. We take into account the graph topology and select only a subset of nodes (called core nodes) covering all links for adding noises, which averts the noises offsetting each other and has the further advantages of reducing both the distortion loss and the computation cost. Our crafted noises ensure that 1) the noisy prediction vectors of any two adjacent nodes have a similarity level like that of two non-adjacent nodes and 2) the model prediction is unchanged, guaranteeing zero utility loss. Extensive experiments on five datasets show the effectiveness of our proposed GRID solution against different representative link stealing attacks under transductive and inductive settings respectively, as well as against two influence-based attacks. Meanwhile, it achieves a much better privacy-utility trade-off than existing methods when extended to GNNs.
    Submitted 19 January, 2025; originally announced January 2025.
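As a toy illustration of the zero-utility-loss constraint described above, the sketch below perturbs a node's prediction vector only when the noise leaves the argmax (and hence the predicted label) unchanged. GRID's actual noises are crafted against the graph topology and neighbor-similarity statistics; none of that optimization is reproduced here.

    import numpy as np

    def disguise(pred, rng, eps=0.05):
        # Perturb a prediction vector; reject any noise that would flip the
        # predicted label, so model utility is preserved by construction.
        noisy = pred + rng.normal(0.0, eps, size=pred.shape)
        if noisy.argmax() != pred.argmax():
            return pred
        return noisy

    rng = np.random.default_rng(0)
    p = np.array([0.7, 0.2, 0.1])
    print(disguise(p, rng))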
12. arXiv:2501.10011 (https://arxiv.org/abs/2501.10011) — cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
    Title: Mitigating Hallucinations on Object Attributes using Multiview Images and Negative Instructions
    Authors: Zhijie Tan, Yuzhi Li, Shengwei Meng, Xiang Yuan, Weiping Li, Tong Mo, Bingce Wang, Xu Chu
    Abstract: Current popular Large Vision-Language Models (LVLMs) suffer from Hallucinations on Object Attributes (HoOA), leading to incorrect determination of fine-grained attributes in the input images. Leveraging significant advancements in 3D generation from a single image, this paper proposes a novel method to mitigate HoOA in LVLMs. This method uses multiview images sampled from generated 3D representations as visual prompts for LVLMs, thereby providing visual information from additional viewpoints. Furthermore, we observe that the input order of multiple multiview images significantly affects the performance of LVLMs. Consequently, we devise the Multiview Image Augmented VLM (MIAVLM), incorporating a Multiview Attributes Perceiver (MAP) submodule capable of simultaneously eliminating the influence of input image order and aligning visual information from multiview images with Large Language Models (LLMs). Besides, we design and employ negative instructions to mitigate LVLMs' bias towards "Yes" responses. Comprehensive experiments demonstrate the effectiveness of our method.
    Submitted 17 January, 2025; originally announced January 2025.
    Comments: 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)

13. arXiv:2501.09412 (https://arxiv.org/abs/2501.09412) — cs.LG (Machine Learning)
    Title: FASP: Fast and Accurate Structured Pruning of Large Language Models
    Authors: Hanyu Hu, Pengxiang Zhao, Ping Li, Yi Zheng, Zhefeng Wang, Xiaoming Yuan
    Abstract: The rapid increase in the size of large language models (LLMs) has significantly escalated their computational and memory demands, posing challenges for efficient deployment, especially on resource-constrained devices. Structured pruning has emerged as an effective model compression method that can reduce these demands while preserving performance. In this paper, we introduce FASP (Fast and Accurate Structured Pruning), a novel structured pruning framework for LLMs that emphasizes both speed and accuracy. FASP employs a distinctive pruning structure that interlinks sequential layers, allowing the removal of columns in one layer to simultaneously eliminate corresponding rows in the preceding layer without incurring additional performance loss. The pruning metric, inspired by Wanda, is computationally efficient and effectively selects components to prune. Additionally, we propose a restoration mechanism that enhances model fidelity by adjusting the remaining weights post-pruning. We evaluate FASP on the OPT and LLaMA model families, demonstrating superior performance in terms of perplexity and accuracy on downstream tasks compared to state-of-the-art methods. Our approach achieves significant speed-ups, pruning models such as OPT-125M in 17 seconds and LLaMA-30B in 15 minutes on a single NVIDIA RTX 4090 GPU, making it a highly practical solution for optimizing LLMs.
    Submitted 16 January, 2025; originally announced January 2025.
The pruning metric, inspired by Wanda, is computationally efficient and effectively selects components to prune. Additionally, we propose a restoration mechanism that enhances model fidelity by adjusting the remaining weights post-pruning. We evaluate FASP on the OPT and LLaMA model families, demonstrating superior performance in terms of perplexity and accuracy on downstream tasks compared to state-of-the-art methods. Our approach achieves significant speed-ups, pruning models such as OPT-125M in 17 seconds and LLaMA-30B in 15 minutes on a single NVIDIA RTX 4090 GPU, making it a highly practical solution for optimizing LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09412v1-abstract-full').style.display = 'none'; document.getElementById('2501.09412v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07296">arXiv:2501.07296</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07296">pdf</a>, <a href="https://arxiv.org/format/2501.07296">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Event-based Video Person Re-identification via Cross-Modality and Temporal Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Renkai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07296v1-abstract-short" style="display: inline;"> Video-based person re-identification (ReID) has become increasingly important due to its applications in video surveillance applications. By employing events in video-based person ReID, more motion information can be provided between continuous frames to improve recognition accuracy. Previous approaches have assisted by introducing event data into the video person ReID task, but they still cannot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07296v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07296v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07296v1-abstract-full" style="display: none;"> Video-based person re-identification (ReID) has become increasingly important due to its applications in video surveillance applications. By employing events in video-based person ReID, more motion information can be provided between continuous frames to improve recognition accuracy. Previous approaches have assisted by introducing event data into the video person ReID task, but they still cannot avoid the privacy leakage problem caused by RGB images. 
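The column/row interlinking is easiest to see on a two-layer MLP: dropping hidden unit j removes column j of the second weight matrix and row j of the first, so the pruned layers stay dimensionally consistent. The sketch below scores hidden units with a Wanda-style |W| times activation-norm importance; it is a simplification for illustration, not FASP's exact metric or its restoration step.

    import torch

    def prune_hidden_units(W1, W2, x_norm, keep_ratio=0.5):
        # W1: (hidden, in), W2: (out, hidden); x_norm: (hidden,) per-unit
        # activation norms collected on calibration data.
        score = (W2.abs() * x_norm).sum(dim=0)       # importance per hidden unit
        k = int(keep_ratio * score.numel())
        keep = score.topk(k).indices.sort().values   # units to retain
        # Row j of W1 and column j of W2 are removed together.
        return W1[keep, :], W2[:, keep]

    W1, W2 = torch.randn(512, 128), torch.randn(128, 512)
    W1p, W2p = prune_hidden_units(W1, W2, torch.rand(512))
    print(W1p.shape, W2p.shape)                      # (256, 128), (128, 256)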
arXiv:2501.07296 (abs: https://arxiv.org/abs/2501.07296, pdf: https://arxiv.org/pdf/2501.07296) [cs.CV]
Title: Event-based Video Person Re-identification via Cross-Modality and Temporal Collaboration
Authors: Renkai Li, Xin Yuan, Wei Liu, Xin Xu
Abstract: Video-based person re-identification (ReID) has become increasingly important due to its applications in video surveillance. Employing events alongside frames provides additional motion information between continuous frames and can improve recognition accuracy. Previous approaches have introduced event data into the video person ReID task as an aid, but they still cannot avoid the privacy leakage caused by RGB images. To avoid privacy attacks while retaining the benefits of event data, we consider using only event data. To make full use of the information in the event stream, we propose a Cross-Modality and Temporal Collaboration (CMTC) network for event-based video person ReID. First, we design an event transform network to obtain corresponding auxiliary information from the raw event input. Additionally, we propose a differential modality collaboration module to balance the roles of events and auxiliaries to achieve complementary effects. Furthermore, we introduce a temporal collaboration module to exploit motion information and appearance cues. Experimental results demonstrate that our method outperforms others in the task of event-based video person ReID.
Submitted 13 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025

arXiv:2501.07262 (abs: https://arxiv.org/abs/2501.07262, pdf: https://arxiv.org/pdf/2501.07262) [cs.CR]
DOI: https://doi.org/10.1145/3708821.3710826
Title: OblivCDN: A Practical Privacy-preserving CDN with Oblivious Content Access
Authors: Viet Vo, Shangqi Lai, Xingliang Yuan, Surya Nepal, Qi Li
Abstract: Content providers increasingly utilise Content Delivery Networks (CDNs) to enhance users' content download experience. However, this deployment scenario raises significant security concerns regarding content confidentiality and user privacy due to the involvement of third-party providers. Prior proposals using private information retrieval (PIR) and oblivious RAM (ORAM) have proven impractical due to high computation and communication costs, as well as integration challenges within distributed CDN architectures. In response, we present OblivCDN, a practical privacy-preserving system meticulously designed for seamless integration with the existing real-world Internet-CDN infrastructure. Our design strategically adapts Range ORAM primitives to optimise memory and disk seeks when accessing contiguous blocks of CDN content, both at the origin and edge servers, while preserving both content confidentiality and user access pattern hiding. We also carefully customise several oblivious building blocks that integrate the distributed trust model into the ORAM client, thereby eliminating the computational bottleneck at the origin server and reducing communication costs between the origin server and edge servers. Moreover, the newly designed ORAM client eliminates the need for trusted hardware on edge servers, significantly improving compatibility with networks full of legacy devices. In real-world streaming evaluations, OblivCDN downloads a 256 MB video in just 5.6 seconds, a 90x speedup over a strawman approach (direct ORAM adoption) and a 366x improvement over the prior art, OblivP2P.
Submitted 13 January, 2025; originally announced January 2025.
Comments: The 20th ACM ASIA Conference on Computer and Communications Security (ACM ASIACCS 2025)

arXiv:2501.07194 (abs: https://arxiv.org/abs/2501.07194, pdf: https://arxiv.org/pdf/2501.07194) [cs.CV]
Title: VAGeo: View-specific Attention for Cross-View Object Geo-Localization
Authors: Zhongyang Li, Xin Yuan, Wei Liu, Xin Xu
Abstract: Cross-view object geo-localization (CVOGL) aims to locate an object of interest, captured in a ground- or drone-view image, within a satellite image. However, existing works treat ground-view and drone-view query images equivalently, overlooking their inherent viewpoint discrepancies and the spatial correlation between the query image and the satellite-view reference image. To this end, this paper proposes a novel View-specific Attention Geo-localization method (VAGeo) for accurate CVOGL. Specifically, VAGeo contains two key modules: a view-specific positional encoding (VSPE) module and a channel-spatial hybrid attention (CSHA) module. At the object level, guided by the characteristics of the ground and drone viewpoints, view-specific positional encodings are designed in the VSPE module to more accurately identify the click-point object of the query image. At the feature level, the CSHA module combines channel attention and spatial attention mechanisms to learn discriminative features. Extensive experimental results demonstrate that the proposed VAGeo gains a significant performance improvement, raising acc@0.25/acc@0.5 on the CVOGL dataset from 45.43%/42.24% to 48.21%/45.22% for ground-view queries, and from 61.97%/57.66% to 66.19%/61.87% for drone-view queries.
Submitted 13 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025
arXiv:2501.03775 (abs: https://arxiv.org/abs/2501.03775, pdf: https://arxiv.org/pdf/2501.03775) [cs.CV]
Title: Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection
Authors: Xinbin Yuan, Zhaohui Zheng, Yuxuan Li, Xialei Liu, Li Liu, Xiang Li, Qibin Hou, Ming-Ming Cheng
Abstract: While remote sensing object detection has seen rapid development, detecting objects with high aspect ratios remains challenging. This paper shows that large strip convolutions are good feature representation learners for remote sensing object detection and can detect objects of various aspect ratios well. Based on large strip convolutions, we build a new network architecture called Strip R-CNN, which is simple, efficient, and powerful. Unlike recent remote sensing object detectors that leverage large-kernel convolutions with square shapes, Strip R-CNN takes advantage of sequential orthogonal large strip convolutions to capture spatial information. In addition, we enhance the localization capability of remote sensing object detectors by decoupling the detection heads and equipping the localization head with strip convolutions to better localize the target objects. Extensive experiments on several benchmarks, e.g., DOTA, FAIR1M, HRSC2016, and DIOR, show that Strip R-CNN substantially improves on previous works. Notably, our 30M-parameter model achieves 82.75% mAP on DOTA-v1.0, setting a new state-of-the-art record. Code is available at https://github.com/YXB-NKU/Strip-R-CNN.
Submitted 10 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
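The core operation, sequential orthogonal large strip convolutions, pairs a 1 x k kernel with a k x 1 kernel so that long, thin objects are covered along both axes without a full k x k kernel. A minimal sketch follows; the kernel size and the depthwise choice are illustrative assumptions, not the paper's exact configuration.

    import torch
    import torch.nn as nn

    class StripConvBlock(nn.Module):
        """Sequential orthogonal large strip convolutions (1 x k, then k x 1)."""
        def __init__(self, c, k=19):
            super().__init__()
            self.h = nn.Conv2d(c, c, (1, k), padding=(0, k // 2), groups=c)
            self.v = nn.Conv2d(c, c, (k, 1), padding=(k // 2, 0), groups=c)

        def forward(self, x):
            # A long horizontal pass followed by a long vertical pass captures
            # spatial context for high-aspect-ratio objects.
            return self.v(self.h(x))

    x = torch.randn(1, 32, 64, 64)
    print(StripConvBlock(32)(x).shape)               # torch.Size([1, 32, 64, 64])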
arXiv:2501.03571 (abs: https://arxiv.org/abs/2501.03571, pdf: https://arxiv.org/pdf/2501.03571) [cs.LG, cs.SD, eess.AS, q-bio.NC]
Title: AADNet: Exploring EEG Spatiotemporal Information for Fast and Accurate Orientation and Timbre Detection of Auditory Attention Based on A Cue-Masked Paradigm
Authors: Keren Shi, Xu Liu, Xue Yuan, Haijie Shang, Ruiting Dai, Hanbin Wang, Yunfa Fu, Ning Jiang, Jiayuan He
Abstract: Auditory attention decoding from electroencephalogram (EEG) can infer which source a user is attending to in noisy environments. Decoding algorithms and experimental paradigm designs are crucial for developing this technology for practical applications. To simulate real-world scenarios, this study proposed a cue-masked auditory attention paradigm that avoids information leakage before the experiment. To obtain high decoding accuracy with low latency, an end-to-end deep learning model, AADNet, was proposed to exploit the spatiotemporal information in short time windows of EEG signals. The results showed that with a 0.5-second EEG window, AADNet achieved average accuracies of 93.46% and 91.09% in decoding auditory orientation attention (OA) and timbre attention (TA), respectively. It significantly outperformed five previous methods and did not require knowledge of the original audio source. This work demonstrates that the orientation and timbre of auditory attention can be detected from EEG signals quickly and accurately. The results are promising for real-time multi-property auditory attention decoding, facilitating the application of neuro-steered hearing aids and other assistive listening devices.
Submitted 7 January, 2025; originally announced January 2025.

arXiv:2501.02905 (abs: https://arxiv.org/abs/2501.02905, pdf: https://arxiv.org/pdf/2501.02905) [cs.LG, cs.AI]
Title: Skillful High-Resolution Ensemble Precipitation Forecasting with an Integrated Deep Learning Framework
Authors: Shuangshuang He, Hongli Liang, Yuanting Zhang, Xingyuan Yuan
Abstract: High-resolution precipitation forecasts are crucial for providing accurate weather prediction and supporting effective responses to extreme weather events. Traditional numerical models struggle with stochastic subgrid-scale processes, while recent deep learning models often produce blurry results. To address these challenges, we propose a physics-inspired deep learning framework for high-resolution (0.05° x 0.05°) ensemble precipitation forecasting. Trained on ERA5 and CMPA high-resolution precipitation datasets, the framework integrates deterministic and probabilistic components. The deterministic model, based on a 3D SwinTransformer, captures average precipitation at mesoscale resolution and incorporates strategies to enhance performance, particularly for moderate to heavy rainfall. The probabilistic model employs conditional diffusion in latent space to account for uncertainties in residual precipitation at convective scales. During inference, ensemble members are generated by repeatedly sampling latent variables, enabling the model to represent precipitation uncertainty. Our model significantly enhances spatial resolution and forecast accuracy. The rank histogram shows that the ensemble system is reliable and unbiased. In a case study of heavy precipitation in southern China, the model outputs align more closely with observed precipitation distributions than ERA5, demonstrating superior capability in capturing extreme precipitation events. Additionally, 5-day real-time forecasts show good performance in terms of CSI scores.
Submitted 6 January, 2025; originally announced January 2025.
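The ensemble step is mechanically simple: the deterministic network predicts a mean field once, and each ensemble member adds a decoded residual obtained from a freshly sampled latent. The toy below replaces the 3D SwinTransformer and the latent diffusion sampler with single convolutions, keeping only the resampling structure; every module here is a stand-in, not the paper's architecture.

    import torch
    import torch.nn as nn

    H = W = 16
    deterministic = nn.Conv2d(1, 1, 3, padding=1)    # stand-in for the 3D SwinTransformer
    decoder = nn.Conv2d(4, 1, 1)                     # stand-in for the latent decoder

    def forecast_ensemble(cond, n_members=10):
        mean = deterministic(cond)                   # deterministic mesoscale forecast
        members = []
        for _ in range(n_members):
            z = torch.randn(1, 4, H, W)              # in the paper, z comes from
            members.append(mean + decoder(z))        # iterative conditional diffusion
        return torch.stack(members)

    ens = forecast_ensemble(torch.rand(1, 1, H, W))
    print(ens.shape, ens.std(dim=0).mean())          # member spread = forecast uncertainty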
arXiv:2501.01262 (abs: https://arxiv.org/abs/2501.01262, pdf: https://arxiv.org/pdf/2501.01262) [cs.CV]
Title: Detail Matters: Mamba-Inspired Joint Unfolding Network for Snapshot Spectral Compressive Imaging
Authors: Mengjie Qin, Yuchao Feng, Zongliang Wu, Yulun Zhang, Xin Yuan
Abstract: In the coded aperture snapshot spectral imaging system, Deep Unfolding Networks (DUNs) have made impressive progress in recovering 3D hyperspectral images (HSIs) from a single 2D measurement. However, the inherent nonlinear and ill-posed characteristics of HSI reconstruction still pose challenges to existing methods in terms of accuracy and stability. To address this issue, we propose a Mamba-inspired Joint Unfolding Network (MiJUN), which integrates physics-embedded DUNs with learning-based HSI imaging. First, leveraging the concept of trapezoid discretization to expand the representation space of unfolding networks, we introduce an accelerated unfolding network scheme. This approach can be interpreted as a generalized accelerated half-quadratic splitting with a second-order differential equation, which reduces the reliance on initial optimization stages and addresses challenges related to long-range interactions. Crucially, within the Mamba framework, we restructure the global-to-local attention mechanism by incorporating a selective state space model together with an attention mechanism, effectively reinterpreting Mamba as a variant of the Transformer architecture and improving its adaptability and efficiency. Furthermore, we refine the scanning strategy by integrating tensor mode-k unfolding into the Mamba network. This emphasizes the low-rank properties of tensors along various modes while conveniently enabling 12 scanning directions. Numerical and visual comparisons on both simulation and real datasets demonstrate the superiority of the proposed MiJUN in recovering fine detail.
Submitted 2 January, 2025; originally announced January 2025.
Comments: 9 pages, 7 figures, AAAI 2025
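Mode-k unfolding, which the scanning strategy builds on, is a standard tensor operation: axis k becomes the row index and all remaining axes are flattened into columns. A minimal PyTorch version:

    import torch

    def mode_k_unfold(t, k):
        # Move axis k to the front and flatten the remaining axes, yielding a
        # (t.shape[k], prod(other dims)) matrix whose rank reflects the
        # tensor's low-rank structure along mode k.
        return t.movedim(k, 0).reshape(t.shape[k], -1)

    x = torch.randn(4, 5, 6)
    print(mode_k_unfold(x, 1).shape)                 # torch.Size([5, 24])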
arXiv:2501.00751 (abs: https://arxiv.org/abs/2501.00751, pdf: https://arxiv.org/pdf/2501.00751) [eess.IV, cs.CV]
Title: HCMA-UNet: A Hybrid CNN-Mamba UNet with Inter-Slice Self-Attention for Efficient Breast Cancer Segmentation
Authors: Haoxuan Li, Wei Song, Peiwu Qin, Xi Yuan, Zhenglin Chen
Abstract: Breast cancer lesion segmentation in DCE-MRI remains challenging due to heterogeneous tumor morphology and indistinct boundaries. To address these challenges, this study proposes a novel hybrid segmentation network, HCMA-UNet, for lesion segmentation of breast cancer. Our network consists of a lightweight CNN backbone and a Multi-view Inter-Slice Self-Attention Mamba (MISM) module. The MISM module integrates a Visual State Space Block (VSSB) and an Inter-Slice Self-Attention (ISSA) mechanism, effectively reducing parameters through an Asymmetric Split Channel (ASC) strategy to achieve efficient tri-directional feature extraction. Our lightweight model achieves superior performance with 2.87M parameters and 126.44 GFLOPs. A Feature-guided Region-aware loss function (FRLoss) is proposed to enhance segmentation accuracy. Extensive experiments on one private and two public DCE-MRI breast cancer datasets demonstrate that our approach achieves state-of-the-art performance while maintaining computational efficiency. FRLoss also exhibits good cross-architecture generalization capabilities. The source code and dataset are available via the link in the paper.
Submitted 1 January, 2025; originally announced January 2025.

arXiv:2412.19483 (abs: https://arxiv.org/abs/2412.19483, pdf: https://arxiv.org/pdf/2412.19483) [cs.CV]
Title: Learning Radiance Fields from a Single Snapshot Compressive Image
Authors: Yunhao Li, Xiang Liu, Xiaodong Wang, Xin Yuan, Peidong Liu
Abstract: In this paper, we explore the potential of the Snapshot Compressive Imaging (SCI) technique for recovering the underlying 3D scene structure from a single temporal compressed image. SCI is a cost-effective method that enables recording of high-dimensional data, such as hyperspectral or temporal information, into a single image using low-cost 2D imaging sensors. To achieve this, a series of specially designed 2D masks is usually employed, reducing storage and transmission requirements and offering potential privacy protection. Inspired by this, we take a step further to recover the encoded 3D scene information, leveraging the powerful 3D scene representation capabilities of neural radiance fields (NeRF). Specifically, we propose SCINeRF, which formulates the physical imaging process of SCI as part of NeRF training, allowing us to exploit its impressive performance in capturing complex scene structures. In addition, we integrate the popular 3D Gaussian Splatting (3DGS) framework and propose SCISplat to improve 3D scene reconstruction quality and training/rendering speed by explicitly optimizing point clouds into 3D Gaussian representations. To assess the effectiveness of our method, we conduct extensive evaluations using both synthetic data and real data captured by our SCI system. Experimental results demonstrate that our proposed approach surpasses state-of-the-art methods in terms of image reconstruction and novel view synthesis. Moreover, our method can render high-frame-rate multi-view consistent images in real time by leveraging SCI and the rendering capabilities of 3DGS. Code will be available at: https://github.com/WU-CVGL/SCISplat.
Submitted 27 December, 2024; originally announced December 2024.
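The SCI forward model that SCINeRF embeds into training is compact: every frame is modulated by its own binary mask, and the modulated frames sum to one 2D snapshot. A minimal sketch of this standard video-SCI measurement:

    import torch

    def sci_forward(frames, masks):
        # frames, masks: (T, H, W). Each frame is modulated by its binary mask
        # and all modulated frames sum into a single 2D snapshot measurement.
        return (frames * masks).sum(dim=0)

    T, H, W = 8, 64, 64
    frames = torch.rand(T, H, W)                     # latent video to recover
    masks = (torch.rand(T, H, W) > 0.5).float()      # per-frame modulation masks
    snapshot = sci_forward(frames, masks)            # (H, W) compressed image

In SCINeRF, `frames` would be rendered from the radiance field, so gradients of a loss on `snapshot` flow back into the 3D scene representation.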
arXiv:2412.18535 (abs: https://arxiv.org/abs/2412.18535, pdf: https://arxiv.org/pdf/2412.18535) [cs.LG, cs.DB]
Title: Graph Structure Learning for Spatial-Temporal Imputation: Adapting to Node and Feature Scales
Authors: Xinyu Yang, Yu Sun, Xinyang Chen, Ying Zhang, Xiaojie Yuan
Abstract: Spatial-temporal data collected across different geographic locations often suffer from missing values, posing challenges to data analysis. Existing methods primarily leverage fixed spatial graphs to impute missing values, implicitly assuming that the spatial relationship is roughly the same for all features across different locations. However, they may overlook the different spatial relationships of diverse features recorded by sensors in different locations. To address this, we introduce the multi-scale Graph Structure Learning framework for spatial-temporal Imputation (GSLI), which dynamically adapts to heterogeneous spatial correlations. Our framework encompasses node-scale graph structure learning to cater to the distinct global spatial correlations of different features, and feature-scale graph structure learning to unveil common spatial correlations across features within all stations. Integrated with prominence modeling, our framework emphasizes nodes and features of greater significance in the imputation process. Furthermore, GSLI incorporates cross-feature and cross-temporal representation learning to capture spatial-temporal dependencies. Evaluated on six real incomplete spatial-temporal datasets, GSLI showcases improvement in data imputation.
Submitted 5 January, 2025; v1 submitted 24 December, 2024; originally announced December 2024.
Comments: This paper has been accepted as a full paper at AAAI 2025

arXiv:2412.16985 (abs: https://arxiv.org/abs/2412.16985, pdf: https://arxiv.org/pdf/2412.16985) [cs.DC]
Title: BladeDISC++: Memory Optimizations Based On Symbolic Shape
Authors: Xiulong Yuan, Xu Yan, Wenting Shen, Xiafei Qiu, Ang Wang, Jie Zhang, Yong Li, Wei Lin
Abstract: Recent deep learning workloads exhibit dynamic characteristics, leading to the rising adoption of dynamic shape compilers. These compilers can generate efficient kernels for dynamic shape graphs characterized by a fixed graph topology and uncertain tensor shapes. However, memory optimization, although particularly crucial in this large-model era, remains relatively underexplored for dynamic shape graphs. The fundamental challenge lies in the lack of precise tensor shapes, which are essential to conventional methods such as operation scheduling (op scheduling) and rematerialization. To address this challenge, we propose op scheduling and rematerialization approaches based on symbolic shapes and develop BladeDISC++. Furthermore, since rematerialization decisions cannot be made solely at compile time when tensor shapes are unknown, BladeDISC++ employs a combined compilation-runtime strategy to optimally address shape dynamics. Evaluations indicate that BladeDISC++ effectively reduces memory usage for dynamic shape graphs, achieving memory consumption comparable to optimizations using precise shapes, thereby promoting the broader adoption of dynamic shape compilers.
Submitted 22 December, 2024; originally announced December 2024.
Journal ref: NeurIPS 2024, https://neurips.cc/virtual/2024/103601
We propose a learnable, part-aware superpoint generation technique that ef&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16656v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16656v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16656v1-abstract-full" style="display: none;"> Manipulating articulated objects with robotic arms is challenging due to the complex kinematic structure, which requires precise part segmentation for efficient manipulation. In this work, we introduce a novel superpoint-based perception method designed to improve part segmentation in 3D point clouds of articulated objects. We propose a learnable, part-aware superpoint generation technique that efficiently groups points based on their geometric and semantic similarities, resulting in clearer part boundaries. Furthermore, by leveraging the segmentation capabilities of the 2D foundation model SAM, we identify the centers of pixel regions and select corresponding superpoints as candidate query points. Integrating a query-based transformer decoder further enhances our method&#39;s ability to achieve precise part segmentation. Experimental results on the GAPartNet dataset show that our method outperforms existing state-of-the-art approaches in cross-category part segmentation, achieving AP50 scores of 77.9% for seen categories (4.4% improvement) and $39.3\%$ for unseen categories (11.6% improvement), with superior results in 5 out of 9 part categories for seen objects and outperforming all previous methods across all part categories for unseen objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16656v1-abstract-full').style.display = 'none'; document.getElementById('2412.16656v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13662">arXiv:2412.13662</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13662">pdf</a>, <a href="https://arxiv.org/format/2412.13662">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> When Should We Prefer State-to-Visual DAgger Over Visual Reinforcement Learning? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mu%2C+T">Tongzhou Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Strzelecki%2C+S+W">Stanis艂aw Wiktor Strzelecki</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiu Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yunchao Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+L">Litian Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+H">Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13662v1-abstract-short" style="display: inline;"> Learning policies from high-dimensional visual inputs, such as pixels and point clouds, is crucial in various applications. Visual reinforcement learning is a promising approach that directly trains policies from visual observations, although it faces challenges in sample efficiency and computational costs. This study conducts an empirical comparison of State-to-Visual DAgger, a two-stage framewor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13662v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13662v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13662v1-abstract-full" style="display: none;"> Learning policies from high-dimensional visual inputs, such as pixels and point clouds, is crucial in various applications. Visual reinforcement learning is a promising approach that directly trains policies from visual observations, although it faces challenges in sample efficiency and computational costs. This study conducts an empirical comparison of State-to-Visual DAgger, a two-stage framework that initially trains a state policy before adopting online imitation to learn a visual policy, and Visual RL across a diverse set of tasks. We evaluate both methods across 16 tasks from three benchmarks, focusing on their asymptotic performance, sample efficiency, and computational costs. Surprisingly, our findings reveal that State-to-Visual DAgger does not universally outperform Visual RL but shows significant advantages in challenging tasks, offering more consistent performance. In contrast, its benefits in sample efficiency are less pronounced, although it often reduces the overall wall-clock time required for training. Based on our findings, we provide recommendations for practitioners and hope that our results contribute valuable perspectives for future research in visual policy learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13662v1-abstract-full').style.display = 'none'; document.getElementById('2412.13662v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
arXiv:2412.13630 (https://arxiv.org/abs/2412.13630) [cs.RO, cs.AI, cs.LG]
Policy Decorator: Model-Agnostic Online Refinement for Large Policy Model
Authors: Xiu Yuan, Tongzhou Mu, Stone Tao, Yunhao Fang, Mengke Zhang, Hao Su
Abstract: Recent advancements in robot learning have used imitation learning with large models and extensive demonstrations to develop effective policies. However, these models are often limited by the quantity, quality, and diversity of demonstrations. This paper explores improving offline-trained imitation learning models through online interactions with the environment. We introduce Policy Decorator, which uses a model-agnostic residual policy to refine large imitation learning models during online interactions. By implementing controlled exploration strategies, Policy Decorator enables stable, sample-efficient online learning. Our evaluation spans eight tasks across two benchmarks, ManiSkill and Adroit, and involves two state-of-the-art imitation learning models (Behavior Transformer and Diffusion Policy). The results show that Policy Decorator effectively improves the offline-trained policies and preserves the smooth motion of imitation learning models, avoiding the erratic behaviors of pure RL policies. See our project page (https://policydecorator.github.io) for videos.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Explore videos, data, code, and more at https://policydecorator.github.io
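A minimal sketch of the residual-refinement idea the abstract describes: the frozen base policy proposes an action and a small, bounded correction is learned online. The bounding constant `alpha` and the clipping are assumptions standing in for the paper's controlled-exploration strategies.

```python
import numpy as np

def decorated_action(obs, base_policy, residual_policy, alpha=0.1):
    """Residual refinement (sketch): the frozen base policy proposes an
    action; a small learned residual, bounded by alpha, nudges it.
    Only the residual would be trained online with RL."""
    a_base = base_policy(obs)
    a_res = np.clip(residual_policy(obs), -1.0, 1.0)   # bounded correction
    return a_base + alpha * a_res

base = lambda o: np.tanh(o)     # stand-in for a frozen BeT / Diffusion Policy
res = lambda o: 0.5 * o         # stand-in for the learned residual
print(decorated_action(np.array([0.2, -0.4]), base, res))
```

Because the correction is bounded, the combined policy can never drift far from the base policy's smooth behavior, which matches the abstract's point about avoiding erratic pure-RL actions.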
arXiv:2412.10450 (https://arxiv.org/abs/2412.10450) [physics.ao-ph, cs.AI, cs.LG]
Regional Weather Variable Predictions by Machine Learning with Near-Surface Observational and Atmospheric Numerical Data
Authors: Yihe Zhang, Bryce Turney, Purushottam Sigdel, Xu Yuan, Eric Rappin, Adrian Lago, Sytske Kimball, Li Chen, Paul Darby, Lu Peng, Sercan Aygun, Yazhou Tu, M. Hassan Najafi, Nian-Feng Tzeng
Abstract: Accurate and timely regional weather prediction is vital for sectors dependent on weather-related decisions. Traditional prediction methods, based on atmospheric equations, often struggle with coarse temporal resolutions and inaccuracies.
This paper presents a novel machine learning (ML) model, called MiMa (short for Micro-Macro), that integrates both near-surface observational data from Kentucky Mesonet stations (collected every five minutes, known as Micro data) and hourly atmospheric numerical outputs (termed Macro data) for fine-resolution weather forecasting. The MiMa model employs an encoder-decoder transformer structure, with two encoders processing the multivariate data from the two sources and a decoder forecasting weather variables over short time horizons. Each instance of the MiMa model, called a modelet, predicts the values of a specific weather parameter at an individual Mesonet station. The approach is extended with Re-MiMa (short for Regional-MiMa) modelets, which are designed to predict weather variables at ungauged locations by training on multivariate data from a few representative stations in a region, tagged with their elevations. Re-MiMa can provide highly accurate predictions across an entire region, even in areas without observational stations. Experimental results show that MiMa significantly outperforms current models, with Re-MiMa offering precise short-term forecasts for ungauged locations, marking a significant advancement in weather forecasting accuracy and applicability.
Submitted 10 February, 2025; v1 submitted 11 December, 2024; originally announced December 2024.
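A shape-level schematic of the described two-encoder/one-decoder layout, assuming invented feature sizes and horizons; it is not the authors' code, only a hedged sketch of how the Micro and Macro streams could feed a joint transformer decoder.

```python
import torch, torch.nn as nn

class MiMaLikeModelet(nn.Module):
    """Schematic modelet (assumed sizes): one encoder per data stream,
    a decoder that forecasts one weather variable a few steps ahead."""
    def __init__(self, d=64, micro_feats=8, macro_feats=20, horizon=6):
        super().__init__()
        self.micro_in = nn.Linear(micro_feats, d)
        self.macro_in = nn.Linear(macro_feats, d)
        make_enc = lambda: nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d, nhead=4, batch_first=True), 2)
        self.micro_enc, self.macro_enc = make_enc(), make_enc()
        self.query = nn.Parameter(torch.zeros(1, horizon, d))  # forecast slots
        self.dec = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d, nhead=4, batch_first=True), 2)
        self.head = nn.Linear(d, 1)

    def forward(self, micro, macro):          # (B, Tm, 8), (B, Th, 20)
        mem = torch.cat([self.micro_enc(self.micro_in(micro)),
                         self.macro_enc(self.macro_in(macro))], dim=1)
        q = self.query.expand(micro.size(0), -1, -1)
        return self.head(self.dec(q, mem)).squeeze(-1)   # (B, horizon)

m = MiMaLikeModelet()
print(m(torch.randn(2, 36, 8), torch.randn(2, 12, 20)).shape)  # (2, 6)
```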
arXiv:2412.10116 (https://arxiv.org/abs/2412.10116) [cs.CV]
HS-FPN: High Frequency and Spatial Perception FPN for Tiny Object Detection
Authors: Zican Shi, Jing Hu, Jie Ren, Hengkang Ye, Xuyang Yuan, Yan Ouyang, Jia He, Bo Ji, Junyu Guo
Abstract: The introduction of the Feature Pyramid Network (FPN) has significantly improved object detection performance. However, substantial challenges remain in detecting tiny objects, as their features occupy only a very small proportion of the feature maps. Although FPN integrates multi-scale features, it does not directly enhance or enrich the features of tiny objects, and it lacks spatial perception ability. To address these issues, we propose a novel High Frequency and Spatial Perception Feature Pyramid Network (HS-FPN) with two innovative modules. First, we design a high frequency perception module (HFP) that generates high frequency responses through high-pass filters. These high frequency responses are used as mask weights from both spatial and channel perspectives to enrich and highlight the features of tiny objects in the original feature maps. Second, we develop a spatial dependency perception module (SDP) to capture the spatial dependencies that FPN lacks. Our experiments demonstrate that detectors based on HS-FPN exhibit competitive advantages over state-of-the-art models on the AI-TOD dataset for tiny object detection.
Submitted 23 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
Comments: 13 pages, 12 figures, 7 tables
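As a hedged reading of the high frequency perception idea, the sketch below builds a high-pass response by subtracting a local average, squashes it into a mask, and re-weights the feature map; the kernel size and exact mask form are assumptions, not the HFP module itself.

```python
import numpy as np
from scipy.ndimage import uniform_filter

def high_freq_mask(feat: np.ndarray, k: int = 3) -> np.ndarray:
    """Toy high-pass re-weighting (assumed form): keep the detail that a
    local average removes, turn it into (0,1) weights, and amplify the
    original map where high-frequency (tiny-object) responses live."""
    low = uniform_filter(feat, size=(1, k, k))     # per-channel box blur
    high = feat - low                              # high-pass response
    mask = 1.0 / (1.0 + np.exp(-high))             # spatial mask weights
    return feat * (1.0 + mask)

fmap = np.random.randn(256, 64, 64)                # (C, H, W) pyramid level
print(high_freq_mask(fmap).shape)
```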
arXiv:2412.06138 (https://arxiv.org/abs/2412.06138) [cs.CV]
SGIA: Enhancing Fine-Grained Visual Classification with Sequence Generative Image Augmentation
Authors: Qiyu Liao, Xin Yuan, Min Xu, Dadong Wang
Abstract: In Fine-Grained Visual Classification (FGVC), distinguishing highly similar subcategories remains a formidable challenge, often necessitating datasets with extensive variability. The acquisition and annotation of such FGVC datasets are notably difficult and costly, demanding specialized knowledge to identify subtle distinctions among closely related categories. Our study introduces a novel approach employing the Sequence Latent Diffusion Model (SLDM) for augmenting FGVC datasets, called Sequence Generative Image Augmentation (SGIA). Our method features a unique Bridging Transfer Learning (BTL) process, designed to minimize the domain gap between real and synthetically augmented data.
This approach notably surpasses existing methods in generating realistic image samples, providing a diverse range of pose transformations that extend beyond the rigid transformations and style changes of traditional generative augmentation. We demonstrate the effectiveness of our augmented dataset with substantial improvements in FGVC tasks across various datasets, models, and training strategies, especially in few-shot learning scenarios. Our method outperforms conventional image augmentation techniques in benchmark tests on three FGVC datasets, showcasing superior realism, variability, and representational quality. Our work sets a new benchmark, outperforming the previous state-of-the-art models in classification accuracy by 0.5% on the CUB-200-2011 dataset, and advances the application of generative models in FGVC data augmentation.
Submitted 8 December, 2024; originally announced December 2024.
Comments: 13 pages, 5 figures

arXiv:2412.05908 (https://arxiv.org/abs/2412.05908) [cs.CV]
GBR: Generative Bundle Refinement for High-fidelity Gaussian Splatting and Meshing
Authors: Jianing Zhang, Yuchao Zheng, Ziwei Li, Qionghai Dai, Xiaoyun Yuan
Abstract: Gaussian splatting has gained attention for its efficient representation and rendering of 3D scenes using continuous Gaussian primitives. However, it struggles with sparse-view inputs due to limited geometric and photometric information, causing ambiguities in depth, shape, and texture.
We propose GBR: Generative Bundle Refinement, a method for high-fidelity Gaussian splatting and meshing using only 4-6 input views. GBR integrates a neural bundle adjustment module to enhance geometry accuracy and a generative depth refinement module to improve geometry fidelity. More specifically, the neural bundle adjustment module incorporates a foundation network to produce initial 3D point maps and point matches from unposed images, followed by bundle adjustment optimization to improve multiview consistency and point cloud accuracy. The generative depth refinement module employs a diffusion-based strategy to enhance geometric details and fidelity while preserving the scale. Finally, for Gaussian splatting optimization, we propose a multimodal loss function incorporating depth and normal consistency, geometric regularization, and pseudo-view supervision, providing robust guidance under sparse-view conditions. Experiments on widely used datasets show that GBR significantly outperforms existing methods under sparse-view inputs. Additionally, GBR demonstrates the ability to reconstruct and render large-scale real-world scenes, such as the Pavilion of Prince Teng and the Great Wall, with remarkable detail using only 6 views.
Submitted 8 December, 2024; originally announced December 2024.
arXiv:2412.04719 (https://arxiv.org/abs/2412.04719) [cs.CV]
Mix-Modality Person Re-Identification: A New and Practical Paradigm
Authors: Wei Liu, Xin Xu, Hua Chang, Xin Yuan, Zheng Wang
Abstract: Current visible-infrared cross-modality person re-identification research has so far focused only on the bi-modality mutual retrieval paradigm; we propose a new and more practical mix-modality retrieval paradigm. Existing visible-infrared person re-identification (VI-ReID) methods have achieved some results in the bi-modality mutual retrieval paradigm by learning the correspondence between visible and infrared modalities. However, significant performance degradation occurs due to the modality confusion problem when these methods are applied to the new mix-modality paradigm. Therefore, this paper proposes a Mix-Modality person re-identification (MM-ReID) task, explores the influence of the modality mixing ratio on performance, and constructs mix-modality test sets for existing datasets according to the new mix-modality testing paradigm. To solve the modality confusion problem in MM-ReID, we propose a Cross-Identity Discrimination Harmonization Loss (CIDHL) that adjusts the distribution of samples in the hyperspherical feature space, pulling the centers of samples with the same identity closer and pushing apart the centers of samples with different identities, while aggregating samples with the same modality and the same identity. Furthermore, we propose a Modality Bridge Similarity Optimization Strategy (MBSOS) to optimize the cross-modality similarity between the query and queried samples with the help of a similar bridge sample in the gallery.
Extensive experiments demonstrate that, compared to the original performance of existing cross-modality methods on MM-ReID, adding our CIDHL and MBSOS yields a general improvement.
Submitted 5 December, 2024; originally announced December 2024.
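One plausible form of a center-pull/center-push objective on the unit hypersphere, in the spirit of the CIDHL description above (the paper's exact loss may differ); the margin and equal weighting of the two terms are assumptions.

```python
import numpy as np

def cidhl_like_loss(feats, ids, margin=0.3):
    """Hedged sketch: pull samples toward their own identity center on the
    hypersphere; push different-identity centers apart beyond a margin."""
    f = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    uids = np.unique(ids)
    idx = {u: i for i, u in enumerate(uids)}
    centers = np.stack([f[ids == u].mean(0) for u in uids])
    centers /= np.linalg.norm(centers, axis=1, keepdims=True)
    pull = 1.0 - np.mean([f[i] @ centers[idx[ids[i]]] for i in range(len(f))])
    sim = centers @ centers.T                       # center-to-center cosine
    off = sim[~np.eye(len(uids), dtype=bool)]
    push = np.maximum(off - margin, 0.0).mean()     # separate identities
    return pull + push

x = np.random.randn(32, 128)
y = np.random.randint(0, 4, size=32)
print(cidhl_like_loss(x, y))
```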
arXiv:2412.03701 (https://arxiv.org/abs/2412.03701) [cs.LG]
Interpretable Hierarchical Attention Network for Medical Condition Identification
Authors: Dongping Fang, Lian Duan, Xiaojing Yuan, Allyn Klunder, Kevin Tan, Suiting Cao, Yeqing Ji, Mike Xu
Abstract: Accurate prediction of medical conditions directly from past clinical evidence is a long-sought goal in medical management and health insurance. Although great progress has been made with machine learning algorithms, the medical community remains skeptical about model accuracy and interpretability. This paper presents an innovative hierarchical attention deep learning model, the Interpretable Hierarchical Attention Network (IHAN), that achieves better prediction and clear interpretability that can be easily understood by medical professionals. IHAN uses a hierarchical attention structure that matches the natural structure of medical history data and reflects each patient's sequence of encounters (dates of service). The attention structure consists of three levels: (1) attention on the medical code types (diagnosis codes, procedure codes, lab test results, and prescription drugs), (2) attention on the sequential medical encounters within a type, and (3) attention on the individual medical codes within an encounter and type. The model is applied to predict the occurrence of stage 3 chronic kidney disease (CKD), using three years of medical history of Medicare Advantage (MA) members from an American nationwide health insurance company. The model takes members' medical events, from both claims and Electronic Medical Records (EMR) data, as input, predicts stage 3 CKD, and calculates the contribution of individual events to the predicted outcome.
Submitted 4 December, 2024; originally announced December 2024.
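A compact numpy sketch of three-level attention pooling (codes within an encounter, encounters within a type, then types), where the attention weights at each level double as the interpretability signal; the embeddings and attention vectors below are random stand-ins, not the trained IHAN model.

```python
import numpy as np

def attn_pool(x, w):
    """Softmax attention pooling: x (n, d) -> pooled (d,) plus weights."""
    s = x @ w
    a = np.exp(s - s.max()); a /= a.sum()
    return a @ x, a

rng = np.random.default_rng(1)
d = 16
w_code, w_enc, w_type = rng.normal(size=(3, d))   # one scorer per level

# patient: {code type: list of encounters, each (num_codes, d) embeddings}
patient = {t: [rng.normal(size=(rng.integers(2, 6), d)) for _ in range(4)]
           for t in ["diagnosis", "procedure", "lab", "drug"]}

type_vecs = []
for t, encounters in patient.items():
    enc_vecs = np.stack([attn_pool(codes, w_code)[0] for codes in encounters])
    type_vecs.append(attn_pool(enc_vecs, w_enc)[0])      # attend over encounters
patient_vec, type_attn = attn_pool(np.stack(type_vecs), w_type)
print(patient_vec.shape, type_attn.round(2))  # weights = per-type contribution
```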
arXiv:2412.03611 (https://arxiv.org/abs/2412.03611) [cs.LG, cs.DB]
Learning-based Sketches for Frequency Estimation in Data Streams without Ground Truth
Authors: Xinyu Yuan, Yan Qiao, Meng Li, Zhenchun Wei, Cuiying Feng
Abstract: Estimating the frequency of items in a high-volume, fast data stream has been extensively studied in areas such as databases and network measurement. Traditional sketch algorithms give only very rough estimates under limited memory. Some learning-augmented algorithms have been proposed recently, but their offline frameworks require actual frequencies for training, which are challenging to obtain in general, and their speed is too slow for real-time processing, despite still coarse-grained accuracy. To this end, we propose a more practical learning-based estimation framework, UCL-sketch, following the line of equation-based sketches to estimate per-key frequencies. In a nutshell, there are two key techniques: online training via equivalent learning without ground truth, and a highly scalable architecture with logical estimation buckets. We conducted experiments on both real-world and synthetic datasets. The results demonstrate that our method greatly outperforms existing state-of-the-art sketches in per-key accuracy and distribution, while preserving resource efficiency. Our code is attached in the supplementary material, and will be made publicly available at https://github.com/Y-debug-sys/UCL-sketch.
Submitted 18 December, 2024; v1 submitted 4 December, 2024; originally announced December 2024.
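For context on the sketch family this line of work extends, here is the classic count-min sketch, which produces exactly the "rough estimates under limited memory" the abstract refers to; UCL-sketch's equation-based estimation and learned components are not reproduced here.

```python
import random

class CountMinSketch:
    """Classic baseline: each key hashes to one counter per row; the
    estimate is the row-wise minimum (an overestimate, never under)."""
    def __init__(self, rows=4, cols=1024, seed=0):
        rnd = random.Random(seed)
        self.salts = [rnd.getrandbits(32) for _ in range(rows)]
        self.table = [[0] * cols for _ in range(rows)]
        self.cols = cols

    def _idx(self, key):
        return [hash((salt, key)) % self.cols for salt in self.salts]

    def add(self, key, count=1):
        for row, j in zip(self.table, self._idx(key)):
            row[j] += count

    def query(self, key):
        return min(row[j] for row, j in zip(self.table, self._idx(key)))

cms = CountMinSketch()
for _ in range(1000):
    cms.add("flow-A")
cms.add("flow-B", 7)
print(cms.query("flow-A"), cms.query("flow-B"))   # >= true counts
```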
arXiv:2412.03106 (https://arxiv.org/abs/2412.03106) [cs.IT]
Improved Turbo Message Passing for Compressive Robust Principal Component Analysis: Algorithm Design and Asymptotic Analysis
Authors: Zhuohang He, Junjie Ma, Xiaojun Yuan
Abstract: Compressive Robust Principal Component Analysis (CRPCA) naturally arises in various applications as a means to recover a low-rank matrix $\boldsymbol{L}$ and a sparse matrix $\boldsymbol{S}$ from compressive measurements. In this paper, we approach the problem from a Bayesian inference perspective. We establish a probabilistic model for the problem and develop an improved turbo message passing (ITMP) algorithm based on the sum-product rule and appropriate approximations. Additionally, we establish a state evolution framework to characterize the asymptotic behavior of the ITMP algorithm in the large-system limit. By analyzing the established state evolution, we further derive sufficient conditions for the global convergence of our algorithm. Our numerical results validate the theoretical analysis, demonstrating that the proposed asymptotic framework accurately characterizes the dynamical behavior of the ITMP algorithm, and that the phase transition curve specified by the sufficient condition agrees well with numerical simulations.
Submitted 4 December, 2024; originally announced December 2024.
arXiv:2412.00084 (https://arxiv.org/abs/2412.00084) [cs.LG, cs.RO]
Unpacking the Individual Components of Diffusion Policy
Authors: Xiu Yuan
Abstract: Imitation learning presents a promising approach for learning generalizable and complex robotic skills. The recently proposed Diffusion Policy generates robot action sequences through a conditional denoising diffusion process, achieving state-of-the-art performance compared to other imitation learning methods. This paper summarizes five key components of Diffusion Policy: 1) observation sequence input; 2) action sequence execution; 3) receding horizon; 4) U-Net or Transformer network architecture; and 5) FiLM conditioning. By conducting experiments across the ManiSkill and Adroit benchmarks, this study aims to elucidate the contribution of each component to the success of Diffusion Policy in various scenarios. We hope our findings will provide valuable insights for the application of Diffusion Policy in future research and industry.
Submitted 26 November, 2024; originally announced December 2024.
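Component 3 (receding horizon) is easy to make concrete: predict a sequence of actions, execute only a prefix, then re-predict from the new observation. The toy policy and dynamics below are invented stand-ins for the diffusion head and the simulator.

```python
import numpy as np

def receding_horizon_rollout(policy, obs, steps=12, horizon=8, execute=4):
    """Receding-horizon execution (sketch): plan `horizon` actions, run only
    the first `execute` of them, then replan from the new observation."""
    executed = []
    while len(executed) < steps:
        plan = policy(obs)                    # (horizon, action_dim)
        for a in plan[:execute]:
            executed.append(a)
            obs = obs + 0.1 * a               # toy dynamics stand-in
    return np.array(executed[:steps])

toy_policy = lambda o: np.tile(-o, (8, 1))    # stand-in for the diffusion head
print(receding_horizon_rollout(toy_policy, np.array([1.0, -2.0])).shape)
```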
arXiv:2411.19493 (https://arxiv.org/abs/2411.19493) [cs.NI, cs.LG]
Diffusion Models Meet Network Management: Improving Traffic Matrix Analysis with Diffusion-based Approach
Authors: Xinyu Yuan, Yan Qiao, Zhenchun Wei, Zeyu Zhang, Minyue Li, Pei Zhao, Rongyao Hu, Wenjing Li
Abstract: Because network operation and maintenance rely heavily on network traffic monitoring, traffic matrix analysis has been one of the most crucial issues for network management tasks. However, it is challenging to obtain reliably precise measurements in computer networks because of the high measurement cost and unavoidable transmission loss. Although some methods proposed in recent years allow estimating network traffic from partial flow-level or link-level measurements, they often perform poorly for traffic matrix estimation today. Despite strong assumptions such as low-rank structure and a known prior distribution, existing techniques are usually task-specific and tend to degrade significantly as modern network communication grows extremely complicated and dynamic. To address this dilemma, this paper proposes a diffusion-based traffic matrix analysis framework named Diffusion-TM, which leverages problem-agnostic diffusion to notably elevate estimation performance in both traffic distribution and accuracy.
The novel framework not only takes advantage of the powerful generative ability of diffusion models to produce realistic network traffic, but also leverages the denoising process to unbiasedly estimate all end-to-end traffic in a plug-and-play manner under theoretical guarantees. Moreover, since compiling an intact traffic dataset is usually infeasible, we also propose a two-stage training scheme to make our framework insensitive to missing values in the dataset. Extensive experiments with real-world datasets illustrate the effectiveness of Diffusion-TM on several tasks. The results also demonstrate that our method obtains promising results even with only 5% of values known in the datasets.
Submitted 29 November, 2024; originally announced November 2024.
arXiv:2411.18023 (https://arxiv.org/abs/2411.18023) [cs.CR]
Leveraging A New GAN-based Transformer with ECDH Crypto-system for Enhancing Energy Theft Detection in Smart Grid
Authors: Yang Yang, Xun Yuan, Arwa Alromih, Aryan Mohammadi Pasikhani, Prosanta Gope, Biplab Sikdar
Abstract: Detecting energy theft is vital for effectively managing power grids, as it ensures precise billing and prevents financial losses. Split learning emerges as a promising decentralized machine learning technique for identifying energy theft while preserving user data confidentiality. Nevertheless, traditional split learning approaches are vulnerable to privacy leakage attacks, which significantly threaten data confidentiality. To address this challenge, we propose a novel GAN-Transformer-based split learning framework. This framework leverages the strengths of the transformer architecture, known for its capability to process long-range dependencies in energy consumption data, and thus enhances the accuracy of energy theft detection without compromising user privacy. A distinctive feature of our approach is the deployment of a novel mask-based method, a first in its field, to effectively combat privacy leakage in split learning scenarios targeting AI-enabled adversaries. This method protects sensitive information during the model's training phase. Our experimental evaluations indicate that the proposed framework not only achieves accuracy levels comparable to conventional methods but also significantly enhances privacy protection. The results underscore the potential of the GAN-Transformer split learning framework as an effective and secure tool in the domain of energy theft detection.
Submitted 26 November, 2024; originally announced November 2024.
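The split-learning layout itself is simple to sketch (the GAN-Transformer architecture and the paper's mask-based defense are omitted): the client encodes raw meter readings locally and only intermediate activations cross the network, while gradients flow back through the cut. Layer sizes and the half-hourly reading format are assumptions.

```python
import torch, torch.nn as nn

# Split learning (sketch): raw data never leaves the client side.
client = nn.Sequential(nn.Linear(48, 64), nn.ReLU())        # stays on-premise
server = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 2))

readings = torch.randn(16, 48)           # one day of half-hourly usage (assumed)
smashed = client(readings)               # only these activations are transmitted
logits = server(smashed)                 # theft / no-theft scores
loss = nn.functional.cross_entropy(logits, torch.randint(0, 2, (16,)))
loss.backward()                          # gradients flow back through the cut
print(logits.shape, client[0].weight.grad.shape)
```

The privacy leakage the abstract targets arises because the transmitted activations can still reveal information about the raw readings, which is what the paper's mask-based method is designed to suppress.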
arXiv:2411.17237 (https://arxiv.org/abs/2411.17237) [cs.CV]
Grounding-IQA: Multimodal Language Grounding Model for Image Quality Assessment
Authors: Zheng Chen, Xun Zhang, Wenbo Li, Renjing Pei, Fenglong Song, Xiongkuo Min, Xiaohong Liu, Xin Yuan, Yong Guo, Yulun Zhang
Abstract: The development of multimodal large language models (MLLMs) enables the evaluation of image quality through natural language descriptions, allowing for more detailed assessments. However, these MLLM-based IQA methods primarily rely on general contextual descriptions, which can limit fine-grained quality assessment. To address this limitation, we introduce a new image quality assessment (IQA) task paradigm, grounding-IQA. This paradigm integrates multimodal referring and grounding with IQA to realize more fine-grained quality perception. Specifically, grounding-IQA comprises two subtasks: grounding-IQA-description (GIQA-DES) and visual question answering (GIQA-VQA). GIQA-DES involves detailed descriptions with precise locations (e.g., bounding boxes), while GIQA-VQA focuses on quality QA for local regions. To realize grounding-IQA, we construct a corresponding dataset, GIQA-160K, through our proposed automated annotation pipeline. Furthermore, we develop a well-designed benchmark, GIQA-Bench, which comprehensively evaluates grounding-IQA performance from three perspectives: description quality, VQA accuracy, and grounding precision. Experiments demonstrate that our proposed task paradigm, dataset, and benchmark facilitate more fine-grained IQA applications. Code: https://github.com/zhengchen1999/Grounding-IQA.
Submitted 26 November, 2024; originally announced November 2024.
Comments: Code is available at: https://github.com/zhengchen1999/Grounding-IQA
arXiv:2411.14001 (https://arxiv.org/abs/2411.14001) [cs.CV]
Graph Domain Adaptation with Dual-branch Encoder and Two-level Alignment for Whole Slide Image-based Survival Prediction
Authors: Yuntao Shou, Peiqiang Yan, Xingjian Yuan, Xiangyong Cao, Qian Zhao, Deyu Meng
Abstract: In recent years, histopathological whole slide image (WSI)-based survival analysis has attracted much attention in medical image analysis. In practice, WSIs usually come from different hospitals or laboratories, which can be seen as different domains, and thus may differ significantly in imaging equipment, processing procedures, and sample sources. These differences generally result in large distribution gaps between WSI domains, so survival analysis models trained on one domain may fail to transfer to another. To address this issue, we propose a Dual-branch Encoder and Two-level Alignment (DETA) framework to explore both feature-level and category-level alignment between different WSI domains. Specifically, we first formulate the problem as graph domain adaptation (GDA) by virtue of the graph representation of WSIs. We then construct a dual-branch graph encoder, including a message passing branch and a shortest-path branch, to explicitly and implicitly extract semantic information from the graph-represented WSIs.
To realize GDA, we propose a two-level alignment approach: at the category level, we develop a coupling technique enabled by the dual-branch structure that reduces the divergence between the category distributions of the two domains; at the feature level, we introduce an adversarial perturbation strategy to better augment the source-domain features, improving alignment in feature distribution. To the best of our knowledge, our work is the first attempt to alleviate the domain shift issue for WSI data analysis. Extensive experiments on four TCGA datasets have validated the effectiveness of our proposed DETA framework and demonstrated its superior performance in WSI-based survival analysis.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 12 pages, 6 figures

arXiv:2411.13162 [pdf, other] cs.GT
IC Mechanisms for Risk-Averse Advertisers in the Online Advertising System
Authors: Bingzhe Wang, Ruohan Qian, Yuejia Dou, Qi Qi, Bo Shen, Changyuan Li, Yixuan Zhang, Yixin Su, Xin Yuan, Wenqiang Liu, Bin Zou, Wen Yi, Zhi Guo, Shuanglong Li, Liu Lin
Abstract: The autobidding system generates huge revenue for advertising platforms, garnering substantial research attention. Existing studies of autobidding systems focus on designing Autobidding Incentive Compatible (AIC) mechanisms, where the mechanism is Incentive Compatible (IC) under ex ante expectations. However, upon deploying AIC mechanisms in advertising platforms, we observe a notable deviation between actual auction outcomes and these expectations during runtime, particularly in settings with few clicks (sparse-click scenarios). This discrepancy undermines truthful bidding among advertisers in AIC mechanisms, especially for risk-averse advertisers who are averse to outcomes that do not align with expectations. To address this issue, we propose a mechanism, the Decoupled First-Price Auction (DFP), that retains its IC property even during runtime. DFP dynamically adjusts the payment based on real-time user conversion outcomes, ensuring that advertisers' realized utilities closely approximate their expected utilities during runtime. To realize the payment mechanism of DFP, we propose a PPO-based RL algorithm with a meticulously crafted reward function, which dynamically adjusts the payment to fit the DFP mechanism. We conduct extensive experiments leveraging real-world data to validate our findings.
Submitted 20 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10606">arXiv:2411.10606</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10606">pdf</a>, <a href="https://arxiv.org/format/2411.10606">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AmoebaLLM: Constructing Any-Shape Large Language Models for Efficient and Instant Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yonggan Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhongzhi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+J">Jiayi Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiangchi Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+D">Dachuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yakunin%2C+R">Roman Yakunin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y+C">Yingyan Celine Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10606v1-abstract-short" style="display: inline;"> Motivated by the transformative capabilities of large language models (LLMs) across various natural language tasks, there has been a growing demand to deploy these models effectively across diverse real-world applications and platforms. However, the challenge of efficiently deploying LLMs has become increasingly pronounced due to the varying application-specific performance requirements and the ra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10606v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10606v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10606v1-abstract-full" style="display: none;"> Motivated by the transformative capabilities of large language models (LLMs) across various natural language tasks, there has been a growing demand to deploy these models effectively across diverse real-world applications and platforms. However, the challenge of efficiently deploying LLMs has become increasingly pronounced due to the varying application-specific performance requirements and the rapid evolution of computational platforms, which feature diverse resource constraints and deployment flows. These varying requirements necessitate LLMs that can adapt their structures (depth and width) for optimal efficiency across different platforms and application specifications. To address this critical gap, we propose AmoebaLLM, a novel framework designed to enable the instant derivation of LLM subnets of arbitrary shapes, which achieve the accuracy-efficiency frontier and can be extracted immediately after a one-time fine-tuning. In this way, AmoebaLLM significantly facilitates rapid deployment tailored to various platforms and applications. 

arXiv:2411.09968 [pdf, other] cs.CV cs.AI
Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate Hallucination in LVLMs
Authors: Xiaofeng Zhang, Yihao Quan, Chaochen Gu, Chen Shen, Xiaosong Yuan, Shaotian Yan, Hao Cheng, Kaijie Wu, Jieping Ye
Abstract: The hallucination problem in multimodal large language models (MLLMs) remains a common issue. Although image tokens occupy a majority of the input sequence of MLLMs, there is limited research exploring the relationship between image tokens and hallucinations. In this paper, we analyze the distribution of attention scores for image tokens across each layer and head of the model, revealing an intriguing and common phenomenon: most hallucinations are closely linked to the pattern of attention sinks in the self-attention matrix of image tokens, where shallow layers exhibit dense attention sinks and deeper layers show sparse attention sinks. We further analyze the attention heads of different layers and find that heads with high-density attention sinks in the image part play a positive role in alleviating hallucinations. We therefore propose a training-free method named Enhancing Attention Heads (EAH), an approach designed to enhance the convergence of image-token attention sinks in the shallow layers. EAH identifies the attention head that shows the vision sink in a shallow layer and extracts its attention matrix. This attention map is then broadcast to the other heads in the layer, thereby strengthening the layer's attention to the image itself. In extensive experiments, EAH shows significant hallucination-mitigating performance on different MLLMs and metrics, proving its effectiveness and generality.
Submitted 15 November, 2024; originally announced November 2024.
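
The broadcast step itself is mechanical. Here is a schematic NumPy version with invented shapes and a deliberately naive sink score (mean attention mass drawn by image-token columns); the paper's actual head-selection criterion may be more involved.

    import numpy as np

    def eah_broadcast(attn, img_slice):
        # attn: (heads, seq, seq) attention maps of one shallow layer.
        # Score each head by the attention mass its rows place on the
        # image-token columns, then broadcast the densest head's map.
        col_mass = attn[:, :, img_slice].sum(axis=1)   # (heads, n_img)
        sink_score = col_mass.mean(axis=1)
        h_star = int(np.argmax(sink_score))
        return np.broadcast_to(attn[h_star], attn.shape).copy(), h_star

    rng = np.random.default_rng(0)
    attn = rng.dirichlet(np.ones(16), size=(8, 16))    # 8 heads, 16 tokens
    new_attn, h = eah_broadcast(attn, slice(0, 10))    # tokens 0..9 = image
    print("vision-sink head:", h, "new shape:", new_attn.shape)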
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06733">arXiv:2411.06733</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06733">pdf</a>, <a href="https://arxiv.org/format/2411.06733">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GSL-PCD: Improving Generalist-Specialist Learning with Point Cloud Feature-based Task Partitioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiu Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06733v1-abstract-short" style="display: inline;"> Generalization in Deep Reinforcement Learning (DRL) across unseen environment variations often requires training over a diverse set of scenarios. Many existing DRL algorithms struggle with efficiency when handling numerous variations. The Generalist-Specialist Learning (GSL) framework addresses this by first training a generalist model on all variations, then creating specialists from the generali&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06733v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06733v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06733v1-abstract-full" style="display: none;"> Generalization in Deep Reinforcement Learning (DRL) across unseen environment variations often requires training over a diverse set of scenarios. Many existing DRL algorithms struggle with efficiency when handling numerous variations. The Generalist-Specialist Learning (GSL) framework addresses this by first training a generalist model on all variations, then creating specialists from the generalist&#39;s weights, each focusing on a subset of variations. The generalist then refines its learning with assistance from the specialists. However, random task partitioning in GSL can impede performance by assigning vastly different variations to the same specialist, often resulting in each specialist focusing on only one variation, which raises computational costs. To improve this, we propose Generalist-Specialist Learning with Point Cloud Feature-based Task Partitioning (GSL-PCD). Our approach clusters environment variations based on features extracted from object point clouds and uses balanced clustering with a greedy algorithm to assign similar variations to the same specialist. Evaluations on robotic manipulation tasks from the ManiSkill benchmark demonstrate that point cloud feature-based partitioning outperforms vanilla partitioning by 9.4%, with a fixed number of specialists, and reduces computational and sample requirements by 50% to achieve comparable performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06733v1-abstract-full').style.display = 'none'; document.getElementById('2411.06733v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04669">arXiv:2411.04669</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04669">pdf</a>, <a href="https://arxiv.org/format/2411.04669">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EffiCANet: Efficient Time Series Forecasting with Convolutional Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xinxing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiaqi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shubao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+M">Ming Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chengyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Yanlong Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiaojie Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04669v1-abstract-short" style="display: inline;"> The exponential growth of multivariate time series data from sensor networks in domains like industrial monitoring and smart cities requires efficient and accurate forecasting models. Current deep learning methods often fail to adequately capture long-range dependencies and complex inter-variable relationships, especially under real-time processing constraints. These limitations arise as many mode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04669v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04669v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04669v1-abstract-full" style="display: none;"> The exponential growth of multivariate time series data from sensor networks in domains like industrial monitoring and smart cities requires efficient and accurate forecasting models. Current deep learning methods often fail to adequately capture long-range dependencies and complex inter-variable relationships, especially under real-time processing constraints. These limitations arise as many models are optimized for either short-term forecasting with limited receptive fields or long-term accuracy at the cost of efficiency. Additionally, dynamic and intricate interactions between variables in real-world data further complicate modeling efforts. To address these limitations, we propose EffiCANet, an Efficient Convolutional Attention Network designed to enhance forecasting accuracy while maintaining computational efficiency. 

arXiv:2411.03255 [pdf, other] quant-ph cs.DS
Error Interference in Quantum Simulation
Authors: Boyang Chen, Jue Xu, Qi Zhao, Xiao Yuan
Abstract: Understanding algorithmic error accumulation in quantum simulation is crucial due to its fundamental significance and practical applications in simulating quantum many-body system dynamics. Conventional theories typically apply the triangle inequality to provide an upper bound for the error. However, these often yield overly conservative and inaccurate estimates, as they neglect error interference -- a phenomenon where errors in different segments can destructively interfere. Here, we introduce a novel method that directly estimates the long-time algorithmic errors with multiple segments, thereby establishing a comprehensive framework for characterizing algorithmic error interference. We identify the sufficient and necessary condition for strict error interference and introduce the concept of approximate error interference, which is more broadly applicable to scenarios such as power-law interaction models, the Fermi-Hubbard model, and higher-order Trotter formulas. Our work demonstrates significant improvements over prior results and opens new avenues for error analysis in quantum simulation, offering potential advancements in both theoretical algorithm design and experimental implementation of Hamiltonian simulation.
Submitted 8 November, 2024; v1 submitted 5 November, 2024; originally announced November 2024.
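
The gap the paper targets is easy to witness numerically. A small sketch with a toy two-qubit Hamiltonian of my choosing and first-order Trotterization: the triangle inequality bounds the total error by r times the single-segment error, and the actually accumulated error sits at or below that sum; characterizing when and by how much is the paper's subject.

    import numpy as np
    from scipy.linalg import expm

    X = np.array([[0, 1], [1, 0]], dtype=complex)
    Z = np.array([[1, 0], [0, -1]], dtype=complex)
    I2 = np.eye(2, dtype=complex)

    A = np.kron(X, I2) + np.kron(I2, X)   # toy non-commuting split H = A + B
    B = np.kron(Z, Z)
    H = A + B

    t, r = 2.0, 100
    dt = t / r
    step = expm(-1j * A * dt) @ expm(-1j * B * dt)  # first-order Trotter step

    err_actual = np.linalg.norm(expm(-1j * H * t)
                                - np.linalg.matrix_power(step, r), 2)
    err_sum = r * np.linalg.norm(expm(-1j * H * dt) - step, 2)
    print(f"accumulated error:       {err_actual:.3e}")
    print(f"triangle-inequality sum: {err_sum:.3e}")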

arXiv:2410.23325 [pdf] eess.AS cs.AI cs.MM cs.SD
Transfer Learning in Vocal Education: Technical Evaluation of Limited Samples Describing Mezzo-soprano
Authors: Zhenyi Hou, Xu Zhao, Kejie Ye, Xinyu Sheng, Shanggerile Jiang, Jiajing Xia, Yitao Zhang, Chenxi Ban, Daijun Luo, Jiaxing Chen, Yan Zou, Yuchao Feng, Guangyu Fan, Xin Yuan
Abstract: Vocal education in the music field is difficult to quantify due to individual differences in singers' voices and the differing quantitative criteria of singing techniques. Deep learning has great potential for application in music education owing to its efficiency in handling complex data and performing quantitative analysis. However, accurate evaluation with limited samples of rare vocal types, such as Mezzo-soprano, requires extensive, well-annotated data to support deep learning models. To attain this objective, we perform transfer learning, employing deep learning models pre-trained on the ImageNet and Urbansound8k datasets to improve the precision of vocal technique evaluation. Furthermore, we tackle the lack of samples by constructing a dedicated dataset, the Mezzo-soprano Vocal Set (MVS), for vocal technique assessment. Our experimental results indicate that transfer learning increases the overall accuracy (OAcc) of all models by an average of 8.3%, with the highest accuracy at 94.2%. We not only provide a novel approach to evaluating Mezzo-soprano vocal techniques but also introduce a new quantitative assessment method for music education.
Submitted 30 October, 2024; originally announced October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19079">arXiv:2410.19079</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19079">pdf</a>, <a href="https://arxiv.org/format/2410.19079">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BIFR脰ST: 3D-Aware Image compositing with Language Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lingxiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+K">Kaixiong Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weihong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+X">Xili Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiaojun Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19079v2-abstract-short" style="display: inline;"> This paper introduces Bifr枚st, a novel 3D-aware framework that is built upon diffusion models to perform instruction-based image composition. Previous methods concentrate on image compositing at the 2D level, which fall short in handling complex spatial relationships ($\textit{e.g.}$, occlusion). Bifr枚st addresses these issues by training MLLM as a 2.5D location predictor and integrating depth map&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19079v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19079v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19079v2-abstract-full" style="display: none;"> This paper introduces Bifr枚st, a novel 3D-aware framework that is built upon diffusion models to perform instruction-based image composition. Previous methods concentrate on image compositing at the 2D level, which fall short in handling complex spatial relationships ($\textit{e.g.}$, occlusion). Bifr枚st addresses these issues by training MLLM as a 2.5D location predictor and integrating depth maps as an extra condition during the generation process to bridge the gap between 2D and 3D, which enhances spatial comprehension and supports sophisticated spatial interactions. Our method begins by fine-tuning MLLM with a custom counterfactual dataset to predict 2.5D object locations in complex backgrounds from language instructions. Then, the image-compositing model is uniquely designed to process multiple types of input features, enabling it to perform high-fidelity image compositions that consider occlusion, depth blur, and image harmonization. Extensive qualitative and quantitative evaluations demonstrate that Bifr枚st significantly outperforms existing methods, providing a robust solution for generating realistically composited images in scenarios demanding intricate spatial understanding. 

arXiv:2410.18551 [pdf, other] cs.LG cs.AI
IMAN: An Adaptive Network for Robust NPC Mortality Prediction with Missing Modalities
Authors: Yejing Huo, Guoheng Huang, Lianglun Cheng, Jianbin He, Xuhang Chen, Xiaochen Yuan, Guo Zhong, Chi-Man Pun
Abstract: Accurate prediction of mortality in nasopharyngeal carcinoma (NPC), a complex malignancy particularly challenging in advanced stages, is crucial for optimizing treatment strategies and improving patient outcomes. However, this predictive process is often compromised by the high-dimensional and heterogeneous nature of NPC-related data, coupled with the pervasive issue of incomplete multi-modal data, manifesting as missing radiological images or incomplete diagnostic reports. Traditional machine learning approaches suffer significant performance degradation when faced with such incomplete data, as they fail to effectively handle the high dimensionality and intricate correlations across modalities. Even advanced multi-modal learning techniques like Transformers struggle to maintain robust performance in the presence of missing modalities, as they lack specialized mechanisms to adaptively integrate and align the diverse data types while also capturing nuanced patterns and contextual relationships within the complex NPC data. To address these problems, we introduce IMAN: an adaptive network for robust NPC mortality prediction with missing modalities.
Submitted 24 October, 2024; originally announced October 2024.
Comments: The paper has been accepted by BIBM 2024
