
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 878 results for author: <span class="mathjax">Xiao, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Xiao%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xiao, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xiao%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xiao, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xiao%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14327">arXiv:2502.14327</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14327">pdf</a>, <a href="https://arxiv.org/format/2502.14327">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> ChemHTS: Hierarchical Tool Stacking for Enhancing Chemical Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhucong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jin Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhijian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qianyu He</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+F">Fenglei Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jiaqing Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Y">Yuan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14327v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable potential in scientific research, particularly in chemistry-related tasks such as molecular design, reaction prediction, and property estimation. 
   While tool-augmented LLMs have been introduced to enhance reasoning and computation in these domains, existing approaches suffer from tool invocation errors and lack effective collaboration among diverse tools, limiting their overall performance. To address these challenges, we propose ChemHTS (Chemical Hierarchical Tool Stacking), a novel method that optimizes tool invocation pathways through a hierarchical stacking strategy. ChemHTS consists of two key stages: tool self-stacking warmup and multi-layer decision optimization, enabling LLMs to refine tool usage dynamically. We evaluate ChemHTS across four classical chemistry tasks and demonstrate its superiority over strong baselines, including GPT-4o, DeepSeek-R1, and chemistry-specific models such as ChemDFM. Furthermore, we define four distinct tool-stacking behaviors to enhance interpretability, providing insights into the effectiveness of tool collaboration. Our dataset and code are publicly available at https://github.com/Chang-pw/ChemHTS.
   Submitted 20 February, 2025; originally announced February 2025.
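
The hierarchical tool-stacking idea in the abstract above can be pictured as tools arranged in layers, with each layer refining the output of the previous one. The snippet below is only a rough Python sketch of that general pattern under simplifying assumptions (the toy tools, layer layout, and scoring function are hypothetical); it is not the ChemHTS algorithm or its released code.

from typing import Callable, List

Tool = Callable[[str], str]

def run_tool_stack(query: str, layers: List[List[Tool]],
                   score: Callable[[str], float]) -> str:
    """Pass the intermediate result through successive layers of tools,
    keeping the highest-scoring candidate at each layer."""
    state = query
    for layer in layers:
        candidates = [tool(state) for tool in layer]
        state = max(candidates, key=score)
    return state

# Hypothetical stand-ins for chemistry tools.
def name_to_smiles(text: str) -> str:
    return text + " | SMILES: CC(=O)OC1=CC=CC=C1C(=O)O"

def property_estimator(text: str) -> str:
    return text + " | predicted logP: 1.19"

print(run_tool_stack("aspirin",
                     layers=[[name_to_smiles], [property_estimator]],
                     score=len))  # placeholder scoring: prefer the longest output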
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12744">arXiv:2502.12744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12744">pdf</a>, <a href="https://arxiv.org/format/2502.12744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-Enhanced Reasoning Training: Activating Latent Reasoning in Small Models for Enhanced Reasoning Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bingyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhitao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Ning Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Minchuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+T">Tao Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shaojun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jing Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12744v1-abstract-short" style="display: inline;"> The rapid advancement of large language models (LLMs) has significantly enhanced their reasoning abilities, enabling increasingly complex tasks. However, these capabilities often diminish in smaller, more computationally efficient models like GPT-2. Recent research shows that reasoning distillation can help small models acquire reasoning capabilities, but most existing methods focus primarily on i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12744v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12744v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12744v1-abstract-full" style="display: none;"> The rapid advancement of large language models (LLMs) has significantly enhanced their reasoning abilities, enabling increasingly complex tasks. However, these capabilities often diminish in smaller, more computationally efficient models like GPT-2. Recent research shows that reasoning distillation can help small models acquire reasoning capabilities, but most existing methods focus primarily on improving teacher-generated reasoning paths. Our observations reveal that small models can generate high-quality reasoning paths during sampling, even without chain-of-thought prompting, though these paths are often latent due to their low probability under standard decoding strategies. To address this, we propose Self-Enhanced Reasoning Training (SERT), which activates and leverages latent reasoning capabilities in small models through self-training on filtered, self-generated reasoning paths under zero-shot conditions. 
   Experiments using OpenAI's GPT-3.5 as the teacher model and GPT-2 models as the student models demonstrate that SERT enhances the reasoning abilities of small models, improving their performance in reasoning distillation.
   Submitted 18 February, 2025; originally announced February 2025.
   Comments: Accepted by the 50th IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)

3. arXiv:2502.12582 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
   Adaptive Prototype Model for Attribute-based Multi-label Few-shot Action Recognition
   Authors: Juefeng Xiao, Tianqi Xiang, Zhigang Tu
   Abstract: In real-world action recognition systems, incorporating more attributes helps achieve a more comprehensive understanding of human behavior. However, using a single model to simultaneously recognize multiple attributes can lead to a decrease in accuracy. In this work, we propose a novel method, the Adaptive Attribute Prototype Model (AAPM), for human action recognition, which captures rich action-relevant attribute information and strikes a balance between accuracy and robustness. First, we introduce the Text-Constrain Module (TCM) to incorporate textual information from potential labels and constrain the construction of different attribute prototype representations.
   In addition, we explore the Attribute Assignment Method (AAM) to address the issue of training bias and increase robustness during the training process. Furthermore, we construct a new attribute-based multi-label video dataset called Multi-Kinetics for evaluation, which contains various attribute labels (e.g., action, scene, object) related to human behavior. Extensive experiments demonstrate that our AAPM achieves state-of-the-art performance in both attribute-based multi-label few-shot action recognition and single-label few-shot action recognition. The project and dataset are available at an anonymous account: https://github.com/theAAPM/AAPM
   Submitted 18 February, 2025; originally announced February 2025.

4. arXiv:2502.12574 [pdf, other] cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
   HeadInfer: Memory-Efficient LLM Inference by Head-wise Offloading
   Authors: Cheng Luo, Zefan Cai, Hanshi Sun, Jinqi Xiao, Bo Yuan, Wen Xiao, Junjie Hu, Jiawei Zhao, Beidi Chen, Anima Anandkumar
   Abstract: Transformer-based large language models (LLMs) demonstrate impressive performance in long context generation. Extending the context length has disproportionately shifted the memory footprint of LLMs during inference to the key-value cache (KV cache).
   In this paper, we propose HEADINFER, which offloads the KV cache to CPU RAM while avoiding the need to fully store the KV cache for any transformer layer on the GPU. HEADINFER employs a fine-grained, head-wise offloading strategy, maintaining only selected attention heads' KV cache on the GPU while computing attention output dynamically. Through roofline analysis, we demonstrate that HEADINFER maintains computational efficiency while significantly reducing the memory footprint. We evaluate HEADINFER on the Llama-3-8B model with a 1-million-token sequence, reducing the GPU memory footprint of the KV cache from 128 GB to 1 GB and the total GPU memory usage from 207 GB to 17 GB, achieving a 92% reduction compared to BF16 baseline inference. Notably, HEADINFER enables 4-million-token inference with an 8B model on a single consumer GPU with 24 GB of memory (e.g., NVIDIA RTX 4090) without approximation methods.
   Submitted 18 February, 2025; originally announced February 2025.
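
To make the head-wise offloading idea concrete, here is a minimal PyTorch sketch, assuming a pinned CPU-resident KV cache and a single decode-time query. It only illustrates the general pattern of copying one head's keys and values to the GPU at a time; it is not the authors' HEADINFER implementation, and all shapes and names are illustrative.

import torch
import torch.nn.functional as F

def headwise_offloaded_attention(q, k_cpu, v_cpu, device="cuda"):
    """q: (batch, heads, q_len, dim) on the GPU.
    k_cpu, v_cpu: (batch, heads, kv_len, dim) kept in (ideally pinned) CPU RAM.
    Attention is computed one head at a time, so only a single head's KV
    ever occupies GPU memory."""
    out = torch.empty_like(q)
    num_heads = q.shape[1]
    for h in range(num_heads):
        k = k_cpu[:, h].to(device, non_blocking=True)  # copy this head's keys
        v = v_cpu[:, h].to(device, non_blocking=True)  # copy this head's values
        out[:, h] = F.scaled_dot_product_attention(q[:, h], k, v)
    return out

# Toy usage (requires a CUDA device): one new query token against a 4096-token cache.
if torch.cuda.is_available():
    q = torch.randn(1, 8, 1, 64, device="cuda")
    k_cpu = torch.randn(1, 8, 4096, 64, pin_memory=True)
    v_cpu = torch.randn(1, 8, 4096, 64, pin_memory=True)
    print(headwise_offloaded_attention(q, k_cpu, v_cpu).shape)  # (1, 8, 1, 64)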
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12526">arXiv:2502.12526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12526">pdf</a>, <a href="https://arxiv.org/format/2502.12526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> AnimAlte:Designing AI-Infused Cartoon Videos to Improve Preschoolers&#39; Language Learning with Family Engagement at Home </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+S">Shiya Tsang</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+R">Ruiyao Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Junren Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12526v1-abstract-short" style="display: inline;"> Cartoon videos have proven to be effective in learning vocabulary to preschool children.However, we have little knowledge about integrating AI into cartoon videos to provide systematic, multimodal vocabulary learning support. This late-breaking work present \name{}, an AI-powered cartoon video system that enables real-time Q\&amp;A, vocabulary review, and contextual learning. Preliminary findings cont&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12526v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12526v1-abstract-full" style="display: none;"> Cartoon videos have proven to be effective in learning vocabulary to preschool children.However, we have little knowledge about integrating AI into cartoon videos to provide systematic, multimodal vocabulary learning support. This late-breaking work present \name{}, an AI-powered cartoon video system that enables real-time Q\&amp;A, vocabulary review, and contextual learning. Preliminary findings contextualized how families interact with \name{} to support vocabulary learning. Parents appreciated the system for its personalized, engaging experiences, fostering collaboration, and encouraging self-reflection on parenting. This study offers valuable design implications for informing future video systems to support vocabulary learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12526v1-abstract-full').style.display = 'none'; document.getElementById('2502.12526v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09838">arXiv:2502.09838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09838">pdf</a>, <a href="https://arxiv.org/format/2502.09838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HealthGPT: A Medical Large Vision-Language Model for Unifying Comprehension and Generation via Heterogeneous Knowledge Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+T">Tianwei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenqiao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Sijing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yuqian Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Binhe Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wanggui He</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mengze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiaohui Song</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+Y">Yueting Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Ooi%2C+B+C">Beng Chin Ooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09838v2-abstract-short" style="display: inline;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09838v2-abstract-full').style.display = 'inline'; document.getElementById('2502.09838v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09838v2-abstract-full" style="display: none;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-rank adaptation (H-LoRA) technique, which is complemented by a tailored hierarchical visual perception approach and a three-stage learning strategy. 
   To train HealthGPT effectively, we devise a comprehensive medical domain-specific comprehension and generation dataset called VL-Health. Experimental results demonstrate the exceptional performance and scalability of HealthGPT in medical visual unified tasks. Our project can be accessed at https://github.com/DCDmllm/HealthGPT.
   Submitted 17 February, 2025; v1 submitted 13 February, 2025; originally announced February 2025.
   Comments: added project page

7. arXiv:2502.09723 [pdf, other] cs.CR (Cryptography and Security), cs.AI (Artificial Intelligence), cs.CL (Computation and Language)
   Making Them a Malicious Database: Exploiting Query Code to Jailbreak Aligned Large Language Models
   Authors: Qingsong Zou, Jingyu Xiao, Qing Li, Zhi Yan, Yuhang Wang, Li Xu, Wenxuan Wang, Kuofeng Gao, Ruoyu Li, Yong Jiang
   Abstract: Recent advances in large language models (LLMs) have demonstrated remarkable potential in the field of natural language processing. Unfortunately, LLMs face significant security and ethical risks. Although techniques such as safety alignment have been developed for defense, prior research reveals the possibility of bypassing such defenses through well-designed jailbreak attacks.
   In this paper, we propose QueryAttack, a novel framework to examine the generalizability of safety alignment. By treating LLMs as knowledge databases, we translate malicious queries in natural language into structured non-natural query language to bypass the safety alignment mechanisms of LLMs. We conduct extensive experiments on mainstream LLMs, and the results show that QueryAttack not only achieves high attack success rates (ASRs) but also can jailbreak various defense methods. Furthermore, we tailor a defense method against QueryAttack, which can reduce the ASR by up to 64% on GPT-4-1106. Our code is available at https://github.com/horizonsinzqs/QueryAttack.
   Submitted 20 February, 2025; v1 submitted 13 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07556">arXiv:2502.07556</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07556">pdf</a>, <a href="https://arxiv.org/format/2502.07556">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SketchFlex: Facilitating Spatial-Semantic Coherence in Text-to-Image Generation with Region-Based Sketches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haichuan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yilin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jiazhi Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Wei Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07556v1-abstract-short" style="display: inline;"> Text-to-image models can generate visually appealing images from text descriptions. Efforts have been devoted to improving model controls with prompt tuning and spatial conditioning. However, our formative study highlights the challenges for non-expert users in crafting appropriate prompts and specifying fine-grained spatial conditions (e.g., depth or canny references) to generate semantically coh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07556v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07556v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07556v1-abstract-full" style="display: none;"> Text-to-image models can generate visually appealing images from text descriptions. Efforts have been devoted to improving model controls with prompt tuning and spatial conditioning. However, our formative study highlights the challenges for non-expert users in crafting appropriate prompts and specifying fine-grained spatial conditions (e.g., depth or canny references) to generate semantically cohesive images, especially when multiple objects are involved. In response, we introduce SketchFlex, an interactive system designed to improve the flexibility of spatially conditioned image generation using rough region sketches. The system automatically infers user prompts with rational descriptions within a semantic space enriched by crowd-sourced object attributes and relationships. Additionally, SketchFlex refines users&#39; rough sketches into canny-based shape anchors, ensuring the generation quality and alignment of user intentions. Experimental results demonstrate that SketchFlex achieves more cohesive image generations than end-to-end models, meanwhile significantly reducing cognitive load and better matching user intentions compared to region-based generation baseline. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07556v1-abstract-full').style.display = 'none'; document.getElementById('2502.07556v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference: CHI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07411">arXiv:2502.07411</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07411">pdf</a>, <a href="https://arxiv.org/format/2502.07411">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> EgoTextVQA: Towards Egocentric Scene-Text Aware Video Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Sheng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Junbin Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qingyun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yicong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chua%2C+T">Tat-Seng Chua</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+A">Angela Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07411v1-abstract-short" style="display: inline;"> We introduce EgoTextVQA, a novel and rigorously constructed benchmark for egocentric QA assistance involving scene text. EgoTextVQA contains 1.5K ego-view videos and 7K scene-text aware questions that reflect real-user needs in outdoor driving and indoor house-keeping activities. The questions are designed to elicit identification and reasoning on scene text in an egocentric and dynamic environmen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07411v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07411v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07411v1-abstract-full" style="display: none;"> We introduce EgoTextVQA, a novel and rigorously constructed benchmark for egocentric QA assistance involving scene text. EgoTextVQA contains 1.5K ego-view videos and 7K scene-text aware questions that reflect real-user needs in outdoor driving and indoor house-keeping activities. The questions are designed to elicit identification and reasoning on scene text in an egocentric and dynamic environment. With EgoTextVQA, we comprehensively evaluate 10 prominent multimodal large language models. 
   Currently, all models struggle, and the best results (Gemini 1.5 Pro) are around 33% accuracy, highlighting the severe deficiency of these techniques in egocentric QA assistance. Our further investigations suggest that precise temporal grounding and multi-frame reasoning, along with high resolution and auxiliary scene-text inputs, are key for better performance. With thorough analyses and heuristic suggestions, we hope EgoTextVQA can serve as a solid testbed for research in egocentric scene-text QA assistance.
   Submitted 11 February, 2025; originally announced February 2025.

10. arXiv:2502.05824 [pdf, other] cs.NE (Neural and Evolutionary Computing)
    Aerial Reliable Collaborative Communications for Terrestrial Mobile Users via Evolutionary Multi-Objective Deep Reinforcement Learning
    Authors: Geng Sun, Jian Xiao, Jiahui Li, Jiacheng Wang, Jiawen Kang, Dusit Niyato, Shiwen Mao
    Abstract: Unmanned aerial vehicles (UAVs) have emerged as potential aerial base stations (BSs) to improve terrestrial communications. However, the limited onboard energy and antenna power of a UAV restrict its communication range and transmission capability.
    To address these limitations, this work employs collaborative beamforming through a UAV-enabled virtual antenna array to improve transmission performance from the UAV to terrestrial mobile users, under interference from non-associated BSs and dynamic channel conditions. Specifically, we introduce a memory-based random walk model to more accurately depict the mobility patterns of terrestrial mobile users. Following this, we formulate a multi-objective optimization problem (MOP) focused on maximizing the transmission rate while minimizing the flight energy consumption of the UAV swarm. Given the NP-hard nature of the formulated MOP and the highly dynamic environment, we transform this problem into a multi-objective Markov decision process and propose an improved evolutionary multi-objective reinforcement learning algorithm. Specifically, this algorithm introduces an evolutionary learning approach to obtain the approximate Pareto set for the formulated MOP. Moreover, the algorithm incorporates a long short-term memory network and a hyper-sphere-based task selection method to discern the movement patterns of terrestrial mobile users and improve the diversity of the obtained Pareto set. Simulation results demonstrate that the proposed method effectively generates a diverse range of non-dominated policies and outperforms existing methods. Additional simulations demonstrate the scalability and robustness of the proposed CB-based method under different system parameters and various unexpected circumstances.
    Submitted 9 February, 2025; originally announced February 2025.
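
The abstract above mentions a memory-based random walk for user mobility without specifying it. As a purely illustrative sketch, the snippet below implements a simple correlated (Gauss-Markov-style) random walk in which each user's next velocity blends its previous velocity with random noise through a memory coefficient alpha; this is one common instance of the idea, not the paper's exact model, and all parameter values are hypothetical.

import numpy as np

def memory_random_walk(num_users=5, steps=100, alpha=0.8, speed_std=1.0,
                       dt=1.0, seed=0):
    """Correlated random walk: alpha close to 1 means strong memory of the
    previous velocity; alpha = 0 degenerates to an uncorrelated random walk."""
    rng = np.random.default_rng(seed)
    pos = np.zeros((steps, num_users, 2))
    vel = rng.normal(scale=speed_std, size=(num_users, 2))
    for t in range(1, steps):
        noise = rng.normal(scale=speed_std, size=(num_users, 2))
        vel = alpha * vel + np.sqrt(1 - alpha**2) * noise  # blend memory and noise
        pos[t] = pos[t - 1] + vel * dt
    return pos

trajectories = memory_random_walk()
print(trajectories.shape)  # (100, 5, 2): positions of 5 users over 100 steps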
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02988">arXiv:2502.02988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02988">pdf</a>, <a href="https://arxiv.org/format/2502.02988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3701716.3715265">10.1145/3701716.3715265 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Training an LLM-as-a-Judge Model: Pipeline, Insights, and Practical Lessons </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Renjun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Libin Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jiaxin Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zong%2C+Y">Yi Zong</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xing Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02988v1-abstract-short" style="display: inline;"> The rapid advancement of large language models (LLMs) has opened new possibilities for their adoption as evaluative judges. This paper introduces Themis, a fine-tuned LLM judge that delivers sophisticated context-aware evaluations. We provide a comprehensive overview of the development pipeline for Themis, highlighting its scenario-dependent evaluation prompts and two novel methods for controlled&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02988v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02988v1-abstract-full" style="display: none;"> The rapid advancement of large language models (LLMs) has opened new possibilities for their adoption as evaluative judges. This paper introduces Themis, a fine-tuned LLM judge that delivers sophisticated context-aware evaluations. We provide a comprehensive overview of the development pipeline for Themis, highlighting its scenario-dependent evaluation prompts and two novel methods for controlled instruction generation. These designs enable Themis to effectively distill evaluative skills from teacher models, while retaining flexibility for continuous development. We introduce two human-labeled benchmarks for meta-evaluation, demonstrating that Themis can achieve high alignment with human preferences in an economical manner. 
    Additionally, we explore insights into the LLM-as-a-judge paradigm, revealing nuances in performance and the varied effects of reference answers. Notably, we observe that pure knowledge distillation from strong LLMs, though common, does not guarantee performance improvement through scaling. We propose a mitigation strategy based on instruction-following difficulty. Furthermore, we provide practical guidelines covering data balancing, prompt customization, multi-objective training, and metric aggregation. We aim for our method and findings, along with the fine-tuning data, benchmarks, and model checkpoints, to support future research and development in this area.
    Submitted 5 February, 2025; originally announced February 2025.
    Comments: accepted at WWW'25 (Industrial Track), extended version

12. arXiv:2502.02862 [pdf, other] eess.IV (Image and Video Processing), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition)
    Learning Generalizable Features for Tibial Plateau Fracture Segmentation Using Masked Autoencoder and Limited Annotations
    Authors: Peiyan Yue, Die Cai, Chu Guo, Mengxing Liu, Jun Xia, Yi Wang
    Abstract: Accurate automated segmentation of tibial plateau fractures (TPF) from computed tomography (CT) requires large amounts of annotated data to train deep learning models, but obtaining such annotations presents unique challenges.
Abstract: Accurate automated segmentation of tibial plateau fractures (TPF) from computed tomography (CT) requires large amounts of annotated data to train deep learning models, but obtaining such annotations presents unique challenges. The process demands expert knowledge to identify diverse fracture patterns, assess severity, and account for individual anatomical variations, making the annotation process highly time-consuming and expensive. Although semi-supervised learning methods can utilize unlabeled data, existing approaches often struggle with the complexity and variability of fracture morphologies, as well as limited generalizability across datasets. To tackle these issues, we propose an effective training strategy based on the masked autoencoder (MAE) for accurate TPF segmentation in CT. Our method leverages MAE pretraining to capture global skeletal structures and fine-grained fracture details from unlabeled data, followed by fine-tuning with a small set of labeled data. This strategy reduces the dependence on extensive annotations while enhancing the model's ability to learn generalizable and transferable features. The proposed method is evaluated on an in-house dataset containing 180 CT scans with TPF. Experimental results demonstrate that our method consistently outperforms semi-supervised methods, achieving an average Dice similarity coefficient (DSC) of 95.81%, an average symmetric surface distance (ASSD) of 1.91 mm, and a Hausdorff distance (95HD) of 9.42 mm with only 20 annotated cases. Moreover, our method exhibits strong transferability when applied to another public pelvic CT dataset with hip fractures, highlighting its potential for broader applications in fracture segmentation tasks.
Submitted: 4 February, 2025; originally announced February 2025.
Comments: 5 pages, 6 figures

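The two-stage recipe summarized above (self-supervised MAE-style pretraining on unlabeled data, then fine-tuning with a small labeled set) can be sketched in a few lines. The PyTorch fragment below is illustrative only: the tiny encoder and decoder, the 75% masking ratio, and the random tensors standing in for CT patches and labels are assumptions of this sketch, not the authors' architecture or data.

```python
# Minimal two-stage sketch of "MAE pretraining, then fine-tuning with few labels".
# All modules and tensors here are toy stand-ins, not the paper's model.
import torch
import torch.nn as nn

class TinyEncoder(nn.Module):
    def __init__(self, patch_dim=256, width=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(patch_dim, width), nn.ReLU(), nn.Linear(width, width))
    def forward(self, x):                      # x: (B, N_patches, patch_dim)
        return self.net(x)

encoder = TinyEncoder()
decoder = nn.Linear(128, 256)                  # reconstructs masked patches
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

# Stage 1: self-supervised MAE-style pretraining on unlabeled patches.
for step in range(100):
    patches = torch.randn(8, 64, 256)          # stand-in for unlabeled CT patches
    mask = torch.rand(8, 64) < 0.75            # hide ~75% of patches
    visible = patches * (~mask).unsqueeze(-1)  # zero out masked patches (simplified masking)
    recon = decoder(encoder(visible))
    loss = ((recon - patches)[mask]).pow(2).mean()   # reconstruct only the masked patches
    opt.zero_grad(); loss.backward(); opt.step()

# Stage 2: fine-tune the pretrained encoder plus a small head on the few labeled cases.
head = nn.Linear(128, 2)                       # 2 classes: background / fracture
ft_opt = torch.optim.Adam(list(encoder.parameters()) + list(head.parameters()), lr=1e-4)
for step in range(50):
    patches = torch.randn(2, 64, 256)          # stand-in for the small annotated set
    labels = torch.randint(0, 2, (2, 64))
    logits = head(encoder(patches))
    loss = nn.functional.cross_entropy(logits.reshape(-1, 2), labels.reshape(-1))
    ft_opt.zero_grad(); loss.backward(); ft_opt.step()
```

The point of the sketch is the schedule: first learn structure from unlabeled data by reconstructing masked patches, then reuse the pretrained encoder under a small supervised segmentation head.
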
arXiv:2502.02225 [pdf, other]
Title: Exploring the latent space of diffusion models directly through singular value decomposition
Authors: Li Wang, Boyan Gao, Yanran Li, Zhao Wang, Xiaosong Yang, David A. Clifton, Jun Xiao
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.MM (Multimedia)
Abstract: Despite the groundbreaking success of diffusion models in generating high-fidelity images, their latent space remains relatively under-explored, even though it holds significant promise for enabling versatile and interpretable image editing capabilities. The complicated denoising trajectory and high dimensionality of the latent space make it extremely challenging to interpret. Existing methods mainly explore the feature space of the U-Net in Diffusion Models (DMs) instead of the latent space itself. In contrast, we directly investigate the latent space via Singular Value Decomposition (SVD) and discover three useful properties that can be used to control generation results without requiring data collection, while maintaining the identity fidelity of generated images. Based on these properties, we propose a novel image editing framework that is capable of learning arbitrary attributes from one pair of latent codes destined by text prompts in Stable Diffusion Models. To validate our approach, extensive experiments are conducted to demonstrate its effectiveness and flexibility in image editing. We will release our code soon to foster further research and applications in this area.
Submitted: 4 February, 2025; originally announced February 2025.

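Since the abstract's starting point is taking an SVD of latent codes directly, a minimal PyTorch illustration of that operation is shown below. The latents, the per-channel decomposition, and the toy singular-value blend are all stand-ins of this sketch; they do not reproduce the three properties or the editing framework described in the paper.

```python
# Illustrative only: inspect a pair of (hypothetical) Stable Diffusion latents via SVD.
# The latents are random stand-ins with the usual 4x64x64 layout; the "edit" rule below
# is a toy example, not the paper's method.
import torch

z_src = torch.randn(4, 64, 64)        # latent code obtained from prompt A (stand-in)
z_tgt = torch.randn(4, 64, 64)        # latent code obtained from prompt B (stand-in)

def svd_per_channel(z):
    # torch.linalg.svd operates on the last two dims, so this is a per-channel SVD.
    U, S, Vh = torch.linalg.svd(z, full_matrices=False)
    return U, S, Vh

U_s, S_s, Vh_s = svd_per_channel(z_src)
U_t, S_t, Vh_t = svd_per_channel(z_tgt)

# Toy edit: keep the source singular vectors but blend the singular values toward the
# target, nudging the spectrum (and hence coarse structure) without any extra data.
alpha = 0.5
S_mix = (1 - alpha) * S_s + alpha * S_t
z_edit = U_s @ torch.diag_embed(S_mix) @ Vh_s
print(z_edit.shape)                   # torch.Size([4, 64, 64])
```
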
arXiv:2502.01666 [pdf, other]
Title: Leveraging Stable Diffusion for Monocular Depth Estimation via Image Semantic Encoding
Authors: Jingming Xia, Guanqun Cao, Guang Ma, Yiben Luo, Qinzhao Li, John Oyekan
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Abstract: Monocular depth estimation involves predicting depth from a single RGB image and plays a crucial role in applications such as autonomous driving, robotic navigation, 3D reconstruction, etc. Recent advancements in learning-based methods have significantly improved depth estimation performance. Generative models, particularly Stable Diffusion, have shown remarkable potential in recovering fine details and reconstructing missing regions through large-scale training on diverse datasets. However, models like CLIP, which rely on textual embeddings, face limitations in complex outdoor environments where rich context information is needed.
These limitations reduce their effectiveness in such challenging scenarios. Here, we propose a novel image-based semantic embedding that extracts contextual information directly from visual features, significantly improving depth prediction in complex environments. Evaluated on the KITTI and Waymo datasets, our method achieves performance comparable to state-of-the-art models while addressing the shortcomings of CLIP embeddings in handling outdoor scenes. By leveraging visual semantics directly, our method demonstrates enhanced robustness and adaptability in depth estimation tasks, showcasing its potential for application to other visual perception tasks.
Submitted: 1 February, 2025; originally announced February 2025.

arXiv:2502.01035 [pdf, other]
Title: UASTHN: Uncertainty-Aware Deep Homography Estimation for UAV Satellite-Thermal Geo-localization
Authors: Jiuhong Xiao, Giuseppe Loianno
Subjects: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
Abstract: Geo-localization is an essential component of Unmanned Aerial Vehicle (UAV) navigation systems to ensure precise absolute self-localization in outdoor environments. To address the challenges of GPS signal interruptions or low illumination, Thermal Geo-localization (TG) employs aerial thermal imagery to align with reference satellite maps to accurately determine the UAV's location.
However, existing TG methods lack uncertainty measurement in their outputs, compromising system robustness in the presence of textureless or corrupted thermal images, self-similar or outdated satellite maps, geometric noise, or thermal images exceeding satellite maps. To overcome these limitations, this paper presents UASTHN, a novel approach for Uncertainty Estimation (UE) in Deep Homography Estimation (DHE) tasks for TG applications. Specifically, we introduce a novel Crop-based Test-Time Augmentation (CropTTA) strategy, which leverages the homography consensus of cropped image views to effectively measure data uncertainty. This approach is complemented by Deep Ensembles (DE) employed for model uncertainty, offering comparable performance with improved efficiency and seamless integration with any DHE model. Extensive experiments across multiple DHE models demonstrate the effectiveness and efficiency of CropTTA in TG applications. Analysis of detected failure cases underscores the improved reliability of CropTTA under challenging conditions. Finally, we demonstrate the capability of combining CropTTA and DE for a comprehensive assessment of both data and model uncertainty. Our research provides profound insights into the broader intersection of localization and uncertainty estimation. The code and data are publicly available.
Submitted: 2 February, 2025; originally announced February 2025.
Comments: 7 pages, 6 figures, accepted at ICRA 2025

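The CropTTA idea described above, measuring data uncertainty through the agreement of homography estimates across cropped views, can be sketched schematically. The `estimate_homography` function below is a hypothetical placeholder (here it just returns random corner offsets), and the crop layout and disagreement score are assumptions of this sketch rather than the authors' implementation.

```python
# Schematic reading of a crop-based test-time-augmentation idea: run a (hypothetical)
# homography estimator on several crops of the thermal image and use the spread of the
# predicted corner displacements as a data-uncertainty proxy. Not the authors' code.
import numpy as np

def estimate_homography(thermal_crop, satellite_ref):
    """Placeholder for a deep homography model: returns 4 corner offsets, shape (4, 2)."""
    rng = np.random.default_rng(abs(hash(thermal_crop.tobytes())) % (2**32))
    return rng.normal(scale=2.0, size=(4, 2))   # fake prediction, for illustration only

def crop_tta_uncertainty(thermal, satellite, crop=192, n_views=4):
    h, w = thermal.shape
    corners = [(0, 0), (0, w - crop), (h - crop, 0), (h - crop, w - crop)][:n_views]
    preds = []
    for top, left in corners:
        view = thermal[top:top + crop, left:left + crop]
        preds.append(estimate_homography(view, satellite))
    preds = np.stack(preds)                      # (n_views, 4, 2)
    consensus = preds.mean(axis=0)               # agreed-upon corner offsets
    spread = np.linalg.norm(preds - consensus, axis=-1).mean()   # disagreement score
    return consensus, spread

thermal = np.random.rand(256, 256).astype(np.float32)
satellite = np.random.rand(256, 256).astype(np.float32)
_, uncertainty = crop_tta_uncertainty(thermal, satellite)
print(f"data-uncertainty proxy: {uncertainty:.2f}")
```
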
arXiv:2502.00298 [pdf, ps, other]
Title: The Price of Linear Time: Error Analysis of Structured Kernel Interpolation
Authors: Alexander Moreno, Justin Xiao, Jonathan Mei
Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
Abstract: Structured Kernel Interpolation (SKI) (Wilson et al. 2015) helps scale Gaussian Processes (GPs) by approximating the kernel matrix via interpolation at inducing points, achieving linear computational complexity. However, it lacks rigorous theoretical error analysis. This paper bridges the gap: we prove error bounds for the SKI Gram matrix and examine the error's effect on hyperparameter estimation and posterior inference. We further provide a practical guide to selecting the number of inducing points under convolutional cubic interpolation: they should grow as $n^{d/3}$ for error control. Crucially, we identify two dimensionality regimes governing the trade-off between SKI Gram matrix spectral norm error and computational complexity. For $d \leq 3$, any error tolerance can achieve linear time for sufficiently large sample size. For $d > 3$, the error must increase with sample size to maintain linear time. Our analysis provides key insights into SKI's scalability-accuracy trade-offs, establishing precise conditions for achieving linear-time GP inference with controlled approximation error.
Submitted: 3 February, 2025; v1 submitted 31 January, 2025; originally announced February 2025.

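The practical guideline quoted above, that the number of inducing points should grow as $n^{d/3}$ under convolutional cubic interpolation, is easy to make concrete numerically; the proportionality constant in the sketch below is arbitrary and only the scaling matters.

```python
# Numeric illustration of the guideline that the number of inducing points m should
# grow like n**(d/3) for error control; the constant factor is arbitrary.
import math

def inducing_points(n, d, c=1.0):
    return math.ceil(c * n ** (d / 3))

for d in (1, 2, 3, 4):
    row = [inducing_points(n, d) for n in (10_000, 100_000, 1_000_000)]
    print(f"d={d}: m ~ {row}")
# For d <= 3, m grows no faster than n, consistent with the linear-time regime;
# for d > 3, m would have to outpace n unless the error is allowed to grow.
```
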
arXiv:2501.19298 [pdf, other]
Title: Synthetic User Behavior Sequence Generation with Large Language Models for Smart Homes
Authors: Zhiyao Xu, Dan Zhao, Qingsong Zou, Jingyu Xiao, Yong Jiang, Zhenhui Yuan, Qing Li
Subjects: cs.AI (Artificial Intelligence); cs.LG (Machine Learning); cs.NI (Networking and Internet Architecture)
Abstract: In recent years, as smart home systems have become more widespread, security concerns within these environments have become a growing threat. Currently, most smart home security solutions, such as anomaly detection and behavior prediction models, are trained using fixed datasets that are precollected. However, the process of dataset collection is time-consuming and lacks the flexibility needed to adapt to the constantly evolving smart home environment. Additionally, the collection of personal data raises significant privacy concerns for users. Lately, large language models (LLMs) have emerged as a powerful tool for a wide range of tasks across diverse application domains, thanks to their strong capabilities in natural language processing, reasoning, and problem-solving. In this paper, we propose IoTGen, an LLM-based synthetic dataset generation framework that enhances the generalization of downstream smart home intelligent models.
By generating new synthetic datasets that reflect changes in the environment, smart home intelligent models can be retrained to overcome the limitations of fixed and outdated data, allowing them to better align with the dynamic nature of real-world home environments. Specifically, we first propose a Structure Pattern Perception Compression (SPPC) method tailored for IoT behavior data, which preserves the most informative content in the data while significantly reducing token consumption. Then, we propose a systematic approach to create prompts and implement data generation to automatically generate IoT synthetic data with normative and reasonable properties, assisting task models in adaptive training to improve generalization and real-world performance.
Submitted: 31 January, 2025; originally announced January 2025.

arXiv:2501.19267 [pdf]
Title: Transformer-Based Financial Fraud Detection with Cloud-Optimized Real-Time Streaming
Authors: Tingting Deng, Shuochen Bi, Jue Xiao
Subjects: cs.CE (Computational Engineering, Finance, and Science)
Abstract: As the financial industry becomes more interconnected and reliant on digital systems, fraud detection systems must evolve to meet growing threats. Cloud-enabled Transformer models present a transformative opportunity to address these challenges. By leveraging the scalability, flexibility, and advanced AI capabilities of cloud platforms, companies can deploy fraud detection solutions that adapt to real-time data patterns and proactively respond to evolving threats.
Using a graph self-attention Transformer neural network module, we can directly mine gang fraud features from the transaction network without complicated feature engineering. Finally, the fraud prediction network is combined to optimize the topological pattern and the temporal transaction pattern to realize high-precision detection of fraudulent transactions. The results of anti-fraud experiments on credit card transaction data show that the proposed model outperforms the 7 baseline models on all evaluation indicators: in the transaction fraud detection task, the average accuracy (AP) increased by 20% and the area under the ROC curve (AUC) increased by 2.7% on average compared with the benchmark graph attention neural network (GAT), which verifies the effectiveness of the proposed model in the detection of credit card fraud transactions.
Submitted: 31 January, 2025; originally announced January 2025.
Comments: 8 pages, 3 figures, 2 tables. arXiv admin note: text overlap with arXiv:2406.03733 by other authors

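The abstract reports gains in AP and in the area under the ROC curve over a GAT baseline. Reading AP in its usual sense of average precision (the abstract itself glosses it as average accuracy), the two metrics can be computed generically as follows; the labels and scores here are synthetic stand-ins, not the paper's credit card data.

```python
# Generic computation of the two reported metrics (AP and ROC-AUC) for fraud scores,
# using synthetic labels and scores purely for illustration.
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)                          # 0 = legitimate, 1 = fraud
y_score = np.clip(y_true * 0.3 + rng.normal(0.4, 0.2, size=1000), 0, 1)

print(f"AP  = {average_precision_score(y_true, y_score):.3f}")
print(f"AUC = {roc_auc_score(y_true, y_score):.3f}")
```
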
arXiv:2501.19129 [pdf, other]
Title: RGB-Event ISP: The Dataset and Benchmark
Authors: Yunfan Lu, Yanlin Qian, Ziyang Rao, Junren Xiao, Liming Chen, Hui Xiong
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Abstract: Event-guided imaging has received significant attention due to its potential to revolutionize instant imaging systems. However, prior methods primarily focus on enhancing RGB images in a post-processing manner, neglecting the challenges the image signal processor (ISP) faces in dealing with event sensors and the benefits events provide for reforming the ISP process. To address this, we conduct the first research on event-guided ISP. First, we present a new event-RAW paired dataset, collected with a novel but still confidential sensor that records pixel-level aligned events and RAW images. This dataset includes 3373 RAW images with 2248 x 3264 resolution and their corresponding events, spanning 24 scenes with 3 exposure modes and 3 lenses. Second, we propose a conventional ISP pipeline to generate good RGB frames as reference. This conventional ISP pipeline performs basic ISP operations, e.g., demosaicing, white balancing, denoising and color space transforming, with a ColorChecker as reference. Third, we classify the existing learnable ISP methods into 3 classes, and select multiple methods to train and evaluate on our new dataset. Lastly, since there is no prior work for reference, we propose a simple event-guided ISP method and test it on our dataset. We further put forward key technical challenges and future directions in RGB-Event ISP. In summary, to the best of our knowledge, this is the very first research focusing on event-guided ISP, and we hope it will inspire the community. The code and dataset are available at: https://github.com/yunfanLu/RGB-Event-ISP.
Submitted: 31 January, 2025; originally announced January 2025.
Comments: Accepted by ICLR 2025; 14 pages, 8 figures, 4 tables

arXiv:2501.18492 [pdf, other]
Title: GuardReasoner: Towards Reasoning-based LLM Safeguards
Authors: Yue Liu, Hongcheng Gao, Shengfang Zhai, Jun Xia, Tianyi Wu, Zhiwei Xue, Yulin Chen, Kenji Kawaguchi, Jiaheng Zhang, Bryan Hooi
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Abstract: As LLMs increasingly impact safety-critical applications, ensuring their safety using guardrails remains a key challenge. This paper proposes GuardReasoner, a new safeguard for LLMs, by guiding the guard model to learn to reason. Concretely, we first create the GuardReasonerTrain dataset, which consists of 127K samples with 460K detailed reasoning steps. Then, we introduce reasoning SFT to unlock the reasoning capability of guard models. In addition, we present hard sample DPO to further strengthen their reasoning ability. In this manner, GuardReasoner achieves better performance, explainability, and generalizability. Extensive experiments and analyses on 13 benchmarks of 3 guardrail tasks demonstrate its superiority.
Remarkably, GuardReasoner 8B surpasses GPT-4o+CoT by 5.74% and LLaMA Guard 3 8B by 20.84% F1 score on average. We release the training data, code, and models at different scales (1B, 3B, 8B) of GuardReasoner: https://github.com/yueliu1999/GuardReasoner/.
Submitted: 30 January, 2025; originally announced January 2025.
Comments: 22 pages, 18 figures

arXiv:2501.15504 [pdf, other]
Title: Task Scheduling in Geo-Distributed Computing: A Survey
Authors: Yujian Wu, Shanjiang Tang, Ce Yu, Bin Yang, Chao Sun, Jian Xiao, Hutong Wu
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
Abstract: Geo-distributed computing, a paradigm that assigns computational tasks to globally distributed nodes, has emerged as a promising approach in cloud computing, edge computing, cloud-edge computing and supercomputing (HPC). It enables low-latency services, ensures data locality, and handles large-scale applications.
As global computing capacity and task demands increase rapidly, scheduling tasks for efficient execution in geo-distributed computing systems has become an increasingly critical research challenge. It arises from the inherent characteristics of geographic distribution, including heterogeneous network conditions, region-specific resource pricing, and varying computational capabilities across locations. Researchers have developed diverse task scheduling methods tailored to geo-distributed scenarios, aiming to achieve objectives such as performance enhancement, fairness assurance, and fault-tolerance improvement. This survey provides a comprehensive and systematic review of task scheduling techniques across four major distributed computing environments, with an in-depth analysis of these approaches based on their core scheduling objectives. Through our analysis, we identify key research challenges and outline promising directions for advancing task scheduling in geo-distributed computing.
Submitted: 26 January, 2025; originally announced January 2025.

arXiv:2501.12235 [pdf, other]
Title: DLEN: Dual Branch of Transformer for Low-Light Image Enhancement in Dual Domains
Authors: Junyu Xia, Jiesong Bai, Yihang Dong
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Abstract: Low-light image enhancement (LLE) aims to improve the visual quality of images captured in poorly lit conditions, which often suffer from low brightness, low contrast, noise, and color distortions. These issues hinder the performance of computer vision tasks such as object detection, facial recognition, and autonomous driving. Traditional enhancement techniques, such as multi-scale fusion and histogram equalization, fail to preserve fine details and often struggle with maintaining the natural appearance of enhanced images under complex lighting conditions. Although the Retinex theory provides a foundation for image decomposition, it often amplifies noise, leading to suboptimal image quality. In this paper, we propose the Dual Light Enhance Network (DLEN), a novel architecture that incorporates two distinct attention mechanisms, considering both spatial and frequency domains. Our model introduces a learnable wavelet transform module in the illumination estimation phase, preserving high- and low-frequency components to enhance edge and texture details. Additionally, we design a dual-branch structure that leverages the power of the Transformer architecture to enhance both the illumination and structural components of the image. Through extensive experiments, our model outperforms state-of-the-art methods on standard benchmarks. Code is available here: https://github.com/LaLaLoXX/DLEN
Submitted: 21 January, 2025; originally announced January 2025.
Comments: 10 pages, 6 figures

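DLEN's illumination branch is described as using a learnable wavelet transform to keep high- and low-frequency components separate. As a rough, non-learnable point of reference, a single-level Haar-style split looks like the sketch below; the paper's module is trainable and is not this simple decomposition.

```python
# A fixed (non-learnable) single-level Haar-style split of an image into low- and
# high-frequency parts, only to illustrate the kind of decomposition the abstract
# mentions; DLEN's learnable module differs from this toy version.
import numpy as np

def haar_split_1level(img):
    """img: (H, W) with even H, W. Returns the low-frequency band and three detail bands."""
    a = img[0::2, 0::2]; b = img[0::2, 1::2]
    c = img[1::2, 0::2]; d = img[1::2, 1::2]
    ll = (a + b + c + d) / 4.0          # low-frequency approximation
    lh = (a - b + c - d) / 4.0          # detail band (column differences)
    hl = (a + b - c - d) / 4.0          # detail band (row differences)
    hh = (a - b - c + d) / 4.0          # diagonal detail band
    return ll, (lh, hl, hh)

img = np.random.rand(256, 256).astype(np.float32)
ll, (lh, hl, hh) = haar_split_1level(img)
print(ll.shape, hh.shape)               # (128, 128) (128, 128)
```
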
arXiv:2501.12202 [pdf, other]
Title: Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation
Authors: Zibo Zhao, Zeqiang Lai, Qingxiang Lin, Yunfei Zhao, Haolin Liu, Shuhui Yang, Yifei Feng, Mingxin Yang, Sheng Zhang, Xianghui Yang, Huiwen Shi, Sicong Liu, Junta Wu, Yihang Lian, Fan Yang, Ruining Tang, Zebin He, Xinzhou Wang, Jian Liu, Xuhui Zuo, Zhuo Chen, Biwen Lei, Haohan Weng, Jing Xu, Yiling Zhu, et al. (46 additional authors not shown)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Abstract: We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets. This system includes two foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly aligns with a given condition image, laying a solid foundation for downstream applications. The texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant texture maps for either generated or hand-crafted meshes. Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production platform that simplifies the re-creation process of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models, including open-source and closed-source models, in geometry details, condition alignment, texture quality, etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps in the open-source 3D community for large-scale foundation generative models. The code and pre-trained weights of our models are available at: https://github.com/Tencent/Hunyuan3D-2
Submitted: 22 January, 2025; v1 submitted 21 January, 2025; originally announced January 2025.
Comments: GitHub link: https://github.com/Tencent/Hunyuan3D-2

arXiv:2501.11937 [pdf, other]
Title: MeshONet: A Generalizable and Efficient Operator Learning Method for Structured Mesh Generation
Authors: Jing Xiao, Xinhai Chen, Qingling Wang, Jie Liu
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Abstract: Mesh generation plays a crucial role in scientific computing. Traditional mesh generation methods, such as TFI and PDE-based methods, often struggle to achieve a balance between efficiency and mesh quality. To address this challenge, physics-informed intelligent learning methods have recently emerged, significantly improving generation efficiency while maintaining high mesh quality. However, physics-informed methods fail to generalize when applied to previously unseen geometries, as even small changes in the boundary shape necessitate burdensome retraining to adapt to new geometric variations. In this paper, we introduce MeshONet, the first generalizable intelligent learning method for structured mesh generation. The method transforms the mesh generation task into an operator learning problem with multiple input and solution functions. To effectively overcome the multivariable mapping restriction of operator learning methods, we propose a dual-branch, shared-trunk architecture to approximate the mapping between function spaces based on input-output pairs. Experimental results show that MeshONet achieves a speedup of up to four orders of magnitude in generation efficiency over traditional methods. It also enables generalization to different geometries without retraining, greatly enhancing the practicality of intelligent methods.
Submitted: 21 January, 2025; v1 submitted 21 January, 2025; originally announced January 2025.

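The dual-branch, shared-trunk design described above is in the DeepONet family: branch networks encode the sampled input functions, a shared trunk encodes query coordinates, and their combination predicts the output at those coordinates. The sketch below is a generic illustration under that reading; the layer sizes, the number of sensor points, and the product-style combination rule are guesses, not MeshONet's actual architecture.

```python
# DeepONet-flavoured sketch of a "dual-branch, shared-trunk" operator network: two branch
# nets encode two input functions (e.g. sampled boundary curves), a shared trunk encodes
# query coordinates, and their products give the predicted mesh coordinates.
import torch
import torch.nn as nn

def mlp(i, o, w=64):
    return nn.Sequential(nn.Linear(i, w), nn.Tanh(), nn.Linear(w, w), nn.Tanh(), nn.Linear(w, o))

class DualBranchSharedTrunk(nn.Module):
    def __init__(self, n_sensors=100, p=64):
        super().__init__()
        self.branch1 = mlp(n_sensors, p)    # encodes input function 1 (sampled values)
        self.branch2 = mlp(n_sensors, p)    # encodes input function 2
        self.trunk = mlp(2, p)              # shared trunk over query points (xi, eta)

    def forward(self, f1, f2, xy):
        # f1, f2: (B, n_sensors); xy: (B, n_query, 2) -> output: (B, n_query, 2)
        b1, b2 = self.branch1(f1), self.branch2(f2)          # (B, p) each
        t = self.trunk(xy)                                    # (B, n_query, p)
        u = torch.einsum("bp,bqp->bq", b1, t)                 # first output channel
        v = torch.einsum("bp,bqp->bq", b2, t)                 # second output channel
        return torch.stack([u, v], dim=-1)

model = DualBranchSharedTrunk()
out = model(torch.randn(4, 100), torch.randn(4, 100), torch.rand(4, 50, 2))
print(out.shape)    # torch.Size([4, 50, 2])
```
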
arXiv:2501.11508 [pdf, other]
Title: See In Detail: Enhancing Sparse-view 3D Gaussian Splatting with Local Depth and Semantic Regularization
Authors: Zongqi He, Zhe Xiao, Kin-Chung Chan, Yushen Zuo, Jun Xiao, Kin-Man Lam
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Abstract: 3D Gaussian Splatting (3DGS) has shown remarkable performance in novel view synthesis. However, its rendering quality deteriorates with sparse input views, leading to distorted content and reduced details. This limitation hinders its practical application. To address this issue, we propose a sparse-view 3DGS method. Given the inherently ill-posed nature of sparse-view rendering, incorporating prior information is crucial. We propose a semantic regularization technique, using features extracted from the pretrained DINO-ViT model, to ensure multi-view semantic consistency. Additionally, we propose local depth regularization, which constrains depth values to improve generalization on unseen views.
Our method outperforms state-of-the-art novel view synthesis approaches, achieving up to a 0.4 dB improvement in terms of PSNR on the LLFF dataset, with reduced distortion and enhanced visual quality.
Submitted: 20 January, 2025; originally announced January 2025.
Comments: 5 pages, 5 figures, accepted by ICASSP 2025

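The 0.4 dB figure above is a PSNR gap, so for reference here is the standard PSNR computation for images scaled to [0, 1]; the arrays below are synthetic stand-ins for a rendered view and its ground truth.

```python
# PSNR as used to report the ~0.4 dB gain: peak signal-to-noise ratio between a rendered
# view and the ground-truth image, here for arrays scaled to [0, 1].
import numpy as np

def psnr(pred, target, peak=1.0):
    mse = np.mean((pred - target) ** 2)
    return float("inf") if mse == 0 else 10.0 * np.log10(peak**2 / mse)

gt = np.random.rand(128, 128, 3)
render = np.clip(gt + np.random.normal(0, 0.05, gt.shape), 0, 1)
print(f"PSNR: {psnr(render, gt):.2f} dB")
```
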
However, creating accurate, large-scale land cover datasets remains a significant challenge due to the inherent complexities of geospatial data, such as diverse terrain, varying sensor modalities&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10891v2-abstract-full').style.display = 'inline'; document.getElementById('2501.10891v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10891v2-abstract-full" style="display: none;"> High-resolution land cover mapping plays a crucial role in addressing a wide range of global challenges, including urban planning, environmental monitoring, disaster response, and sustainable development. However, creating accurate, large-scale land cover datasets remains a significant challenge due to the inherent complexities of geospatial data, such as diverse terrain, varying sensor modalities, and atmospheric conditions. Synthetic Aperture Radar (SAR) imagery, with its ability to penetrate clouds and capture data in all-weather, day-and-night conditions, offers unique advantages for land cover mapping. Despite these strengths, the lack of benchmark datasets tailored for SAR imagery has limited the development of robust models specifically designed for this data modality. To bridge this gap and facilitate advancements in SAR-based geospatial analysis, we introduce OpenEarthMap-SAR, a benchmark SAR dataset for global high-resolution land cover mapping. OpenEarthMap-SAR consists of 1.5 million segments of 5033 aerial and satellite images with the size of 1024$\times$1024 pixels, covering 35 regions from Japan, France, and the USA, with partially manually annotated and fully pseudo 8-class land cover labels at a ground sampling distance of 0.15--0.5 m. We evaluated the performance of state-of-the-art methods for semantic segmentation and present challenging problem settings suitable for further technical development. The dataset also serves as the official dataset for the IEEE GRSS Data Fusion Contest Track I. The dataset has been made publicly available at https://zenodo.org/records/14622048. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10891v2-abstract-full').style.display = 'none'; document.getElementById('2501.10891v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
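For orientation, segmentation results on such an 8-class benchmark are commonly scored with mean intersection-over-union; the snippet below is a generic sketch of that metric, not the contest's official scoring script, and the array names are illustrative.
<pre><code>
import numpy as np

def mean_iou(pred, target, num_classes=8):
    # pred, target: integer class maps of identical shape.
    ious = []
    for c in range(num_classes):
        p, t = (pred == c), (target == c)
        union = np.logical_or(p, t).sum()
        if union:  # skip classes absent from both maps
            ious.append(np.logical_and(p, t).sum() / union)
    return float(np.mean(ious))

pred = np.random.randint(0, 8, size=(1024, 1024))
print(mean_iou(pred, pred))  # 1.0 for a perfect prediction
</code></pre>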
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08332">arXiv:2501.08332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.08332">pdf</a>, <a href="https://arxiv.org/format/2501.08332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MangaNinja: Line Art Colorization with Precise Reference Following </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K+L">Ka Leong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jie Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+H">Hao Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yujun Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+P">Ping Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08332v1-abstract-short" style="display: inline;"> Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matchin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08332v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08332v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08332v1-abstract-full" style="display: none;"> Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, cross-character colorization, multi-reference harmonization, beyond the reach of existing algorithms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08332v1-abstract-full').style.display = 'none'; document.getElementById('2501.08332v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page and code: https://johanan528.github.io/MangaNinjia/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07045">arXiv:2501.07045</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07045">pdf</a>, <a href="https://arxiv.org/format/2501.07045">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ACCon: Angle-Compensated Contrastive Regularizer for Deep Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Botao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+X">Xiaoyang Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+Z">Zuheng Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+J">Junqing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jing Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianzong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07045v1-abstract-short" style="display: inline;"> In deep regression, capturing the relationship among continuous labels in feature space is a fundamental challenge that has attracted increasing interest. Addressing this issue can prevent models from converging to suboptimal solutions across various regression tasks, leading to improved performance, especially for imbalanced regression and under limited sample sizes. However, existing approaches&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07045v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07045v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07045v1-abstract-full" style="display: none;"> In deep regression, capturing the relationship among continuous labels in feature space is a fundamental challenge that has attracted increasing interest. Addressing this issue can prevent models from converging to suboptimal solutions across various regression tasks, leading to improved performance, especially for imbalanced regression and under limited sample sizes. However, existing approaches often rely on order-aware representation learning or distance-based weighting. In this paper, we hypothesize a linear negative correlation between label distances and representation similarities in regression tasks. 
To implement this, we propose an angle-compensated contrastive regularizer for deep regression, which adjusts the cosine distance between anchor and negative samples within the contrastive learning framework. Our method offers a plug-and-play compatible solution that extends most existing contrastive learning methods for regression tasks. Extensive experiments and theoretical analysis demonstrate that our proposed angle-compensated contrastive regularizer not only achieves competitive regression performance but also excels in data efficiency and effectiveness on imbalanced datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07045v1-abstract-full').style.display = 'none'; document.getElementById('2501.07045v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by AAAI-2025 (The 39th Annual AAAI Conference on Artificial Intelligence)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06173">arXiv:2501.06173</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06173">pdf</a>, <a href="https://arxiv.org/format/2501.06173">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VideoAuteur: Towards Long Narrative Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Junfei Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+F">Feng Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+L">Lu Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+L">Liangke Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Cen%2C+J">Jiepeng Cen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zhibei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yuille%2C+A">Alan Yuille</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Lu Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06173v1-abstract-short" style="display: inline;"> Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. 
In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06173v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06173v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06173v1-abstract-full" style="display: none;"> Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation in the cooking domain. We validate the quality of our proposed dataset in terms of visual fidelity and textual caption accuracy using state-of-the-art Vision-Language Models (VLMs) and video generation models, respectively. We further introduce a Long Narrative Video Director to enhance both visual and semantic coherence in generated videos and emphasize the role of aligning visual embeddings to achieve improved overall video quality. Our method demonstrates substantial improvements in generating visually detailed and semantically aligned keyframes, supported by finetuning techniques that integrate text and image embeddings within the video generation process. Project page: https://videoauteur.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06173v1-abstract-full').style.display = 'none'; document.getElementById('2501.06173v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint, https://videoauteur.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06019">arXiv:2501.06019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06019">pdf</a>, <a href="https://arxiv.org/format/2501.06019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> BRIGHT: A globally distributed multimodal building damage assessment dataset with very-high-resolution for all-weather disaster response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongruixuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jian Song</a>, <a href="/search/cs?searchtype=author&amp;query=Dietrich%2C+O">Olivier Dietrich</a>, <a href="/search/cs?searchtype=author&amp;query=Broni-Bediako%2C+C">Clifford Broni-Bediako</a>, <a href="/search/cs?searchtype=author&amp;query=Xuan%2C+W">Weihao Xuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Junjue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+X">Xinlei Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yimin Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Junshi Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+C">Cuiling Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Schindler%2C+K">Konrad Schindler</a>, <a href="/search/cs?searchtype=author&amp;query=Yokoya%2C+N">Naoto Yokoya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06019v1-abstract-short" style="display: inline;"> Disaster events occur around the world and cause significant damage to human life and property. Earth observation (EO) data enables rapid and comprehensive building damage assessment (BDA), an essential capability in the aftermath of a disaster to reduce human casualties and to inform disaster relief efforts. Recent research focuses on the development of AI models to achieve accurate mapping of un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06019v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06019v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06019v1-abstract-full" style="display: none;"> Disaster events occur around the world and cause significant damage to human life and property. Earth observation (EO) data enables rapid and comprehensive building damage assessment (BDA), an essential capability in the aftermath of a disaster to reduce human casualties and to inform disaster relief efforts. 
Recent research focuses on the development of AI models to achieve accurate mapping of unseen disaster events, mostly using optical EO data. However, solutions based on optical data are limited to clear skies and daylight hours, preventing a prompt response to disasters. Integrating multimodal (MM) EO data, particularly the combination of optical and SAR imagery, makes it possible to provide all-weather, day-and-night disaster responses. Despite this potential, the development of robust multimodal AI models has been constrained by the lack of suitable benchmark datasets. In this paper, we present a BDA dataset using veRy-hIGH-resoluTion optical and SAR imagery (BRIGHT) to support AI-based all-weather disaster response. To the best of our knowledge, BRIGHT is the first open-access, globally distributed, event-diverse MM dataset specifically curated to support AI-based disaster response. It covers five types of natural disasters and two types of man-made disasters across 12 regions worldwide, with a particular focus on developing countries where external assistance is most needed. The optical and SAR imagery in BRIGHT, with a spatial resolution between 0.3-1 meters, provides detailed representations of individual buildings, making it ideal for precise BDA. In our experiments, we have tested seven advanced AI models trained with our BRIGHT to validate the transferability and robustness. The dataset and code are available at https://github.com/ChenHongruixuan/BRIGHT. BRIGHT also serves as the official dataset for the 2025 IEEE GRSS Data Fusion Contest. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06019v1-abstract-full').style.display = 'none'; document.getElementById('2501.06019v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01059">arXiv:2501.01059</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01059">pdf</a>, <a href="https://arxiv.org/format/2501.01059">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Attention-Guided Context Decoding for Mitigating Context Faithfulness Hallucinations in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yanwen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Ning Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhitao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shaojun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jing Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01059v1-abstract-short" style="display: inline;"> Large language models (LLMs) often suffer from context faithfulness hallucinations, where outputs deviate from retrieved information due to insufficient context utilization and high output uncertainty. Our uncertainty evaluation experiments reveal a strong correlation between high uncertainty and hallucinations. We hypothesize that attention mechanisms encode signals indicative of contextual utili&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01059v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01059v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01059v1-abstract-full" style="display: none;"> Large language models (LLMs) often suffer from context faithfulness hallucinations, where outputs deviate from retrieved information due to insufficient context utilization and high output uncertainty. Our uncertainty evaluation experiments reveal a strong correlation between high uncertainty and hallucinations. We hypothesize that attention mechanisms encode signals indicative of contextual utilization, validated through probing analysis. Based on these insights, we propose Dynamic Attention-Guided Context Decoding (DAGCD), a lightweight framework that integrates attention distributions and uncertainty signals in a single-pass decoding process. Experiments across QA datasets demonstrate DAGCD&#39;s effectiveness, achieving significant improvements in faithfulness and robustness while maintaining computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01059v1-abstract-full').style.display = 'none'; document.getElementById('2501.01059v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
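One plausible reading of how attention and uncertainty signals could be combined in a single decoding pass is sketched below; this is not the DAGCD algorithm itself, and every tensor name, shape, and the combination rule are assumptions made for illustration.
<pre><code>
import torch
import torch.nn.functional as F

def adjust_logits(logits, attn_to_context, context_token_ids, alpha=1.0):
    # logits: (V,) next-token logits; attn_to_context: (num_ctx,) attention
    # weights over context tokens; context_token_ids: (num_ctx,) their ids.
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * probs.clamp_min(1e-9).log()).sum()
    uncertainty = entropy / torch.log(torch.tensor(float(logits.numel())))
    copy_scores = torch.zeros_like(logits)
    copy_scores.scatter_add_(0, context_token_ids, attn_to_context)
    # The more uncertain the model is, the more the context-attended
    # tokens are boosted before sampling.
    return logits + alpha * uncertainty * copy_scores

logits = torch.randn(32000)
ctx_ids = torch.tensor([11, 42, 77])
attn = torch.tensor([0.5, 0.3, 0.2])
print(adjust_logits(logits, attn, ctx_ids).shape)  # torch.Size([32000])
</code></pre>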
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00346">arXiv:2501.00346</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00346">pdf</a>, <a href="https://arxiv.org/format/2501.00346">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CNC: Cross-modal Normality Constraint for Unsupervised Multi-class Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaolei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+H">Huihui Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+E+G">Eng Gee Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jimin Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00346v1-abstract-short" style="display: inline;"> Existing unsupervised distillation-based methods rely on the differences between encoded and decoded features to locate abnormal regions in test images. However, the decoder trained only on normal samples still reconstructs abnormal patch features well, degrading performance. This issue is particularly pronounced in unsupervised multi-class anomaly detection tasks. We attribute this behavior to ov&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00346v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00346v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00346v1-abstract-full" style="display: none;"> Existing unsupervised distillation-based methods rely on the differences between encoded and decoded features to locate abnormal regions in test images. However, the decoder trained only on normal samples still reconstructs abnormal patch features well, degrading performance. This issue is particularly pronounced in unsupervised multi-class anomaly detection tasks. We attribute this behavior to over-generalization(OG) of decoder: the significantly increasing diversity of patch patterns in multi-class training enhances the model generalization on normal patches, but also inadvertently broadens its generalization to abnormal patches. To mitigate OG, we propose a novel approach that leverages class-agnostic learnable prompts to capture common textual normality across various visual patterns, and then apply them to guide the decoded features towards a normal textual representation, suppressing over-generalization of the decoder on abnormal patterns. To further improve performance, we also introduce a gated mixture-of-experts module to specialize in handling diverse patch patterns and reduce mutual interference between them in multi-class training. Our method achieves competitive performance on the MVTec AD and VisA datasets, demonstrating its effectiveness. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00346v1-abstract-full').style.display = 'none'; document.getElementById('2501.00346v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00339">arXiv:2501.00339</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00339">pdf</a>, <a href="https://arxiv.org/format/2501.00339">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Layer Removal: Preserving Critical Components with Task-Aware Singular Value Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kainan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Ning Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhitao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shaojun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jing Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00339v1-abstract-short" style="display: inline;"> Layer removal has emerged as a promising approach for compressing large language models (LLMs) by leveraging redundancy within layers to reduce model size and accelerate inference. However, this technique often compromises internal consistency, leading to performance degradation and instability, with varying impacts across different model architectures. In this work, we propose Taco-SVD, a task-aw&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00339v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00339v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00339v1-abstract-full" style="display: none;"> Layer removal has emerged as a promising approach for compressing large language models (LLMs) by leveraging redundancy within layers to reduce model size and accelerate inference. However, this technique often compromises internal consistency, leading to performance degradation and instability, with varying impacts across different model architectures. In this work, we propose Taco-SVD, a task-aware framework that retains task-critical singular value directions, preserving internal consistency while enabling efficient compression. Unlike direct layer removal, Taco-SVD preserves task-critical transformations to mitigate performance degradation. 
By leveraging gradient-based attribution methods, Taco-SVD aligns singular values with downstream task objectives. Extensive evaluations demonstrate that Taco-SVD outperforms existing methods in perplexity and task performance across different architectures while ensuring minimal computational overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00339v1-abstract-full').style.display = 'none'; document.getElementById('2501.00339v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20646">arXiv:2412.20646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20646">pdf</a>, <a href="https://arxiv.org/format/2412.20646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Visual Representation for Text-based Person Searching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+W">Wei Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Ming Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuxia Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jiafeng Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Diping Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huangqun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+L">Ling Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weifeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20646v1-abstract-short" style="display: inline;"> Text-based person search aims to retrieve the matched pedestrians from a large-scale image database according to the text description. The core difficulty of this task is how to extract effective details from pedestrian images and texts, and achieve cross-modal alignment in a common latent space. Prior works adopt image and text encoders pre-trained on unimodal data to extract global and local fea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20646v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20646v1-abstract-full" style="display: none;"> Text-based person search aims to retrieve the matched pedestrians from a large-scale image database according to the text description. The core difficulty of this task is how to extract effective details from pedestrian images and texts, and achieve cross-modal alignment in a common latent space. Prior works adopt image and text encoders pre-trained on unimodal data to extract global and local features from image and text respectively, and then global-local alignment is achieved explicitly. 
However, these approaches still lack the ability of understanding visual details, and the retrieval accuracy is still limited by identity confusion. In order to alleviate the above problems, we rethink the importance of visual features for text-based person search, and propose VFE-TPS, a Visual Feature Enhanced Text-based Person Search model. It introduces a pre-trained multimodal backbone CLIP to learn basic multimodal features and constructs Text Guided Masked Image Modeling task to enhance the model&#39;s ability of learning local visual details without explicit annotation. In addition, we design Identity Supervised Global Visual Feature Calibration task to guide the model learn identity-aware global visual features. The key finding of our study is that, with the help of our proposed auxiliary tasks, the knowledge embedded in the pre-trained CLIP model can be successfully adapted to text-based person search task, and the model&#39;s visual understanding ability is significantly enhanced. Experimental results on three benchmarks demonstrate that our proposed model exceeds the existing approaches, and the Rank-1 accuracy is significantly improved with a notable margin of about $1\%\sim9\%$. Our code can be found at https://github.com/zhangweifeng1218/VFE_TPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20646v1-abstract-full').style.display = 'none'; document.getElementById('2412.20646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20439">arXiv:2412.20439</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20439">pdf</a>, <a href="https://arxiv.org/format/2412.20439">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image Augmentation Agent for Weakly Supervised Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wangyu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+X">Xianglin Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Siqi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhenhong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiaowei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jimin Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20439v1-abstract-short" style="display: inline;"> Weakly-supervised semantic segmentation (WSSS) has achieved remarkable progress using only image-level labels. However, most existing WSSS methods focus on designing new network structures and loss functions to generate more accurate dense labels, overlooking the limitations imposed by fixed datasets, which can constrain performance improvements. 
We argue that more diverse trainable images provide&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20439v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20439v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20439v1-abstract-full" style="display: none;"> Weakly-supervised semantic segmentation (WSSS) has achieved remarkable progress using only image-level labels. However, most existing WSSS methods focus on designing new network structures and loss functions to generate more accurate dense labels, overlooking the limitations imposed by fixed datasets, which can constrain performance improvements. We argue that more diverse trainable images provide WSSS with richer information and help the model understand more comprehensive semantic patterns. Therefore, in this paper, we introduce a novel approach called Image Augmentation Agent (IAA), which shows that it is possible to enhance WSSS from the data generation perspective. IAA mainly designs an augmentation agent that leverages large language models (LLMs) and diffusion models to automatically generate additional images for WSSS. In practice, to address the instability in prompt generation by LLMs, we develop a prompt self-refinement mechanism. It allows LLMs to re-evaluate the rationality of generated prompts to produce more coherent prompts. Additionally, we insert an online filter into the diffusion generation process to dynamically ensure the quality and balance of generated images. Experimental results show that our method significantly surpasses state-of-the-art WSSS approaches on the PASCAL VOC 2012 and MS COCO 2014 datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20439v1-abstract-full').style.display = 'none'; document.getElementById('2412.20439v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18842">arXiv:2412.18842</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18842">pdf</a>, <a href="https://arxiv.org/format/2412.18842">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Context-Based Semantic-Aware Alignment for Semi-Supervised Multi-Label Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Heng-Bo Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+M">Ming-Kun Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jia-Hao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Sheng-Jun Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18842v1-abstract-short" style="display: inline;"> Due to the lack of extensive precisely-annotated multi-label data in the real world, semi-supervised multi-label learning (SSMLL) has gradually gained attention. Abundant knowledge embedded in vision-language models (VLMs) pre-trained on large-scale image-text pairs could alleviate the challenge of limited labeled data under the SSMLL setting. Although existing methods based on fine-tuning VLMs have achieved&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18842v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18842v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18842v1-abstract-full" style="display: none;"> Due to the lack of extensive precisely-annotated multi-label data in the real world, semi-supervised multi-label learning (SSMLL) has gradually gained attention. Abundant knowledge embedded in vision-language models (VLMs) pre-trained on large-scale image-text pairs could alleviate the challenge of limited labeled data under the SSMLL setting. Although existing methods based on fine-tuning VLMs have achieved advances in weakly-supervised multi-label learning, they fail to fully leverage the information from labeled data to enhance the learning of unlabeled data. In this paper, we propose a context-based semantic-aware alignment method to solve the SSMLL problem by leveraging the knowledge of VLMs. To address the challenge of handling multiple semantics within an image, we introduce a novel framework design to extract label-specific image features. This design allows us to achieve a more compact alignment between text features and label-specific image features, leading the model to generate high-quality pseudo-labels. To provide the model with a comprehensive understanding of the image, we design a semi-supervised context identification auxiliary task to enhance the feature representation by capturing co-occurrence information. Extensive experiments on multiple benchmark datasets demonstrate the effectiveness of our proposed method.
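A rough sketch of how aligned label-specific image features and label text embeddings could be turned into multi-label pseudo-labels is shown below; the feature shapes, temperature, and threshold are illustrative assumptions, not the paper's values.
<pre><code>
import torch
import torch.nn.functional as F

def pseudo_labels(img_feats, txt_feats, tau=0.07, threshold=0.5):
    # img_feats: (L, D) one image feature per label; txt_feats: (L, D)
    # label text embeddings. Shapes, tau and threshold are assumptions.
    img = F.normalize(img_feats, dim=-1)
    txt = F.normalize(txt_feats, dim=-1)
    scores = torch.sigmoid((img * txt).sum(-1) / tau)  # per-label score
    return scores.ge(threshold).float(), scores

labels, scores = pseudo_labels(torch.randn(20, 512), torch.randn(20, 512))
print(labels.shape, scores.shape)  # torch.Size([20]) torch.Size([20])
</code></pre>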
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18842v1-abstract-full').style.display = 'none'; document.getElementById('2412.18842v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18214">arXiv:2412.18214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18214">pdf</a>, <a href="https://arxiv.org/format/2412.18214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LGRS.2024.3493249">10.1109/LGRS.2024.3493249 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SDM-Car: A Dataset for Small and Dim Moving Vehicles Detection in Satellite Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+T">Tao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+L">Liang Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jing Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18214v1-abstract-short" style="display: inline;"> Vehicle detection and tracking in satellite video is essential in remote sensing (RS) applications. However, upon the statistical analysis of existing datasets, we find that the dim vehicles with low radiation intensity and limited contrast against the background are rarely annotated, which leads to the poor effect of existing approaches in detecting moving vehicles under low radiation conditions.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18214v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18214v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18214v1-abstract-full" style="display: none;"> Vehicle detection and tracking in satellite video is essential in remote sensing (RS) applications. However, upon the statistical analysis of existing datasets, we find that the dim vehicles with low radiation intensity and limited contrast against the background are rarely annotated, which leads to the poor effect of existing approaches in detecting moving vehicles under low radiation conditions. In this paper, we address the challenge by building a \textbf{S}mall and \textbf{D}im \textbf{M}oving Cars (SDM-Car) dataset with a multitude of annotations for dim vehicles in satellite videos, which is collected by the Luojia 3-01 satellite and comprises 99 high-quality videos. 
Furthermore, we propose a method based on image enhancement and attention mechanisms to improve the detection accuracy of dim vehicles, serving as a benchmark for evaluating the dataset. Finally, we assess the performance of several representative methods on SDM-Car and present insightful findings. The dataset is openly available at https://github.com/TanedaM/SDM-Car. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18214v1-abstract-full').style.display = 'none'; document.getElementById('2412.18214v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 7 figures, IEEE Geoscience and Remote Sensing Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16849">arXiv:2412.16849</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16849">pdf</a>, <a href="https://arxiv.org/format/2412.16849">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenRFT: Adapting Reasoning Foundation Model for Domain-specific Tasks with Reinforcement Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+J">Jiangming Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jinlin Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16849v1-abstract-short" style="display: inline;"> OpenAI&#39;s recent introduction of Reinforcement Fine-Tuning (RFT) showcases the potential of reasoning foundation model and offers a new paradigm for fine-tuning beyond simple pattern imitation. This technical report presents \emph{OpenRFT}, our attempt to fine-tune generalist reasoning models for domain-specific tasks under the same settings as RFT. OpenRFT addresses two key challenges of lacking r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16849v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16849v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16849v1-abstract-full" style="display: none;"> OpenAI&#39;s recent introduction of Reinforcement Fine-Tuning (RFT) showcases the potential of reasoning foundation model and offers a new paradigm for fine-tuning beyond simple pattern imitation. This technical report presents \emph{OpenRFT}, our attempt to fine-tune generalist reasoning models for domain-specific tasks under the same settings as RFT. 
OpenRFT addresses two key challenges of lacking reasoning step data and the limited quantity of training samples, by leveraging the domain-specific samples in three ways: question augmentation, synthesizing reasoning-process data, and few-shot ICL. The evaluation is conducted on SciKnowEval, where OpenRFT achieves notable performance gains with only $100$ domain-specific samples for each task. More experimental results will be updated continuously in later versions. Source codes, datasets, and models are disclosed at: https://github.com/ADaM-BJTU/OpenRFT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16849v1-abstract-full').style.display = 'none'; document.getElementById('2412.16849v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16214">arXiv:2412.16214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16214">pdf</a>, <a href="https://arxiv.org/format/2412.16214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FairTP: A Prolonged Fairness Framework for Traffic Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jiangnan Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiaxing Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Senzhang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiannong Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16214v1-abstract-short" style="display: inline;"> Traffic prediction plays a crucial role in intelligent transportation systems. Existing approaches primarily focus on improving overall accuracy, often neglecting a critical issue: whether predictive models lead to biased decisions by transportation authorities. In practice, the uneven deployment of traffic sensors across urban areas results in imbalanced data, causing prediction models to perform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16214v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16214v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16214v1-abstract-full" style="display: none;"> Traffic prediction plays a crucial role in intelligent transportation systems. Existing approaches primarily focus on improving overall accuracy, often neglecting a critical issue: whether predictive models lead to biased decisions by transportation authorities. In practice, the uneven deployment of traffic sensors across urban areas results in imbalanced data, causing prediction models to perform poorly in certain regions and leading to unfair decision-making. This imbalance ultimately harms the equity and quality of life for residents. 
Moreover, current fairness-aware machine learning models only ensure fairness at specific time points, failing to maintain fairness over extended periods. As traffic conditions change, such static fairness approaches become ineffective. To address this gap, we propose FairTP, a framework for prolonged fair traffic prediction. We introduce two new fairness definitions tailored for dynamic traffic scenarios. Fairness in traffic prediction is not static; it varies over time and across regions. Each sensor or urban area can alternate between two states: &#34;sacrifice&#34; (low prediction accuracy) and &#34;benefit&#34; (high prediction accuracy). Prolonged fairness is achieved when the overall states of sensors remain similar over a given period. We define two types of fairness: region-based static fairness and sensor-based dynamic fairness. To implement this, FairTP incorporates a state identification module to classify sensors&#39; states as either &#34;sacrifice&#34; or &#34;benefit,&#34; enabling prolonged fairness-aware predictions. Additionally, we introduce a state-guided balanced sampling strategy to further enhance fairness, addressing performance disparities among regions with uneven sensor distributions. Extensive experiments on two real-world datasets demonstrate that FairTP significantly improves prediction fairness while minimizing accuracy degradation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16214v1-abstract-full').style.display = 'none'; document.getElementById('2412.16214v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15310">arXiv:2412.15310</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15310">pdf</a>, <a href="https://arxiv.org/format/2412.15310">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MRWeb: An Exploration of Generating Multi-Page Resource-Aware Web Code from UI Designs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Y">Yuxuan Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yi Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jingyu Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yintong Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+M+R">Michael R. Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15310v1-abstract-short" style="display: inline;"> Multi-page websites dominate modern web development. 

arXiv:2412.15310 (https://arxiv.org/abs/2412.15310) [pdf, other] cs.SE, cs.AI, cs.IR
MRWeb: An Exploration of Generating Multi-Page Resource-Aware Web Code from UI Designs
Authors: Yuxuan Wan, Yi Dong, Jingyu Xiao, Yintong Huo, Wenxuan Wang, Michael R. Lyu
Abstract: Multi-page websites dominate modern web development. However, existing design-to-code methods rely on simplified assumptions, limiting them to single-page, self-contained webpages without external resource connections. To address this gap, we introduce the Multi-Page Resource-Aware Webpage (MRWeb) generation task, which transforms UI designs into multi-page, functional web UIs with internal/external navigation, image loading, and backend routing. We propose a novel resource list data structure to track resources, links, and design components. Our study applies existing methods to the MRWeb problem using a newly curated dataset of 500 websites (300 synthetic, 200 real-world). Specifically, we identify the best metric to evaluate the similarity of the web UI, assess the impact of the resource list on MRWeb generation, analyze MLLM limitations, and evaluate the effectiveness of the MRWeb tool in real-world workflows. The results show that resource lists boost navigation functionality from 0% to 66%-80% while facilitating visual similarity. Our proposed metrics and evaluation framework provide new insights into MLLM performance on MRWeb tasks. We release the MRWeb tool, dataset, and evaluation framework to promote further research.
Submitted 19 December, 2024; originally announced December 2024.
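
One plausible shape for the resource-list data structure described above is sketched below. The field names (`kind`, `element_id`, `target`) are assumptions made for illustration and are not taken from the MRWeb release.

```python
# Illustrative sketch only: a minimal "resource list" record tracking the links,
# images, and backend routes that a generated multi-page UI needs to wire up.
from dataclasses import dataclass, field

@dataclass
class ResourceEntry:
    kind: str          # "internal_link" | "external_link" | "image" | "route"
    element_id: str    # design component the resource is attached to
    target: str        # href, image URL, or backend endpoint

@dataclass
class ResourceList:
    page: str
    entries: list[ResourceEntry] = field(default_factory=list)

    def for_kind(self, kind: str) -> list[ResourceEntry]:
        return [e for e in self.entries if e.kind == kind]

home = ResourceList(page="index.html", entries=[
    ResourceEntry("internal_link", "nav-about", "about.html"),
    ResourceEntry("image", "hero-img", "https://example.com/hero.png"),
    ResourceEntry("route", "login-form", "/api/login"),
])
print([e.target for e in home.for_kind("internal_link")])
```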

arXiv:2412.15199 (https://arxiv.org/abs/2412.15199) [pdf, other] cs.CV, cs.LG, cs.RO
LiDAR-RT: Gaussian-based Ray Tracing for Dynamic LiDAR Re-simulation
Authors: Chenxu Zhou, Lvchang Fu, Sida Peng, Yunzhi Yan, Zhanhua Zhang, Yong Chen, Jiazhi Xia, Xiaowei Zhou
Abstract: This paper targets the challenge of real-time LiDAR re-simulation in dynamic driving scenarios. Recent approaches utilize neural radiance fields combined with the physical modeling of LiDAR sensors to achieve high-fidelity re-simulation results. Unfortunately, these methods face limitations due to high computational demands in large-scale scenes and cannot perform real-time LiDAR rendering. To overcome these constraints, we propose LiDAR-RT, a novel framework that supports real-time, physically accurate LiDAR re-simulation for driving scenes. Our primary contribution is the development of an efficient and effective rendering pipeline, which integrates Gaussian primitives and hardware-accelerated ray tracing technology. Specifically, we model the physical properties of LiDAR sensors using Gaussian primitives with learnable parameters and incorporate scene graphs to handle scene dynamics. Building upon this scene representation, our framework first constructs a bounding volume hierarchy (BVH), then casts rays for each pixel and generates novel LiDAR views through a differentiable rendering algorithm. Importantly, our framework supports realistic rendering with flexible scene editing operations and various sensor configurations. Extensive experiments across multiple public benchmarks demonstrate that our method outperforms state-of-the-art methods in terms of rendering quality and efficiency. Our project page is at https://zju3dv.github.io/lidar-rt.
Submitted 19 December, 2024; originally announced December 2024.
Comments: Project page: https://zju3dv.github.io/lidar-rt

arXiv:2412.13823 (https://arxiv.org/abs/2412.13823) [pdf, other] cs.CV
Prompt Categories Cluster for Weakly Supervised Semantic Segmentation
Authors: Wangyu Wu, Xianglin Qiu, Siqi Song, Xiaowei Huang, Fei Ma, Jimin Xiao
Abstract: Weakly Supervised Semantic Segmentation (WSSS), which leverages image-level labels, has garnered significant attention due to its cost-effectiveness. Previous methods mainly strengthen inter-class differences to avoid class semantic ambiguity, which may lead to erroneous activation. However, they overlook the positive function of some shared information between similar classes: categories within the same cluster share some similar features, and allowing the model to recognize these features can further relieve the semantic ambiguity between these classes. To effectively identify and utilize this shared information, in this paper we introduce a novel WSSS framework called Prompt Categories Clustering (PCC). Specifically, we explore the ability of Large Language Models (LLMs) to derive category clusters through prompts. These clusters effectively represent the intrinsic relationships between categories. By integrating this relational information into the training network, our model is able to better learn the hidden connections between categories. Experimental results demonstrate the effectiveness of our approach, showing its ability to enhance performance on the PASCAL VOC 2012 dataset and surpass existing state-of-the-art methods in WSSS.
Submitted 18 December, 2024; originally announced December 2024.
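
A minimal sketch of prompting an LLM for category clusters, in the spirit of the abstract above, might look like this. `ask_llm` is a hypothetical stand-in for whatever model client is available, and the prompt wording is not from the paper.

```python
# Illustrative sketch only: ask a language model to group dataset categories
# into clusters of visually/semantically similar classes.
import json

def ask_llm(prompt: str) -> str:
    # Placeholder: a real implementation would call an LLM here. A canned
    # answer keeps the sketch runnable end to end.
    return json.dumps({"animals": ["cat", "dog", "horse"],
                       "vehicles": ["bus", "car", "train"]})

def cluster_categories(categories: list[str]) -> dict[str, list[str]]:
    prompt = ("Group the following categories into clusters of classes that "
              "share visual features. Answer as JSON mapping cluster name to "
              "categories: " + ", ".join(categories))
    return json.loads(ask_llm(prompt))

clusters = cluster_categories(["cat", "dog", "horse", "bus", "car", "train"])
for name, members in clusters.items():
    print(name, "->", members)
```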

arXiv:2412.10853 (https://arxiv.org/abs/2412.10853) [pdf, other] cs.CV
SEW: Self-calibration Enhanced Whole Slide Pathology Image Analysis
Authors: Haoming Luo, Xiaotian Yu, Shengxuming Zhang, Jiabin Xia, Yang Jian, Yuning Sun, Liang Xue, Mingli Song, Jing Zhang, Xiuming Zhang, Zunlei Feng
Abstract: Pathology images are considered the "gold standard" for cancer diagnosis and treatment, with gigapixel images providing extensive tissue and cellular information. Existing methods fail to efficiently extract both global structural and local detail features for comprehensive pathology image analysis. To address these limitations, we propose a self-calibration enhanced framework for whole slide pathology image analysis, comprising three components: a global branch, a focus predictor, and a detailed branch. The global branch initially classifies using the pathological thumbnail, while the focus predictor identifies relevant regions for classification based on the last-layer features of the global branch. The detailed extraction branch then assesses whether the magnified regions correspond to the lesion area. Finally, a feature consistency constraint between the global and detail branches ensures that the global branch focuses on the appropriate region and extracts sufficient discriminative features for final identification. These focused discriminative features prove invaluable for uncovering novel prognostic tumor markers from the perspective of feature cluster uniqueness and tissue spatial distribution. Extensive experimental results demonstrate that the proposed framework can rapidly deliver accurate and explainable results for pathological grading and prognosis tasks.
Submitted 14 February, 2025; v1 submitted 14 December, 2024; originally announced December 2024.
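
The feature-consistency constraint between the global and detail branches could, in principle, be as simple as the sketch below. This is illustrative only, assuming pooled cosine similarity; the paper's actual formulation may differ.

```python
# Illustrative sketch only: a feature-consistency term that nudges the global
# branch's pooled features toward those of the detail branch on the magnified
# region.
import torch
import torch.nn.functional as F

def consistency_loss(global_feats: torch.Tensor, detail_feats: torch.Tensor) -> torch.Tensor:
    # Both inputs: (batch, channels, H, W) feature maps; pool to vectors and
    # penalise their cosine dissimilarity.
    g = F.adaptive_avg_pool2d(global_feats, 1).flatten(1)
    d = F.adaptive_avg_pool2d(detail_feats, 1).flatten(1)
    return (1.0 - F.cosine_similarity(g, d, dim=1)).mean()

g = torch.randn(2, 256, 16, 16, requires_grad=True)
d = torch.randn(2, 256, 8, 8)
loss = consistency_loss(g, d)
loss.backward()
print(float(loss))
```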

arXiv:2412.09899 (https://arxiv.org/abs/2412.09899) [pdf, other] cs.LG
TTAQ: Towards Stable Post-training Quantization in Continuous Domain Adaptation
Authors: Junrui Xiao, Zhikai Li, Lianwei Yang, Yiduo Mei, Qingyi Gu
Abstract: Post-training quantization (PTQ) reduces excessive hardware cost by quantizing full-precision models into lower-bit representations on a tiny calibration set, without retraining. Despite the remarkable progress made through recent efforts, traditional PTQ methods typically encounter failure in dynamic and ever-changing real-world scenarios involving unpredictable data streams and continual domain shifts, which pose greater challenges. In this paper, we propose a novel and stable quantization process for test-time adaptation (TTA), dubbed TTAQ, to address the performance degradation of traditional PTQ in dynamically evolving test domains. To tackle domain shifts in the quantizer, TTAQ proposes Perturbation Error Mitigation (PEM) and Perturbation Consistency Reconstruction (PCR). Specifically, PEM analyzes the error propagation and devises a weight regularization scheme to mitigate the impact of input perturbations. On the other hand, PCR introduces consistency learning to ensure that quantized models provide stable predictions for the same sample. Furthermore, we introduce an Adaptive Balanced Loss (ABL) to adjust the logits by taking advantage of the frequency and complexity of each class, which can effectively address the class imbalance caused by unpredictable data streams during optimization. Extensive experiments are conducted on multiple datasets with generic TTA methods, proving that TTAQ can outperform existing baselines and encouragingly improve the accuracy of low-bit PTQ models in continually changing test domains. For instance, TTAQ decreases the mean error of 2-bit models on the ImageNet-C dataset by an impressive 10.1%.
Submitted 13 December, 2024; originally announced December 2024.
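
To make the class-rebalancing idea behind the Adaptive Balanced Loss concrete, here is a hedged sketch that re-weights a cross-entropy loss by running class frequencies observed in the test stream. The `FrequencyWeightedCE` class and the exact weighting rule are assumptions for illustration, not TTAQ's implementation.

```python
# Illustrative sketch only: weight the classification loss by running class
# frequencies seen in the (pseudo-labelled) test stream, so rare classes are
# not drowned out by frequent ones.
import torch
import torch.nn.functional as F

class FrequencyWeightedCE:
    def __init__(self, num_classes: int, momentum: float = 0.9):
        self.freq = torch.full((num_classes,), 1.0 / num_classes)
        self.momentum = momentum

    def __call__(self, logits: torch.Tensor, pseudo_labels: torch.Tensor) -> torch.Tensor:
        # Update the running frequency estimate from the current batch.
        counts = torch.bincount(pseudo_labels, minlength=self.freq.numel()).float()
        batch_freq = counts / counts.sum().clamp(min=1.0)
        self.freq = self.momentum * self.freq + (1 - self.momentum) * batch_freq
        # Rarer classes get larger weights; normalise so weights average to 1.
        weights = 1.0 / (self.freq + 1e-6)
        weights = weights / weights.sum() * self.freq.numel()
        return F.cross_entropy(logits, pseudo_labels, weight=weights)

loss_fn = FrequencyWeightedCE(num_classes=10)
logits = torch.randn(32, 10)
labels = torch.randint(0, 10, (32,))
print(float(loss_fn(logits, labels)))
```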

arXiv:2412.09624 (https://arxiv.org/abs/2412.09624) [pdf, other] cs.CV, cs.RO
GenEx: Generating an Explorable World
Authors: Taiming Lu, Tianmin Shu, Junfei Xiao, Luoxin Ye, Jiahao Wang, Cheng Peng, Chen Wei, Daniel Khashabi, Rama Chellappa, Alan Yuille, Jieneng Chen
Abstract: Understanding, navigating, and exploring the 3D physical real world has long been a central challenge in the development of artificial intelligence. In this work, we take a step toward this goal by introducing GenEx, a system capable of planning complex embodied world exploration, guided by its generative imagination that forms priors (expectations) about the surrounding environments. GenEx generates an entire 3D-consistent imaginative environment from as little as a single RGB image, bringing it to life through panoramic video streams. Leveraging scalable 3D world data curated from Unreal Engine, our generative model is grounded in the physical world. It captures a continuous 360-degree environment with little effort, offering a boundless landscape for AI agents to explore and interact with. GenEx achieves high-quality world generation, robust loop consistency over long trajectories, and demonstrates strong 3D capabilities such as consistency and active 3D mapping. Powered by generative imagination of the world, GPT-assisted agents are equipped to perform complex embodied tasks, including both goal-agnostic exploration and goal-driven navigation. These agents utilize predictive expectations regarding unseen parts of the physical world to refine their beliefs, simulate different outcomes based on potential decisions, and make more informed choices. In summary, we demonstrate that GenEx provides a transformative platform for advancing embodied AI in imaginative spaces and brings potential for extending these capabilities to real-world exploration.
Submitted 20 January, 2025; v1 submitted 12 December, 2024; originally announced December 2024.
Comments: Website: GenEx.world

arXiv:2412.09324 (https://arxiv.org/abs/2412.09324) [pdf, other] cs.CV
Are Conditional Latent Diffusion Models Effective for Image Restoration?
Authors: Yunchen Yuan, Junyuan Xiao, Xinjie Li
Abstract: Recent advancements in image restoration increasingly employ conditional latent diffusion models (CLDMs). While these models have demonstrated notable performance improvements in recent years, this work questions their suitability for IR tasks. CLDMs excel in capturing high-level semantic correlations, making them effective for tasks like text-to-image generation with spatial conditioning. However, in IR, where the goal is to enhance image perceptual quality, these models face difficulty in modeling the relationship between degraded images and ground-truth images using a low-level representation. To support our claims, we compare state-of-the-art CLDMs with traditional image restoration models through extensive experiments. Results reveal that despite the scaling advantages of CLDMs, they suffer from high distortion and semantic deviation, especially in cases with minimal degradation, where traditional methods outperform them. Additionally, we perform empirical studies to examine the impact of various CLDM design elements on their restoration performance. We hope this finding inspires a reexamination of current CLDM-based IR solutions, opening up more opportunities in this field.
Submitted 12 December, 2024; v1 submitted 12 December, 2024; originally announced December 2024.

arXiv:2412.08161 (https://arxiv.org/abs/2412.08161) [pdf, other] cs.CV, cs.LG, cs.MM, cs.SD, eess.AS
Collaborative Hybrid Propagator for Temporal Misalignment in Audio-Visual Segmentation
Authors: Kexin Li, Zongxin Yang, Yi Yang, Jun Xiao
Abstract: Audio-visual video segmentation (AVVS) aims to generate pixel-level maps of sound-producing objects that accurately align with the corresponding audio. However, existing methods often face temporal misalignment, where audio cues and segmentation results are not temporally coordinated. Audio provides two critical pieces of information: i) target object-level details and ii) the timing of when objects start and stop producing sounds. Current methods focus more on object-level information but neglect the boundaries of audio semantic changes, leading to temporal misalignment. To address this issue, we propose a Collaborative Hybrid Propagator framework (Co-Prop). This framework includes two main steps: Preliminary Audio Boundary Anchoring and Frame-by-Frame Audio-Insert Propagation. To anchor the audio boundary, we employ retrieval-assist prompts with Qwen large language models to identify control points of audio semantic changes. These control points split the audio into semantically consistent portions. After obtaining the control-point lists, we propose the Audio Insertion Propagator to process each audio portion using a frame-by-frame audio-insertion propagation and matching approach. We curated a compact dataset comprising diverse source conversion cases and devised a metric to assess alignment rates. Compared to traditional simultaneous processing methods, our approach reduces memory requirements and facilitates frame alignment. Experimental results demonstrate the effectiveness of our approach across three datasets and two backbones. Furthermore, our method can be integrated with existing AVVS approaches, offering plug-and-play functionality to enhance their performance.
Submitted 11 December, 2024; originally announced December 2024.
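
The boundary-anchoring step described above amounts to splitting the audio at LLM-identified control points and mapping each semantically consistent portion onto the video frames it should guide. A minimal sketch of that bookkeeping (not the authors' code; function names are assumptions) follows.

```python
# Illustrative sketch only: split an audio track at control points (timestamps
# of semantic change) and map each portion onto the frame indices it covers.
def split_by_control_points(duration_s: float, control_points: list[float]) -> list[tuple[float, float]]:
    bounds = [0.0] + sorted(t for t in control_points if 0.0 < t < duration_s) + [duration_s]
    return list(zip(bounds[:-1], bounds[1:]))

def frames_for_segment(segment: tuple[float, float], fps: float) -> range:
    start, end = segment
    return range(int(start * fps), int(end * fps))

segments = split_by_control_points(duration_s=10.0, control_points=[3.2, 7.5])
for seg in segments:
    frames = frames_for_segment(seg, fps=25)
    print(f"audio {seg[0]:.1f}-{seg[1]:.1f}s -> frames {frames.start}..{frames.stop - 1}")
```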

arXiv:2412.03968 (https://arxiv.org/abs/2412.03968) [pdf, other] cs.CV
Exact: Exploring Space-Time Perceptive Clues for Weakly Supervised Satellite Image Time Series Semantic Segmentation
Authors: Hao Zhu, Yan Zhu, Jiayu Xiao, Tianxiang Xiao, Yike Ma, Yucheng Zhang, Feng Dai
Abstract: Automated crop mapping through Satellite Image Time Series (SITS) has emerged as a crucial avenue for agricultural monitoring and management. However, due to the low resolution and unclear parcel boundaries, annotating pixel-level masks is exceptionally complex and time-consuming in SITS. This paper embraces the weakly supervised paradigm (i.e., only image-level categories available) to liberate the crop mapping task from the exhaustive annotation burden. The unique characteristics of SITS give rise to several challenges in weakly supervised learning: (1) noise perturbation from spatially neighboring regions, and (2) erroneous semantic bias from anomalous temporal periods. To address the above difficulties, we propose a novel method, termed exploring space-time perceptive clues (Exact). First, we introduce a set of spatial clues to explicitly capture the representative patterns of different crops from the most class-relative regions. Besides, we leverage the temporal-to-class interaction of the model to emphasize the contributions of pivotal clips, thereby enhancing the model's perception of crop regions. Built upon the space-time perceptive clues, we derive clue-based CAMs to effectively supervise the SITS segmentation network. Our method demonstrates impressive performance on various SITS benchmarks. Remarkably, the segmentation network trained on Exact-generated masks achieves 95% of its fully supervised performance, showing the bright promise of the weakly supervised paradigm in the crop mapping scenario. Our code will be publicly available.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Under review. Code will be available at https://github.com/MiSsU-HH/Exact
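
As a rough reading of the "clue-based CAM" idea, the sketch below weights per-clip feature maps by their class-relevance scores before forming a class activation map. The function name and tensor shapes are illustrative assumptions rather than the paper's method.

```python
# Illustrative sketch only: form a class activation map for a satellite image
# time series by emphasising temporally pivotal clips before projecting onto a
# target class.
import torch

def clue_based_cam(features: torch.Tensor, clip_scores: torch.Tensor,
                   class_weights: torch.Tensor) -> torch.Tensor:
    # features:      (T, C, H, W) per-clip feature maps
    # clip_scores:   (T,)  relevance of each temporal clip to the target class
    # class_weights: (C,)  classifier weights for the target class
    weighted = features * clip_scores.view(-1, 1, 1, 1)   # emphasise pivotal clips
    fused = weighted.sum(dim=0)                           # (C, H, W)
    cam = torch.einsum("chw,c->hw", fused, class_weights)
    return torch.relu(cam) / cam.abs().max().clamp(min=1e-6)

T, C, H, W = 6, 32, 24, 24
cam = clue_based_cam(torch.randn(T, C, H, W),
                     torch.softmax(torch.randn(T), dim=0),
                     torch.randn(C))
print(cam.shape)  # torch.Size([24, 24])
```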

arXiv:2412.03568 (https://arxiv.org/abs/2412.03568) [pdf, other] cs.AI
The Matrix: Infinite-Horizon World Generation with Real-Time Moving Control
Authors: Ruili Feng, Han Zhang, Zhantao Yang, Jie Xiao, Zhilei Shu, Zhiheng Liu, Andy Zheng, Yukun Huang, Yu Liu, Hongyang Zhang
Abstract: We present The Matrix, the first foundational realistic world simulator capable of generating continuous 720p high-fidelity real-scene video streams with real-time, responsive control in both first- and third-person perspectives, enabling immersive exploration of richly dynamic environments. Trained on limited supervised data from AAA games like Forza Horizon 5 and Cyberpunk 2077, complemented by large-scale unsupervised footage from real-world settings like Tokyo streets, The Matrix allows users to traverse diverse terrains -- deserts, grasslands, water bodies, and urban landscapes -- in continuous, uncut hour-long sequences. Operating at 16 FPS, the system supports real-time interactivity and demonstrates zero-shot generalization, translating virtual game environments to real-world contexts where collecting continuous movement data is often infeasible. For example, The Matrix can simulate a BMW X3 driving through an office setting -- an environment present in neither gaming data nor real-world sources. This approach showcases the potential of AAA game data to advance robust world models, bridging the gap between simulations and real-world applications in scenarios with limited data.
Submitted 4 December, 2024; originally announced December 2024.

arXiv:2412.02960 (https://arxiv.org/abs/2412.02960) [pdf, other] cs.CV
Semantic Segmentation Prior for Diffusion-Based Real-World Super-Resolution
Authors: Jiahua Xiao, Jiawei Zhang, Dongqing Zou, Xiaodan Zhang, Jimmy Ren, Xing Wei
Abstract: Real-world image super-resolution (Real-ISR) has achieved a remarkable leap by leveraging large-scale text-to-image models, enabling realistic image restoration from given recognition textual prompts. However, these methods sometimes fail to recognize some salient objects, resulting in inaccurate semantic restoration in these regions. Additionally, the same region may have a strong response to more than one prompt, which leads to semantic ambiguity for image super-resolution. To alleviate these two issues, in this paper we propose to incorporate semantic segmentation as an additional control condition into diffusion-based image super-resolution. Compared to textual prompt conditions, semantic segmentation enables a more comprehensive perception of salient objects within an image by assigning class labels to each pixel. It also mitigates the risk of semantic ambiguity by explicitly allocating objects to their respective spatial regions. In practice, inspired by the fact that image super-resolution and segmentation can benefit each other, we propose SegSR, which introduces a dual-diffusion framework to facilitate interaction between the image super-resolution and segmentation diffusion models. Specifically, we develop a Dual-Modality Bridge module to enable updated information flow between these two diffusion models, achieving mutual benefit during the reverse diffusion process. Extensive experiments show that SegSR can generate realistic images while preserving semantic structures more effectively.
Submitted 3 December, 2024; originally announced December 2024.
